Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(381)

Side by Side Diff: core/src/fpdftext/fpdf_text.cpp

Issue 1265503005: clang-format all pdfium code. (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master
Patch Set: sigh Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 PDFium Authors. All rights reserved. 1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 6
7 #include "../../../third_party/base/nonstd_unique_ptr.h" 7 #include "../../../third_party/base/nonstd_unique_ptr.h"
8 #include "../../include/fpdfapi/fpdf_page.h" 8 #include "../../include/fpdfapi/fpdf_page.h"
9 #include "../../include/fpdfapi/fpdf_pageobj.h" 9 #include "../../include/fpdfapi/fpdf_pageobj.h"
10 #include "../../include/fpdfapi/fpdf_resource.h" 10 #include "../../include/fpdfapi/fpdf_resource.h"
11 #include "../../include/fpdftext/fpdf_text.h" 11 #include "../../include/fpdftext/fpdf_text.h"
12 #include "../../include/fxcrt/fx_arb.h" 12 #include "../../include/fxcrt/fx_arb.h"
13 #include "../../include/fxcrt/fx_ucd.h" 13 #include "../../include/fxcrt/fx_ucd.h"
14 #include "text_int.h" 14 #include "text_int.h"
15 #include "txtproc.h" 15 #include "txtproc.h"
16 16
17 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, const FX_CHAR* d efchar) 17 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode,
18 { 18 int destcp,
19 if (destcp == 0) { 19 const FX_CHAR* defchar) {
20 if (unicode < 0x80) { 20 if (destcp == 0) {
21 return CFX_ByteString((char)unicode); 21 if (unicode < 0x80) {
22 } 22 return CFX_ByteString((char)unicode);
23 const FX_CHAR* altstr = FCS_GetAltStr(unicode);
24 if (altstr) {
25 return CFX_ByteString(altstr, -1);
26 }
27 return CFX_ByteString(defchar, -1);
28 }
29 char buf[10];
30 int iDef = 0;
31 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 1 0, NULL, &iDef);
32 if (ret && !iDef) {
33 return CFX_ByteString(buf, ret);
34 } 23 }
35 const FX_CHAR* altstr = FCS_GetAltStr(unicode); 24 const FX_CHAR* altstr = FCS_GetAltStr(unicode);
36 if (altstr) { 25 if (altstr) {
37 return CFX_ByteString(altstr, -1); 26 return CFX_ByteString(altstr, -1);
38 } 27 }
39 return CFX_ByteString(defchar, -1); 28 return CFX_ByteString(defchar, -1);
40 } 29 }
41 CTextPage::CTextPage() 30 char buf[10];
42 { 31 int iDef = 0;
43 } 32 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10,
44 CTextPage::~CTextPage() 33 NULL, &iDef);
45 { 34 if (ret && !iDef) {
35 return CFX_ByteString(buf, ret);
36 }
37 const FX_CHAR* altstr = FCS_GetAltStr(unicode);
38 if (altstr) {
39 return CFX_ByteString(altstr, -1);
40 }
41 return CFX_ByteString(defchar, -1);
42 }
43 CTextPage::CTextPage() {}
44 CTextPage::~CTextPage() {
45 int i;
46 for (i = 0; i < m_BaseLines.GetSize(); i++) {
47 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
48 delete pBaseLine;
49 }
50 for (i = 0; i < m_TextColumns.GetSize(); i++) {
51 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
52 delete pTextColumn;
53 }
54 }
55 void CTextPage::ProcessObject(CPDF_PageObject* pObject) {
56 if (pObject->m_Type != PDFPAGE_TEXT) {
57 return;
58 }
59 CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
60 CPDF_Font* pFont = pText->m_TextState.GetFont();
61 int count = pText->CountItems();
62 FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2);
63 pText->CalcCharPos(pPosArray);
64
65 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
66 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
67 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
68 FX_FLOAT spacew = 0;
69 if (space_charcode != -1) {
70 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
71 }
72 if (spacew == 0) {
73 spacew = fontsize_h / 4;
74 }
75 if (pText->m_TextState.GetBaselineAngle() != 0) {
76 int cc = 0;
77 CFX_AffineMatrix matrix;
78 pText->GetTextMatrix(&matrix);
79 for (int i = 0; i < pText->m_nChars; i++) {
80 FX_DWORD charcode = pText->m_nChars == 1
81 ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes
82 : pText->m_pCharCodes[i];
83 if (charcode == (FX_DWORD)-1) {
84 continue;
85 }
86 FX_RECT char_box;
87 pFont->GetCharBBox(charcode, char_box);
88 FX_FLOAT char_left =
89 pPosArray ? pPosArray[cc * 2]
90 : char_box.left * pText->m_TextState.GetFontSize() / 1000;
91 FX_FLOAT char_right =
92 pPosArray ? pPosArray[cc * 2 + 1]
93 : char_box.right * pText->m_TextState.GetFontSize() / 1000;
94 FX_FLOAT char_top =
95 char_box.top * pText->m_TextState.GetFontSize() / 1000;
96 FX_FLOAT char_bottom =
97 char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
98 cc++;
99 FX_FLOAT char_origx, char_origy;
100 matrix.Transform(char_left, 0, char_origx, char_origy);
101 matrix.TransformRect(char_left, char_right, char_top, char_bottom);
102 CFX_ByteString str;
103 pFont->AppendChar(str, charcode);
104 InsertTextBox(NULL, char_origy, char_left, char_right, char_top,
105 char_bottom, spacew, fontsize_v, str, pFont);
106 }
107 if (pPosArray) {
108 FX_Free(pPosArray);
109 }
110 return;
111 }
112 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
113 for (int ii = 0; ii < count * 2; ii++) {
114 pPosArray[ii] *= ratio_h;
115 }
116 FX_FLOAT baseline = pText->m_PosY;
117 CTextBaseLine* pBaseLine = NULL;
118 FX_FLOAT topy = pText->m_Top;
119 FX_FLOAT bottomy = pText->m_Bottom;
120 FX_FLOAT leftx = pText->m_Left;
121 int cc = 0;
122 CFX_ByteString segment;
123 int space_count = 0;
124 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
125 for (int i = 0; i < pText->m_nChars; i++) {
126 FX_DWORD charcode = pText->m_nChars == 1
127 ? (FX_DWORD)(uintptr_t)pText->m_pCharCodes
128 : pText->m_pCharCodes[i];
129 if (charcode == (FX_DWORD)-1) {
130 continue;
131 }
132 FX_FLOAT char_left = pPosArray[cc * 2];
133 FX_FLOAT char_right = pPosArray[cc * 2 + 1];
134 cc++;
135 if (char_left < last_left || (char_left - last_right) > spacew / 2) {
136 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
137 leftx + segment_right, topy, bottomy, spacew,
138 fontsize_v, segment, pFont);
139 segment_left = char_left;
140 segment = "";
141 }
142 if (space_count > 1) {
143 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
144 leftx + segment_right, topy, bottomy, spacew,
145 fontsize_v, segment, pFont);
146 segment = "";
147 } else if (space_count == 1) {
148 pFont->AppendChar(segment, ' ');
149 }
150 if (segment.GetLength() == 0) {
151 segment_left = char_left;
152 }
153 segment_right = char_right;
154 pFont->AppendChar(segment, charcode);
155 space_count = 0;
156 last_left = char_left;
157 last_right = char_right;
158 }
159 if (segment.GetLength())
160 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
161 leftx + segment_right, topy, bottomy, spacew,
162 fontsize_v, segment, pFont);
163 FX_Free(pPosArray);
164 }
165 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine,
166 FX_FLOAT basey,
167 FX_FLOAT leftx,
168 FX_FLOAT rightx,
169 FX_FLOAT topy,
170 FX_FLOAT bottomy,
171 FX_FLOAT spacew,
172 FX_FLOAT fontsize_v,
173 CFX_ByteString& str,
174 CPDF_Font* pFont) {
175 if (str.GetLength() == 0) {
176 return NULL;
177 }
178 if (pBaseLine == NULL) {
46 int i; 179 int i;
47 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 180 for (i = 0; i < m_BaseLines.GetSize(); i++) {
48 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 181 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
49 delete pBaseLine; 182 if (pExistLine->m_BaseLine == basey) {
50 } 183 pBaseLine = pExistLine;
51 for (i = 0; i < m_TextColumns.GetSize(); i ++) { 184 break;
52 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); 185 }
53 delete pTextColumn; 186 if (pExistLine->m_BaseLine < basey) {
54 } 187 break;
55 } 188 }
56 void CTextPage::ProcessObject(CPDF_PageObject* pObject) 189 }
57 { 190 if (pBaseLine == NULL) {
58 if (pObject->m_Type != PDFPAGE_TEXT) { 191 pBaseLine = new CTextBaseLine;
59 return; 192 pBaseLine->m_BaseLine = basey;
60 } 193 m_BaseLines.InsertAt(i, pBaseLine);
61 CPDF_TextObject* pText = (CPDF_TextObject*)pObject; 194 }
62 CPDF_Font* pFont = pText->m_TextState.GetFont(); 195 }
63 int count = pText->CountItems(); 196 CFX_WideString text;
64 FX_FLOAT* pPosArray = FX_Alloc2D(FX_FLOAT, count, 2); 197 const FX_CHAR* pStr = str;
65 pText->CalcCharPos(pPosArray); 198 int len = str.GetLength(), offset = 0;
66 199 while (offset < len) {
67 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); 200 FX_DWORD ch = pFont->GetNextChar(pStr, len, offset);
68 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); 201 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
69 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 202 if (unicode_str.IsEmpty()) {
70 FX_FLOAT spacew = 0; 203 text += (FX_WCHAR)ch;
71 if (space_charcode != -1) { 204 } else {
72 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; 205 text += unicode_str;
73 } 206 }
74 if (spacew == 0) { 207 }
75 spacew = fontsize_h / 4; 208 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v,
76 } 209 text);
77 if (pText->m_TextState.GetBaselineAngle() != 0) { 210 return pBaseLine;
78 int cc = 0; 211 }
79 CFX_AffineMatrix matrix; 212 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) {
80 pText->GetTextMatrix(&matrix); 213 FX_FLOAT lastheight = -1;
81 for (int i = 0; i < pText->m_nChars; i ++) { 214 FX_FLOAT lastbaseline = -1;
82 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(uintptr_t)pTex t->m_pCharCodes : pText->m_pCharCodes[i]; 215 FX_FLOAT MinLeftX = 1000000;
83 if (charcode == (FX_DWORD) - 1) { 216 FX_FLOAT MaxRightX = 0;
84 continue; 217 int i;
218 for (i = 0; i < m_BaseLines.GetSize(); i++) {
219 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
220 FX_FLOAT leftx, rightx;
221 if (pBaseLine->GetWidth(leftx, rightx)) {
222 if (leftx < MinLeftX) {
223 MinLeftX = leftx;
224 }
225 if (rightx > MaxRightX) {
226 MaxRightX = rightx;
227 }
228 }
229 }
230 for (i = 0; i < m_BaseLines.GetSize(); i++) {
231 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
232 pBaseLine->MergeBoxes();
233 }
234 for (i = 1; i < m_BaseLines.GetSize(); i++) {
235 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
236 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
237 if (pBaseLine->CanMerge(pPrevLine)) {
238 pPrevLine->Merge(pBaseLine);
239 delete pBaseLine;
240 m_BaseLines.RemoveAt(i);
241 i--;
242 }
243 }
244 if (m_bAutoWidth) {
245 int* widths = FX_Alloc(int, m_BaseLines.GetSize());
246 for (i = 0; i < m_BaseLines.GetSize(); i++) {
247 widths[i] = 0;
248 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
249 int TotalChars = 0;
250 FX_FLOAT TotalWidth = 0;
251 int minchars;
252 pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
253 if (TotalChars) {
254 FX_FLOAT charwidth = TotalWidth / TotalChars;
255 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
256 }
257 if (widths[i] > 1000) {
258 widths[i] = 1000;
259 }
260 if (widths[i] < minchars) {
261 widths[i] = minchars;
262 }
263 }
264 int AvgWidth = 0, widthcount = 0;
265 for (i = 0; i < m_BaseLines.GetSize(); i++)
266 if (widths[i]) {
267 AvgWidth += widths[i];
268 widthcount++;
269 }
270 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
271 int MaxWidth = 0;
272 for (i = 0; i < m_BaseLines.GetSize(); i++)
273 if (MaxWidth < widths[i]) {
274 MaxWidth = widths[i];
275 }
276 if (MaxWidth > AvgWidth * 6 / 5) {
277 MaxWidth = AvgWidth * 6 / 5;
278 }
279 FX_Free(widths);
280 if (iMinWidth < MaxWidth) {
281 iMinWidth = MaxWidth;
282 }
283 }
284 for (i = 0; i < m_BaseLines.GetSize(); i++) {
285 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
286 pBaseLine->MergeBoxes();
287 }
288 if (m_bKeepColumn) {
289 FindColumns();
290 }
291 for (i = 0; i < m_BaseLines.GetSize(); i++) {
292 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
293 if (lastheight >= 0) {
294 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
295 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
296 lines.Add(L"");
297 }
298 }
299 lastheight = pBaseLine->m_MaxFontSizeV;
300 lastbaseline = pBaseLine->m_BaseLine;
301 CFX_WideString str;
302 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
303 lines.Add(str);
304 }
305 }
306 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) {
307 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
308 FX_WCHAR* pDst = NULL;
309 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
310 if (nCount < 1) {
311 sDest += wChar;
312 return;
313 }
314 pDst = new FX_WCHAR[nCount];
315 FX_Unicode_GetNormalization(wChar, pDst);
316 for (int nIndex = 0; nIndex < nCount; nIndex++) {
317 sDest += pDst[nIndex];
318 }
319 delete[] pDst;
320 }
321 void NormalizeString(CFX_WideString& str) {
322 if (str.GetLength() <= 0) {
323 return;
324 }
325 CFX_WideString sBuffer;
326 nonstd::unique_ptr<IFX_BidiChar> pBidiChar(IFX_BidiChar::Create());
327 CFX_WordArray order;
328 FX_BOOL bR2L = FALSE;
329 int32_t start = 0, count = 0, i = 0;
330 int nR2L = 0, nL2R = 0;
331 for (i = 0; i < str.GetLength(); i++) {
332 if (pBidiChar->AppendChar(str.GetAt(i))) {
333 int32_t ret = pBidiChar->GetBidiInfo(start, count);
334 order.Add(start);
335 order.Add(count);
336 order.Add(ret);
337 if (!bR2L) {
338 if (ret == 2) {
339 nR2L++;
340 } else if (ret == 1) {
341 nL2R++;
342 }
343 }
344 }
345 }
346 if (pBidiChar->EndChar()) {
347 int32_t ret = pBidiChar->GetBidiInfo(start, count);
348 order.Add(start);
349 order.Add(count);
350 order.Add(ret);
351 if (!bR2L) {
352 if (ret == 2) {
353 nR2L++;
354 } else if (ret == 1) {
355 nL2R++;
356 }
357 }
358 }
359 if (nR2L > 0 && nR2L >= nL2R) {
360 bR2L = TRUE;
361 }
362 if (bR2L) {
363 int count = order.GetSize();
364 for (int j = count - 1; j > 0; j -= 3) {
365 int ret = order.GetAt(j);
366 int start = order.GetAt(j - 2);
367 int count1 = order.GetAt(j - 1);
368 if (ret == 2 || ret == 0) {
369 for (int i = start + count1 - 1; i >= start; i--) {
370 NormalizeCompositeChar(str[i], sBuffer);
371 }
372 } else {
373 i = j;
374 FX_BOOL bSymbol = FALSE;
375 while (i > 0 && order.GetAt(i) != 2) {
376 bSymbol = !order.GetAt(i);
377 i -= 3;
378 }
379 int end = start + count1;
380 int n = 0;
381 if (bSymbol) {
382 n = i + 6;
383 } else {
384 n = i + 3;
385 }
386 if (n >= j) {
387 for (int m = start; m < end; m++) {
388 sBuffer += str[m];
389 }
390 } else {
391 i = j;
392 j = n;
393 for (; n <= i; n += 3) {
394 int start = order.GetAt(n - 2);
395 int count1 = order.GetAt(n - 1);
396 int end = start + count1;
397 for (int m = start; m < end; m++) {
398 sBuffer += str[m];
85 } 399 }
86 FX_RECT char_box; 400 }
87 pFont->GetCharBBox(charcode, char_box); 401 }
88 FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000; 402 }
89 FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.r ight * pText->m_TextState.GetFontSize() / 1000; 403 }
90 FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000; 404 } else {
91 FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontS ize() / 1000; 405 int count = order.GetSize();
92 cc ++; 406 FX_BOOL bL2R = FALSE;
93 FX_FLOAT char_origx, char_origy; 407 for (int j = 0; j < count; j += 3) {
94 matrix.Transform(char_left, 0, char_origx, char_origy); 408 int ret = order.GetAt(j + 2);
95 matrix.TransformRect(char_left, char_right, char_top, char_bottom); 409 int start = order.GetAt(j);
96 CFX_ByteString str; 410 int count1 = order.GetAt(j + 1);
97 pFont->AppendChar(str, charcode); 411 if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
98 InsertTextBox(NULL, char_origy, char_left, char_right, char_top, 412 int i = j + 3;
99 char_bottom, spacew, fontsize_v, str, pFont); 413 while (bR2L && i < count) {
100 } 414 if (order.GetAt(i + 2) == 1) {
101 if (pPosArray) { 415 break;
102 FX_Free(pPosArray); 416 } else {
103 } 417 i += 3;
104 return; 418 }
105 } 419 }
106 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); 420 if (i == 3) {
107 for (int ii = 0; ii < count * 2; ii ++) { 421 j = -3;
108 pPosArray[ii] *= ratio_h; 422 bL2R = TRUE;
109 } 423 continue;
110 FX_FLOAT baseline = pText->m_PosY; 424 }
111 CTextBaseLine* pBaseLine = NULL; 425 int end = str.GetLength() - 1;
112 FX_FLOAT topy = pText->m_Top; 426 if (i < count) {
113 FX_FLOAT bottomy = pText->m_Bottom; 427 end = order.GetAt(i) - 1;
114 FX_FLOAT leftx = pText->m_Left; 428 }
115 int cc = 0; 429 j = i - 3;
116 CFX_ByteString segment; 430 for (int n = end; n >= start; n--) {
117 int space_count = 0; 431 NormalizeCompositeChar(str[i], sBuffer);
118 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; 432 }
119 for (int i = 0; i < pText->m_nChars; i ++) { 433 } else {
120 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(uintptr_t)pText->m _pCharCodes : pText->m_pCharCodes[i]; 434 int end = start + count1;
121 if (charcode == (FX_DWORD) - 1) { 435 for (int i = start; i < end; i++) {
122 continue; 436 sBuffer += str[i];
123 } 437 }
124 FX_FLOAT char_left = pPosArray[cc * 2]; 438 }
125 FX_FLOAT char_right = pPosArray[cc * 2 + 1]; 439 }
126 cc ++; 440 }
127 if (char_left < last_left || (char_left - last_right) > spacew / 2) { 441 str.Empty();
128 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 442 str += sBuffer;
129 topy, bottomy, spacew, fontsize_v, segment , pFont); 443 }
130 segment_left = char_left; 444 static FX_BOOL IsNumber(CFX_WideString& str) {
131 segment = ""; 445 for (int i = 0; i < str.GetLength(); i++) {
132 } 446 FX_WCHAR ch = str[i];
133 if (space_count > 1) { 447 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' &&
134 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 448 ch != ' ') {
135 topy, bottomy, spacew, fontsize_v, segment , pFont); 449 return FALSE;
136 segment = ""; 450 }
137 } else if (space_count == 1) { 451 }
138 pFont->AppendChar(segment, ' '); 452 return TRUE;
139 } 453 }
140 if (segment.GetLength() == 0) { 454 void CTextPage::FindColumns() {
141 segment_left = char_left; 455 int i;
142 } 456 for (i = 0; i < m_BaseLines.GetSize(); i++) {
143 segment_right = char_right; 457 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
144 pFont->AppendChar(segment, charcode); 458 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
145 space_count = 0; 459 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
146 last_left = char_left; 460 CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
147 last_right = char_right; 461 if (pColumn == NULL) {
148 } 462 pColumn = new CTextColumn;
149 if (segment.GetLength()) 463 pColumn->m_Count = 1;
150 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, lef tx + segment_right, 464 pColumn->m_AvgPos = pTextBox->m_Right;
151 topy, bottomy, spacew, fontsize_v, segment, pF ont); 465 pColumn->m_TextPos = -1;
152 FX_Free(pPosArray); 466 m_TextColumns.Add(pColumn);
153 } 467 } else {
154 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey , FX_FLOAT leftx, 468 pColumn->m_AvgPos =
155 FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v, 469 (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
156 CFX_ByteString& str, CPDF_Font* pFont) 470 (pColumn->m_Count + 1);
157 { 471 pColumn->m_Count++;
158 if (str.GetLength() == 0) { 472 }
159 return NULL; 473 }
160 } 474 }
161 if (pBaseLine == NULL) { 475 int mincount = m_BaseLines.GetSize() / 4;
162 int i; 476 for (i = 0; i < m_TextColumns.GetSize(); i++) {
163 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 477 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
164 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 478 if (pTextColumn->m_Count >= mincount) {
165 if (pExistLine->m_BaseLine == basey) { 479 continue;
166 pBaseLine = pExistLine; 480 }
167 break; 481 delete pTextColumn;
168 } 482 m_TextColumns.RemoveAt(i);
169 if (pExistLine->m_BaseLine < basey) { 483 i--;
170 break; 484 }
171 } 485 for (i = 0; i < m_BaseLines.GetSize(); i++) {
172 } 486 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
173 if (pBaseLine == NULL) { 487 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
174 pBaseLine = new CTextBaseLine; 488 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
175 pBaseLine->m_BaseLine = basey; 489 if (IsNumber(pTextBox->m_Text)) {
176 m_BaseLines.InsertAt(i, pBaseLine); 490 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
177 } 491 }
178 } 492 }
179 CFX_WideString text; 493 }
180 const FX_CHAR* pStr = str; 494 }
181 int len = str.GetLength(), offset = 0; 495 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) {
182 while (offset < len) { 496 for (int i = 0; i < m_TextColumns.GetSize(); i++) {
183 FX_DWORD ch = pFont->GetNextChar(pStr, len, offset); 497 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
184 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); 498 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
185 if (unicode_str.IsEmpty()) { 499 return pColumn;
186 text += (FX_WCHAR)ch; 500 }
187 } 501 }
188 else { 502 return NULL;
189 text += unicode_str; 503 }
190 } 504 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {}
191 } 505 CTextBaseLine::CTextBaseLine() {
192 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, t ext); 506 m_Top = -100000;
193 return pBaseLine; 507 m_Bottom = 100000;
194 } 508 m_MaxFontSizeV = 0;
195 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) 509 }
196 { 510 CTextBaseLine::~CTextBaseLine() {
197 FX_FLOAT lastheight = -1; 511 for (int i = 0; i < m_TextList.GetSize(); i++) {
198 FX_FLOAT lastbaseline = -1; 512 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
199 FX_FLOAT MinLeftX = 1000000; 513 delete pText;
200 FX_FLOAT MaxRightX = 0; 514 }
201 int i; 515 }
202 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 516 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx,
203 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 517 FX_FLOAT rightx,
204 FX_FLOAT leftx, rightx; 518 FX_FLOAT topy,
205 if (pBaseLine->GetWidth(leftx, rightx)) { 519 FX_FLOAT bottomy,
206 if (leftx < MinLeftX) { 520 FX_FLOAT spacew,
207 MinLeftX = leftx; 521 FX_FLOAT fontsize_v,
208 } 522 const CFX_WideString& text) {
209 if (rightx > MaxRightX) { 523 if (m_Top < topy) {
210 MaxRightX = rightx; 524 m_Top = topy;
211 } 525 }
212 } 526 if (m_Bottom > bottomy) {
213 } 527 m_Bottom = bottomy;
214 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 528 }
215 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 529 if (m_MaxFontSizeV < fontsize_v) {
216 pBaseLine->MergeBoxes(); 530 m_MaxFontSizeV = fontsize_v;
217 } 531 }
218 for (i = 1; i < m_BaseLines.GetSize(); i ++) { 532 int i;
219 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 533 for (i = 0; i < m_TextList.GetSize(); i++) {
220 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1); 534 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
221 if (pBaseLine->CanMerge(pPrevLine)) { 535 if (pText->m_Left > leftx) {
222 pPrevLine->Merge(pBaseLine); 536 break;
223 delete pBaseLine; 537 }
224 m_BaseLines.RemoveAt(i); 538 }
225 i --; 539 CTextBox* pText = new CTextBox;
226 } 540 pText->m_Text = text;
227 } 541 pText->m_Left = leftx;
228 if (m_bAutoWidth) { 542 pText->m_Right = rightx;
229 int* widths = FX_Alloc(int, m_BaseLines.GetSize()); 543 pText->m_Top = topy;
230 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 544 pText->m_Bottom = bottomy;
231 widths[i] = 0; 545 pText->m_SpaceWidth = spacew;
232 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 546 pText->m_FontSizeV = fontsize_v;
233 int TotalChars = 0; 547 pText->m_pColumn = NULL;
234 FX_FLOAT TotalWidth = 0; 548 m_TextList.InsertAt(i, pText);
235 int minchars; 549 }
236 pBaseLine->CountChars(TotalChars, TotalWidth, minchars); 550 FX_BOOL GetIntersection(FX_FLOAT low1,
237 if (TotalChars) { 551 FX_FLOAT high1,
238 FX_FLOAT charwidth = TotalWidth / TotalChars; 552 FX_FLOAT low2,
239 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); 553 FX_FLOAT high2,
240 } 554 FX_FLOAT& interlow,
241 if (widths[i] > 1000) { 555 FX_FLOAT& interhigh);
242 widths[i] = 1000; 556 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) {
243 } 557 FX_FLOAT inter_top, inter_bottom;
244 if (widths[i] < minchars) { 558 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
245 widths[i] = minchars; 559 inter_bottom, inter_top)) {
246 } 560 return FALSE;
247 } 561 }
248 int AvgWidth = 0, widthcount = 0; 562 FX_FLOAT inter_h = inter_top - inter_bottom;
249 for (i = 0; i < m_BaseLines.GetSize(); i ++) 563 if (inter_h < (m_Top - m_Bottom) / 2 &&
250 if (widths[i]) { 564 inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
251 AvgWidth += widths[i]; 565 return FALSE;
252 widthcount ++; 566 }
253 } 567 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
254 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); 568 for (int i = 0; i < m_TextList.GetSize(); i++) {
255 int MaxWidth = 0; 569 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
256 for (i = 0; i < m_BaseLines.GetSize(); i ++) 570 for (int j = 0; j < pOther->m_TextList.GetSize(); j++) {
257 if (MaxWidth < widths[i]) { 571 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
258 MaxWidth = widths[i]; 572 FX_FLOAT inter_left, inter_right;
259 } 573 if (!GetIntersection(pText->m_Left, pText->m_Right, pOtherText->m_Left,
260 if (MaxWidth > AvgWidth * 6 / 5) { 574 pOtherText->m_Right, inter_left, inter_right)) {
261 MaxWidth = AvgWidth * 6 / 5; 575 continue;
262 } 576 }
263 FX_Free(widths); 577 FX_FLOAT inter_w = inter_right - inter_left;
264 if (iMinWidth < MaxWidth) { 578 if (inter_w < pText->m_SpaceWidth / 2 &&
265 iMinWidth = MaxWidth; 579 inter_w < pOtherText->m_SpaceWidth / 2) {
266 } 580 continue;
267 } 581 }
268 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 582 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
269 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 583 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
270 pBaseLine->MergeBoxes(); 584 return FALSE;
271 } 585 }
272 if (m_bKeepColumn) { 586 }
273 FindColumns(); 587 }
274 } 588 return TRUE;
275 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 589 }
276 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 590 void CTextBaseLine::Merge(CTextBaseLine* pOther) {
277 if (lastheight >= 0) { 591 for (int i = 0; i < pOther->m_TextList.GetSize(); i++) {
278 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; 592 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
279 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1. 5) { 593 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bottom,
280 lines.Add(L""); 594 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
281 } 595 }
282 } 596 }
283 lastheight = pBaseLine->m_MaxFontSizeV; 597 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) {
284 lastbaseline = pBaseLine->m_BaseLine; 598 int i;
285 CFX_WideString str; 599 for (i = 0; i < m_TextList.GetSize(); i++) {
286 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); 600 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
287 lines.Add(str); 601 if (pText->m_Text != L" ") {
288 } 602 break;
289 } 603 }
290 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) 604 }
291 { 605 if (i == m_TextList.GetSize()) {
292 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); 606 return FALSE;
293 FX_WCHAR* pDst = NULL; 607 }
294 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 608 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
295 if (nCount < 1 ) { 609 leftx = pText->m_Left;
296 sDest += wChar; 610 for (i = m_TextList.GetSize() - 1; i >= 0; i--) {
297 return; 611 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
298 } 612 if (pText->m_Text != L" ") {
299 pDst = new FX_WCHAR[nCount]; 613 break;
300 FX_Unicode_GetNormalization(wChar, pDst); 614 }
301 for (int nIndex = 0; nIndex < nCount; nIndex++) { 615 }
302 sDest += pDst[nIndex]; 616 pText = (CTextBox*)m_TextList.GetAt(i);
303 } 617 rightx = pText->m_Right;
304 delete[] pDst; 618 return TRUE;
305 } 619 }
306 void NormalizeString(CFX_WideString& str) 620 void CTextBaseLine::MergeBoxes() {
307 { 621 int i = 0;
308 if (str.GetLength() <= 0) { 622 while (1) {
309 return; 623 if (i >= m_TextList.GetSize() - 1) {
310 } 624 break;
311 CFX_WideString sBuffer; 625 }
312 nonstd::unique_ptr<IFX_BidiChar> pBidiChar(IFX_BidiChar::Create()); 626 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
313 CFX_WordArray order; 627 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
314 FX_BOOL bR2L = FALSE; 628 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
315 int32_t start = 0, count = 0, i = 0; 629 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0)
316 int nR2L = 0, nL2R = 0; 630 ? pNextText->m_SpaceWidth
317 for (i = 0; i < str.GetLength(); i++) { 631 : pThisText->m_SpaceWidth;
318 if(pBidiChar->AppendChar(str.GetAt(i))) { 632 if (spacew > 0.0 && dx < spacew * 2) {
319 int32_t ret = pBidiChar->GetBidiInfo(start, count); 633 pThisText->m_Right = pNextText->m_Right;
320 order.Add(start); 634 if (dx > spacew * 1.5) {
321 order.Add(count); 635 pThisText->m_Text += L" ";
322 order.Add(ret); 636 } else if (dx > spacew / 3) {
323 if(!bR2L) { 637 pThisText->m_Text += L' ';
324 if(ret == 2) { 638 }
325 nR2L++; 639 pThisText->m_Text += pNextText->m_Text;
326 } else if (ret == 1) { 640 pThisText->m_SpaceWidth =
327 nL2R++; 641 pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth;
328 } 642 m_TextList.RemoveAt(i + 1);
329 } 643 delete pNextText;
330 }
331 }
332 if(pBidiChar->EndChar()) {
333 int32_t ret = pBidiChar->GetBidiInfo(start, count);
334 order.Add(start);
335 order.Add(count);
336 order.Add(ret);
337 if(!bR2L) {
338 if(ret == 2) {
339 nR2L++;
340 } else if(ret == 1) {
341 nL2R++;
342 }
343 }
344 }
345 if(nR2L > 0 && nR2L >= nL2R) {
346 bR2L = TRUE;
347 }
348 if(bR2L) {
349 int count = order.GetSize();
350 for(int j = count - 1; j > 0; j -= 3) {
351 int ret = order.GetAt(j);
352 int start = order.GetAt(j - 2);
353 int count1 = order.GetAt(j - 1);
354 if(ret == 2 || ret == 0) {
355 for(int i = start + count1 - 1; i >= start; i--) {
356 NormalizeCompositeChar(str[i], sBuffer);
357 }
358 } else {
359 i = j;
360 FX_BOOL bSymbol = FALSE;
361 while(i > 0 && order.GetAt(i) != 2) {
362 bSymbol = !order.GetAt(i);
363 i -= 3;
364 }
365 int end = start + count1 ;
366 int n = 0;
367 if(bSymbol) {
368 n = i + 6;
369 } else {
370 n = i + 3;
371 }
372 if(n >= j) {
373 for(int m = start; m < end; m++) {
374 sBuffer += str[m];
375 }
376 } else {
377 i = j;
378 j = n;
379 for(; n <= i; n += 3) {
380 int start = order.GetAt(n - 2);
381 int count1 = order.GetAt(n - 1);
382 int end = start + count1 ;
383 for(int m = start; m < end; m++) {
384 sBuffer += str[m];
385 }
386 }
387 }
388 }
389 }
390 } else { 644 } else {
391 int count = order.GetSize(); 645 i++;
392 FX_BOOL bL2R = FALSE; 646 }
393 for(int j = 0; j < count; j += 3) { 647 }
394 int ret = order.GetAt(j + 2); 648 }
395 int start = order.GetAt(j); 649 void CTextBaseLine::WriteOutput(CFX_WideString& str,
396 int count1 = order.GetAt(j + 1); 650 FX_FLOAT leftx,
397 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) { 651 FX_FLOAT pagewidth,
398 int i = j + 3; 652 int iTextWidth) {
399 while(bR2L && i < count) { 653 int lastpos = -1;
400 if(order.GetAt(i + 2) == 1) { 654 for (int i = 0; i < m_TextList.GetSize(); i++) {
401 break;
402 } else {
403 i += 3;
404 }
405 }
406 if(i == 3) {
407 j = -3;
408 bL2R = TRUE;
409 continue;
410 }
411 int end = str.GetLength() - 1;
412 if(i < count) {
413 end = order.GetAt(i) - 1;
414 }
415 j = i - 3;
416 for(int n = end; n >= start; n--) {
417 NormalizeCompositeChar(str[i], sBuffer);
418 }
419 } else {
420 int end = start + count1 ;
421 for(int i = start; i < end; i++) {
422 sBuffer += str[i];
423 }
424 }
425 }
426 }
427 str.Empty();
428 str += sBuffer;
429 }
430 static FX_BOOL IsNumber(CFX_WideString& str)
431 {
432 for (int i = 0; i < str.GetLength(); i ++) {
433 FX_WCHAR ch = str[i];
434 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
435 return FALSE;
436 }
437 }
438 return TRUE;
439 }
440 void CTextPage::FindColumns()
441 {
442 int i;
443 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
444 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
445 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
446 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
447 CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
448 if (pColumn == NULL) {
449 pColumn = new CTextColumn;
450 pColumn->m_Count = 1;
451 pColumn->m_AvgPos = pTextBox->m_Right;
452 pColumn->m_TextPos = -1;
453 m_TextColumns.Add(pColumn);
454 } else {
455 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTex tBox->m_Right) /
456 (pColumn->m_Count + 1);
457 pColumn->m_Count ++;
458 }
459 }
460 }
461 int mincount = m_BaseLines.GetSize() / 4;
462 for (i = 0; i < m_TextColumns.GetSize(); i ++) {
463 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
464 if (pTextColumn->m_Count >= mincount) {
465 continue;
466 }
467 delete pTextColumn;
468 m_TextColumns.RemoveAt(i);
469 i --;
470 }
471 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
472 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
473 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
474 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
475 if (IsNumber(pTextBox->m_Text)) {
476 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
477 }
478 }
479 }
480 }
481 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
482 {
483 for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
484 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
485 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
486 return pColumn;
487 }
488 }
489 return NULL;
490 }
491 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
492 {
493 }
494 CTextBaseLine::CTextBaseLine()
495 {
496 m_Top = -100000;
497 m_Bottom = 100000;
498 m_MaxFontSizeV = 0;
499 }
500 CTextBaseLine::~CTextBaseLine()
501 {
502 for (int i = 0; i < m_TextList.GetSize(); i ++) {
503 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
504 delete pText;
505 }
506 }
507 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy , FX_FLOAT bottomy,
508 FX_FLOAT spacew, FX_FLOAT fontsize_v, const CF X_WideString& text)
509 {
510 if (m_Top < topy) {
511 m_Top = topy;
512 }
513 if (m_Bottom > bottomy) {
514 m_Bottom = bottomy;
515 }
516 if (m_MaxFontSizeV < fontsize_v) {
517 m_MaxFontSizeV = fontsize_v;
518 }
519 int i;
520 for (i = 0; i < m_TextList.GetSize(); i ++) {
521 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
522 if (pText->m_Left > leftx) {
523 break;
524 }
525 }
526 CTextBox* pText = new CTextBox;
527 pText->m_Text = text;
528 pText->m_Left = leftx;
529 pText->m_Right = rightx;
530 pText->m_Top = topy;
531 pText->m_Bottom = bottomy;
532 pText->m_SpaceWidth = spacew;
533 pText->m_FontSizeV = fontsize_v;
534 pText->m_pColumn = NULL;
535 m_TextList.InsertAt(i, pText);
536 }
537 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT h igh2,
538 FX_FLOAT& interlow, FX_FLOAT& interhigh);
539 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
540 {
541 FX_FLOAT inter_top, inter_bottom;
542 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
543 inter_bottom, inter_top)) {
544 return FALSE;
545 }
546 FX_FLOAT inter_h = inter_top - inter_bottom;
547 if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m _Bottom) / 2) {
548 return FALSE;
549 }
550 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
551 for (int i = 0; i < m_TextList.GetSize(); i ++) {
552 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
553 for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
554 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
555 FX_FLOAT inter_left, inter_right;
556 if (!GetIntersection(pText->m_Left, pText->m_Right,
557 pOtherText->m_Left, pOtherText->m_Right, inter_ left, inter_right)) {
558 continue;
559 }
560 FX_FLOAT inter_w = inter_right - inter_left;
561 if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_Spa ceWidth / 2) {
562 continue;
563 }
564 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
565 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
566 return FALSE;
567 }
568 }
569 }
570 return TRUE;
571 }
572 void CTextBaseLine::Merge(CTextBaseLine* pOther)
573 {
574 for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
575 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
576 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bott om,
577 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
578 }
579 }
580 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
581 {
582 int i;
583 for (i = 0; i < m_TextList.GetSize(); i ++) {
584 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
585 if (pText->m_Text != L" ") {
586 break;
587 }
588 }
589 if (i == m_TextList.GetSize()) {
590 return FALSE;
591 }
592 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 655 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
593 leftx = pText->m_Left; 656 int xpos;
594 for (i = m_TextList.GetSize() - 1; i >= 0; i --) { 657 if (pText->m_pColumn) {
595 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 658 xpos =
596 if (pText->m_Text != L" ") { 659 (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth +
597 break; 660 0.5);
598 } 661 xpos -= pText->m_Text.GetLength();
599 } 662 } else {
600 pText = (CTextBox*)m_TextList.GetAt(i); 663 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
601 rightx = pText->m_Right; 664 }
602 return TRUE; 665 if (xpos <= lastpos) {
603 } 666 xpos = lastpos + 1;
604 void CTextBaseLine::MergeBoxes() 667 }
605 { 668 for (int j = lastpos + 1; j < xpos; j++) {
606 int i = 0; 669 str += ' ';
607 while (1) { 670 }
608 if (i >= m_TextList.GetSize() - 1) { 671 CFX_WideString sSrc(pText->m_Text);
609 break; 672 NormalizeString(sSrc);
610 } 673 str += sSrc;
611 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i); 674 str += ' ';
612 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1); 675 lastpos = xpos + pText->m_Text.GetLength();
613 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; 676 }
614 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ? 677 }
615 pNextText->m_SpaceWidth : pThisText->m_SpaceWidth; 678 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) {
616 if (spacew > 0.0 && dx < spacew * 2) { 679 minchars = 0;
617 pThisText->m_Right = pNextText->m_Right; 680 for (int i = 0; i < m_TextList.GetSize(); i++) {
618 if (dx > spacew * 1.5) { 681 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
619 pThisText->m_Text += L" "; 682 if (pText->m_Right - pText->m_Left < 0.002) {
620 } else if (dx > spacew / 3) { 683 continue;
621 pThisText->m_Text += L' '; 684 }
622 } 685 count += pText->m_Text.GetLength();
623 pThisText->m_Text += pNextText->m_Text; 686 width += pText->m_Right - pText->m_Left;
624 pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ? 687 minchars += pText->m_Text.GetLength() + 1;
625 spacew : pNextText->m_SpaceWidth; 688 }
626 m_TextList.RemoveAt(i + 1);
627 delete pNextText;
628 } else {
629 i ++;
630 }
631 }
632 }
633 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pa gewidth,
634 int iTextWidth)
635 {
636 int lastpos = -1;
637 for (int i = 0; i < m_TextList.GetSize(); i ++) {
638 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
639 int xpos;
640 if (pText->m_pColumn) {
641 xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pag ewidth + 0.5);
642 xpos -= pText->m_Text.GetLength();
643 } else {
644 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5) ;
645 }
646 if (xpos <= lastpos) {
647 xpos = lastpos + 1;
648 }
649 for (int j = lastpos + 1; j < xpos; j ++) {
650 str += ' ';
651 }
652 CFX_WideString sSrc(pText->m_Text);
653 NormalizeString(sSrc);
654 str += sSrc;
655 str += ' ';
656 lastpos = xpos + pText->m_Text.GetLength();
657 }
658 }
659 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
660 {
661 minchars = 0;
662 for (int i = 0; i < m_TextList.GetSize(); i ++) {
663 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
664 if (pText->m_Right - pText->m_Left < 0.002) {
665 continue;
666 }
667 count += pText->m_Text.GetLength();
668 width += pText->m_Right - pText->m_Left;
669 minchars += pText->m_Text.GetLength() + 1;
670 }
671 } 689 }
672 #define PI 3.1415926535897932384626433832795 690 #define PI 3.1415926535897932384626433832795
673 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) 691 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) {
674 { 692 int total_count = 0, rotated_count[3] = {0, 0, 0};
675 int total_count = 0, rotated_count[3] = {0, 0, 0}; 693 FX_POSITION pos = page.GetFirstObjectPosition();
676 FX_POSITION pos = page.GetFirstObjectPosition(); 694 while (pos) {
677 while (pos) { 695 CPDF_PageObject* pObj = page.GetNextObject(pos);
678 CPDF_PageObject* pObj = page.GetNextObject(pos); 696 if (pObj->m_Type != PDFPAGE_TEXT) {
679 if (pObj->m_Type != PDFPAGE_TEXT) { 697 continue;
680 continue; 698 }
681 } 699 total_count++;
682 total_count ++; 700 CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
683 CPDF_TextObject* pText = (CPDF_TextObject*)pObj; 701 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
684 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); 702 if (angle == 0.0) {
685 if (angle == 0.0) { 703 continue;
686 continue; 704 }
687 } 705 int degree = (int)(angle * 180 / PI + 0.5);
688 int degree = (int)(angle * 180 / PI + 0.5); 706 if (degree % 90) {
689 if (degree % 90) { 707 continue;
690 continue; 708 }
691 } 709 if (degree < 0) {
692 if (degree < 0) { 710 degree += 360;
693 degree += 360; 711 }
694 } 712 int index = degree / 90 % 3 - 1;
695 int index = degree / 90 % 3 - 1; 713 if (index < 0) {
696 if (index < 0) { 714 continue;
697 continue; 715 }
698 } 716 rotated_count[index]++;
699 rotated_count[index] ++; 717 }
700 } 718 if (total_count == 0) {
701 if (total_count == 0) { 719 return;
702 return; 720 }
703 } 721 CFX_AffineMatrix matrix;
704 CFX_AffineMatrix matrix; 722 if (rotated_count[0] > total_count * 2 / 3) {
705 if (rotated_count[0] > total_count * 2 / 3) { 723 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
706 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); 724 } else if (rotated_count[1] > total_count * 2 / 3) {
707 } else if (rotated_count[1] > total_count * 2 / 3) { 725 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
708 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); 726 } else if (rotated_count[2] > total_count * 2 / 3) {
709 } else if (rotated_count[2] > total_count * 2 / 3) { 727 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
710 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); 728 } else {
711 } else { 729 return;
712 return; 730 }
713 } 731 page.Transform(matrix);
714 page.Transform(matrix); 732 page_bbox.Transform(&matrix);
715 page_bbox.Transform(&matrix); 733 }
716 } 734 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines,
717 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CP DF_Dictionary* pPage, 735 CPDF_Document* pDoc,
718 int iMinWidth, FX_DWORD flags) 736 CPDF_Dictionary* pPage,
719 { 737 int iMinWidth,
720 lines.RemoveAll(); 738 FX_DWORD flags) {
721 if (pPage == NULL) { 739 lines.RemoveAll();
722 return; 740 if (pPage == NULL) {
723 } 741 return;
724 CPDF_Page page; 742 }
725 page.Load(pDoc, pPage); 743 CPDF_Page page;
726 CPDF_ParseOptions options; 744 page.Load(pDoc, pPage);
727 options.m_bTextOnly = TRUE; 745 CPDF_ParseOptions options;
728 options.m_bSeparateForm = FALSE; 746 options.m_bTextOnly = TRUE;
729 page.ParseContent(&options); 747 options.m_bSeparateForm = FALSE;
730 CFX_FloatRect page_bbox = page.GetPageBBox(); 748 page.ParseContent(&options);
731 if (flags & PDF2TXT_AUTO_ROTATE) { 749 CFX_FloatRect page_bbox = page.GetPageBBox();
732 CheckRotate(page, page_bbox); 750 if (flags & PDF2TXT_AUTO_ROTATE) {
733 } 751 CheckRotate(page, page_bbox);
734 CTextPage texts; 752 }
735 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; 753 CTextPage texts;
736 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; 754 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
737 texts.m_bBreakSpace = TRUE; 755 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
738 FX_POSITION pos = page.GetFirstObjectPosition(); 756 texts.m_bBreakSpace = TRUE;
739 while (pos) { 757 FX_POSITION pos = page.GetFirstObjectPosition();
740 CPDF_PageObject* pObject = page.GetNextObject(pos); 758 while (pos) {
741 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { 759 CPDF_PageObject* pObject = page.GetNextObject(pos);
742 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Ri ght, pObject->m_Top); 760 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
743 if (!page_bbox.Contains(rect)) { 761 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Right,
744 continue; 762 pObject->m_Top);
745 } 763 if (!page_bbox.Contains(rect)) {
746 } 764 continue;
747 texts.ProcessObject(pObject); 765 }
748 } 766 }
749 texts.WriteOutput(lines, iMinWidth); 767 texts.ProcessObject(pObject);
750 } 768 }
751 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dicti onary* pPage, 769 texts.WriteOutput(lines, iMinWidth);
752 int iMinWidth, FX_DWORD flags) 770 }
753 { 771 void PDF_GetPageText(CFX_ByteStringArray& lines,
754 lines.RemoveAll(); 772 CPDF_Document* pDoc,
755 CFX_WideStringArray wlines; 773 CPDF_Dictionary* pPage,
756 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); 774 int iMinWidth,
757 for (int i = 0; i < wlines.GetSize(); i ++) { 775 FX_DWORD flags) {
758 CFX_WideString wstr = wlines[i]; 776 lines.RemoveAll();
759 CFX_ByteString str; 777 CFX_WideStringArray wlines;
760 for (int c = 0; c < wstr.GetLength(); c ++) { 778 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
761 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); 779 for (int i = 0; i < wlines.GetSize(); i++) {
762 } 780 CFX_WideString wstr = wlines[i];
763 lines.Add(str); 781 CFX_ByteString str;
764 } 782 for (int c = 0; c < wstr.GetLength(); c++) {
765 } 783 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
766 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects * pPage, FX_BOOL bUseLF, 784 }
785 lines.Add(str);
786 }
787 }
788 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
789 CPDF_PageObjects* pPage,
790 FX_BOOL bUseLF,
767 CFX_PtrArray* pObjArray); 791 CFX_PtrArray* pObjArray);
768 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPD F_Dictionary* pPage, FX_DWORD flags) 792 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
769 { 793 CPDF_Document* pDoc,
770 buffer.EstimateSize(0, 10240); 794 CPDF_Dictionary* pPage,
771 CPDF_Page page; 795 FX_DWORD flags) {
772 page.Load(pDoc, pPage); 796 buffer.EstimateSize(0, 10240);
773 CPDF_ParseOptions options; 797 CPDF_Page page;
774 options.m_bTextOnly = TRUE; 798 page.Load(pDoc, pPage);
775 options.m_bSeparateForm = FALSE; 799 CPDF_ParseOptions options;
776 page.ParseContent(&options); 800 options.m_bTextOnly = TRUE;
777 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL); 801 options.m_bSeparateForm = FALSE;
778 } 802 page.ParseContent(&options);
803 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);
804 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698