Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(24)

Side by Side Diff: core/src/fpdftext/fpdf_text.cpp

Issue 453133004: clang-format all code (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 PDFium Authors. All rights reserved. 1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 6
7 #include "../../include/fpdfapi/fpdf_page.h" 7 #include "../../include/fpdfapi/fpdf_page.h"
8 #include "../../include/fpdfapi/fpdf_pageobj.h" 8 #include "../../include/fpdfapi/fpdf_pageobj.h"
9 #include "../../include/fpdftext/fpdf_text.h" 9 #include "../../include/fpdftext/fpdf_text.h"
10 #include "txtproc.h" 10 #include "txtproc.h"
11 #include "text_int.h" 11 #include "text_int.h"
12 #if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_) 12 #if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_)
13 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR); 13 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR);
14 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defchar) 14 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode,
15 { 15 int destcp,
16 if (destcp == 0) { 16 FX_LPCSTR defchar) {
17 if (unicode < 0x80) { 17 if (destcp == 0) {
18 return CFX_ByteString((char)unicode); 18 if (unicode < 0x80) {
19 } 19 return CFX_ByteString((char)unicode);
20 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
21 if (altstr) {
22 return CFX_ByteString(altstr, -1);
23 }
24 return CFX_ByteString(defchar, -1);
25 }
26 FX_BOOL bDef = FALSE;
27 char buf[10];
28 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
29 if (ret && !bDef) {
30 return CFX_ByteString(buf, ret);
31 } 20 }
32 FX_LPCSTR altstr = FCS_GetAltStr(unicode); 21 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
33 if (altstr) { 22 if (altstr) {
34 return CFX_ByteString(altstr, -1); 23 return CFX_ByteString(altstr, -1);
35 } 24 }
36 return CFX_ByteString(defchar, -1); 25 return CFX_ByteString(defchar, -1);
37 } 26 }
38 CTextPage::CTextPage() 27 FX_BOOL bDef = FALSE;
39 { 28 char buf[10];
40 } 29 int ret = FXSYS_WideCharToMultiByte(
41 CTextPage::~CTextPage() 30 destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef);
42 { 31 if (ret && !bDef) {
32 return CFX_ByteString(buf, ret);
33 }
34 FX_LPCSTR altstr = FCS_GetAltStr(unicode);
35 if (altstr) {
36 return CFX_ByteString(altstr, -1);
37 }
38 return CFX_ByteString(defchar, -1);
39 }
40 CTextPage::CTextPage() {
41 }
42 CTextPage::~CTextPage() {
43 int i;
44 for (i = 0; i < m_BaseLines.GetSize(); i++) {
45 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
46 delete pBaseLine;
47 }
48 for (i = 0; i < m_TextColumns.GetSize(); i++) {
49 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
50 delete pTextColumn;
51 }
52 }
53 void CTextPage::ProcessObject(CPDF_PageObject* pObject) {
54 if (pObject->m_Type != PDFPAGE_TEXT) {
55 return;
56 }
57 CPDF_TextObject* pText = (CPDF_TextObject*)pObject;
58 CPDF_Font* pFont = pText->m_TextState.GetFont();
59 int count = pText->CountItems();
60 FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2);
61 if (pPosArray) {
62 pText->CalcCharPos(pPosArray);
63 }
64 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH();
65 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV();
66 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
67 FX_FLOAT spacew = 0;
68 if (space_charcode != -1) {
69 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
70 }
71 if (spacew == 0) {
72 spacew = fontsize_h / 4;
73 }
74 if (pText->m_TextState.GetBaselineAngle() != 0) {
75 int cc = 0;
76 CFX_AffineMatrix matrix;
77 pText->GetTextMatrix(&matrix);
78 for (int i = 0; i < pText->m_nChars; i++) {
79 FX_DWORD charcode = pText->m_nChars == 1
80 ? (FX_DWORD)(FX_UINTPTR) pText->m_pCharCodes
81 : pText->m_pCharCodes[i];
82 if (charcode == (FX_DWORD)-1) {
83 continue;
84 }
85 FX_RECT char_box;
86 pFont->GetCharBBox(charcode, char_box);
87 FX_FLOAT char_left =
88 pPosArray ? pPosArray[cc * 2]
89 : char_box.left * pText->m_TextState.GetFontSize() / 1000;
90 FX_FLOAT char_right =
91 pPosArray ? pPosArray[cc * 2 + 1]
92 : char_box.right * pText->m_TextState.GetFontSize() / 1000;
93 FX_FLOAT char_top =
94 char_box.top * pText->m_TextState.GetFontSize() / 1000;
95 FX_FLOAT char_bottom =
96 char_box.bottom * pText->m_TextState.GetFontSize() / 1000;
97 cc++;
98 FX_FLOAT char_origx, char_origy;
99 matrix.Transform(char_left, 0, char_origx, char_origy);
100 matrix.TransformRect(char_left, char_right, char_top, char_bottom);
101 CFX_ByteString str;
102 pFont->AppendChar(str, charcode);
103 InsertTextBox(NULL,
104 char_origy,
105 char_left,
106 char_right,
107 char_top,
108 char_bottom,
109 spacew,
110 fontsize_v,
111 str,
112 pFont);
113 }
114 if (pPosArray) {
115 FX_Free(pPosArray);
116 }
117 return;
118 }
119 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize();
120 for (int ii = 0; ii < count * 2; ii++) {
121 pPosArray[ii] *= ratio_h;
122 }
123 FX_FLOAT baseline = pText->m_PosY;
124 CTextBaseLine* pBaseLine = NULL;
125 FX_FLOAT topy = pText->m_Top;
126 FX_FLOAT bottomy = pText->m_Bottom;
127 FX_FLOAT leftx = pText->m_Left;
128 int cc = 0;
129 CFX_ByteString segment;
130 int space_count = 0;
131 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0;
132 for (int i = 0; i < pText->m_nChars; i++) {
133 FX_DWORD charcode = pText->m_nChars == 1
134 ? (FX_DWORD)(FX_UINTPTR) pText->m_pCharCodes
135 : pText->m_pCharCodes[i];
136 if (charcode == (FX_DWORD)-1) {
137 continue;
138 }
139 FX_FLOAT char_left = pPosArray[cc * 2];
140 FX_FLOAT char_right = pPosArray[cc * 2 + 1];
141 cc++;
142 if (char_left < last_left || (char_left - last_right) > spacew / 2) {
143 pBaseLine = InsertTextBox(pBaseLine,
144 baseline,
145 leftx + segment_left,
146 leftx + segment_right,
147 topy,
148 bottomy,
149 spacew,
150 fontsize_v,
151 segment,
152 pFont);
153 segment_left = char_left;
154 segment = "";
155 }
156 if (space_count > 1) {
157 pBaseLine = InsertTextBox(pBaseLine,
158 baseline,
159 leftx + segment_left,
160 leftx + segment_right,
161 topy,
162 bottomy,
163 spacew,
164 fontsize_v,
165 segment,
166 pFont);
167 segment = "";
168 } else if (space_count == 1) {
169 pFont->AppendChar(segment, ' ');
170 }
171 if (segment.GetLength() == 0) {
172 segment_left = char_left;
173 }
174 segment_right = char_right;
175 pFont->AppendChar(segment, charcode);
176 space_count = 0;
177 last_left = char_left;
178 last_right = char_right;
179 }
180 if (segment.GetLength())
181 pBaseLine = InsertTextBox(pBaseLine,
182 baseline,
183 leftx + segment_left,
184 leftx + segment_right,
185 topy,
186 bottomy,
187 spacew,
188 fontsize_v,
189 segment,
190 pFont);
191 FX_Free(pPosArray);
192 }
193 static void ConvertPDFString(CFX_ByteString& result,
194 CFX_ByteString& src,
195 CPDF_Font* pFont);
196 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine,
197 FX_FLOAT basey,
198 FX_FLOAT leftx,
199 FX_FLOAT rightx,
200 FX_FLOAT topy,
201 FX_FLOAT bottomy,
202 FX_FLOAT spacew,
203 FX_FLOAT fontsize_v,
204 CFX_ByteString& str,
205 CPDF_Font* pFont) {
206 if (str.GetLength() == 0) {
207 return NULL;
208 }
209 if (pBaseLine == NULL) {
43 int i; 210 int i;
44 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 211 for (i = 0; i < m_BaseLines.GetSize(); i++) {
212 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
213 if (pExistLine->m_BaseLine == basey) {
214 pBaseLine = pExistLine;
215 break;
216 }
217 if (pExistLine->m_BaseLine < basey) {
218 break;
219 }
220 }
221 if (pBaseLine == NULL) {
222 pBaseLine = FX_NEW CTextBaseLine;
223 if (NULL == pBaseLine) {
224 return NULL;
225 }
226 pBaseLine->m_BaseLine = basey;
227 m_BaseLines.InsertAt(i, pBaseLine);
228 }
229 }
230 CFX_WideString text;
231 FX_LPCSTR pStr = str;
232 int len = str.GetLength(), offset = 0;
233 while (offset < len) {
234 FX_DWORD ch = pFont->GetNextChar(pStr, offset);
235 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch);
236 text += unicode_str;
237 }
238 pBaseLine->InsertTextBox(
239 leftx, rightx, topy, bottomy, spacew, fontsize_v, text);
240 return pBaseLine;
241 }
242 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) {
243 FX_FLOAT lastheight = -1;
244 FX_FLOAT lastbaseline = -1;
245 FX_FLOAT MinLeftX = 1000000;
246 FX_FLOAT MaxRightX = 0;
247 int i;
248 for (i = 0; i < m_BaseLines.GetSize(); i++) {
249 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
250 FX_FLOAT leftx, rightx;
251 if (pBaseLine->GetWidth(leftx, rightx)) {
252 if (leftx < MinLeftX) {
253 MinLeftX = leftx;
254 }
255 if (rightx > MaxRightX) {
256 MaxRightX = rightx;
257 }
258 }
259 }
260 for (i = 0; i < m_BaseLines.GetSize(); i++) {
261 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
262 pBaseLine->MergeBoxes();
263 }
264 for (i = 1; i < m_BaseLines.GetSize(); i++) {
265 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
266 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1);
267 if (pBaseLine->CanMerge(pPrevLine)) {
268 pPrevLine->Merge(pBaseLine);
269 delete pBaseLine;
270 m_BaseLines.RemoveAt(i);
271 i--;
272 }
273 }
274 if (m_bAutoWidth) {
275 int* widths = FX_Alloc(int, m_BaseLines.GetSize());
276 if (widths) {
277 for (i = 0; i < m_BaseLines.GetSize(); i++) {
278 widths[i] = 0;
45 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 279 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
46 delete pBaseLine; 280 int TotalChars = 0;
47 } 281 FX_FLOAT TotalWidth = 0;
48 for (i = 0; i < m_TextColumns.GetSize(); i ++) { 282 int minchars;
49 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); 283 pBaseLine->CountChars(TotalChars, TotalWidth, minchars);
50 delete pTextColumn; 284 if (TotalChars) {
51 } 285 FX_FLOAT charwidth = TotalWidth / TotalChars;
52 } 286 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth);
53 void CTextPage::ProcessObject(CPDF_PageObject* pObject) 287 }
54 { 288 if (widths[i] > 1000) {
55 if (pObject->m_Type != PDFPAGE_TEXT) { 289 widths[i] = 1000;
56 return; 290 }
57 } 291 if (widths[i] < minchars) {
58 CPDF_TextObject* pText = (CPDF_TextObject*)pObject; 292 widths[i] = minchars;
59 CPDF_Font* pFont = pText->m_TextState.GetFont(); 293 }
60 int count = pText->CountItems(); 294 }
61 FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2); 295 int AvgWidth = 0, widthcount = 0;
62 if (pPosArray) { 296 for (i = 0; i < m_BaseLines.GetSize(); i++)
63 pText->CalcCharPos(pPosArray); 297 if (widths[i]) {
64 } 298 AvgWidth += widths[i];
65 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); 299 widthcount++;
66 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); 300 }
67 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 301 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5);
68 FX_FLOAT spacew = 0; 302 int MaxWidth = 0;
69 if (space_charcode != -1) { 303 for (i = 0; i < m_BaseLines.GetSize(); i++)
70 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; 304 if (MaxWidth < widths[i]) {
71 } 305 MaxWidth = widths[i];
72 if (spacew == 0) { 306 }
73 spacew = fontsize_h / 4; 307 if (MaxWidth > AvgWidth * 6 / 5) {
74 } 308 MaxWidth = AvgWidth * 6 / 5;
75 if (pText->m_TextState.GetBaselineAngle() != 0) { 309 }
76 int cc = 0; 310 FX_Free(widths);
77 CFX_AffineMatrix matrix; 311 if (iMinWidth < MaxWidth) {
78 pText->GetTextMatrix(&matrix); 312 iMinWidth = MaxWidth;
79 for (int i = 0; i < pText->m_nChars; i ++) { 313 }
80 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i]; 314 }
81 if (charcode == (FX_DWORD) - 1) { 315 }
82 continue; 316 for (i = 0; i < m_BaseLines.GetSize(); i++) {
317 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
318 pBaseLine->MergeBoxes();
319 }
320 if (m_bKeepColumn) {
321 FindColumns();
322 }
323 for (i = 0; i < m_BaseLines.GetSize(); i++) {
324 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
325 if (lastheight >= 0) {
326 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine;
327 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) {
328 lines.Add(L"");
329 }
330 }
331 lastheight = pBaseLine->m_MaxFontSizeV;
332 lastbaseline = pBaseLine->m_BaseLine;
333 CFX_WideString str;
334 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth);
335 lines.Add(str);
336 }
337 }
338 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) {
339 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE);
340 FX_LPWSTR pDst = NULL;
341 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
342 if (nCount < 1) {
343 sDest += wChar;
344 return;
345 }
346 pDst = new FX_WCHAR[nCount];
347 FX_Unicode_GetNormalization(wChar, pDst);
348 for (int nIndex = 0; nIndex < nCount; nIndex++) {
349 sDest += pDst[nIndex];
350 }
351 delete[] pDst;
352 }
353 void NormalizeString(CFX_WideString& str) {
354 if (str.GetLength() <= 0) {
355 return;
356 }
357 CFX_WideString sBuffer;
358 IFX_BidiChar* BidiChar = IFX_BidiChar::Create();
359 if (NULL == BidiChar) {
360 return;
361 }
362 CFX_WordArray order;
363 FX_BOOL bR2L = FALSE;
364 FX_INT32 start = 0, count = 0, i = 0;
365 int nR2L = 0, nL2R = 0;
366 for (i = 0; i < str.GetLength(); i++) {
367 if (BidiChar->AppendChar(str.GetAt(i))) {
368 FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
369 order.Add(start);
370 order.Add(count);
371 order.Add(ret);
372 if (!bR2L) {
373 if (ret == 2) {
374 nR2L++;
375 } else if (ret == 1) {
376 nL2R++;
377 }
378 }
379 }
380 }
381 if (BidiChar->EndChar()) {
382 FX_INT32 ret = BidiChar->GetBidiInfo(start, count);
383 order.Add(start);
384 order.Add(count);
385 order.Add(ret);
386 if (!bR2L) {
387 if (ret == 2) {
388 nR2L++;
389 } else if (ret == 1) {
390 nL2R++;
391 }
392 }
393 }
394 if (nR2L > 0 && nR2L >= nL2R) {
395 bR2L = TRUE;
396 }
397 if (bR2L) {
398 int count = order.GetSize();
399 for (int j = count - 1; j > 0; j -= 3) {
400 int ret = order.GetAt(j);
401 int start = order.GetAt(j - 2);
402 int count1 = order.GetAt(j - 1);
403 if (ret == 2 || ret == 0) {
404 for (int i = start + count1 - 1; i >= start; i--) {
405 NormalizeCompositeChar(str[i], sBuffer);
406 }
407 } else {
408 i = j;
409 FX_BOOL bSymbol = FALSE;
410 while (i > 0 && order.GetAt(i) != 2) {
411 bSymbol = !order.GetAt(i);
412 i -= 3;
413 }
414 int end = start + count1;
415 int n = 0;
416 if (bSymbol) {
417 n = i + 6;
418 } else {
419 n = i + 3;
420 }
421 if (n >= j) {
422 for (int m = start; m < end; m++) {
423 sBuffer += str[m];
424 }
425 } else {
426 i = j;
427 j = n;
428 for (; n <= i; n += 3) {
429 int start = order.GetAt(n - 2);
430 int count1 = order.GetAt(n - 1);
431 int end = start + count1;
432 for (int m = start; m < end; m++) {
433 sBuffer += str[m];
83 } 434 }
84 FX_RECT char_box; 435 }
85 pFont->GetCharBBox(charcode, char_box); 436 }
86 FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left * pText->m_TextState.GetFontSize() / 1000; 437 }
87 FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.right * pText->m_TextState.GetFontSize() / 1000; 438 }
88 FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize() / 1000; 439 } else {
89 FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontSize() / 1000; 440 int count = order.GetSize();
90 cc ++; 441 FX_BOOL bL2R = FALSE;
91 FX_FLOAT char_origx, char_origy; 442 for (int j = 0; j < count; j += 3) {
92 matrix.Transform(char_left, 0, char_origx, char_origy); 443 int ret = order.GetAt(j + 2);
93 matrix.TransformRect(char_left, char_right, char_top, char_bottom); 444 int start = order.GetAt(j);
94 CFX_ByteString str; 445 int count1 = order.GetAt(j + 1);
95 pFont->AppendChar(str, charcode); 446 if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) {
96 InsertTextBox(NULL, char_origy, char_left, char_right, char_top, 447 int i = j + 3;
97 char_bottom, spacew, fontsize_v, str, pFont); 448 while (bR2L && i < count) {
98 } 449 if (order.GetAt(i + 2) == 1) {
99 if (pPosArray) { 450 break;
100 FX_Free(pPosArray); 451 } else {
101 } 452 i += 3;
102 return; 453 }
103 } 454 }
104 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); 455 if (i == 3) {
105 for (int ii = 0; ii < count * 2; ii ++) { 456 j = -3;
106 pPosArray[ii] *= ratio_h; 457 bL2R = TRUE;
107 } 458 continue;
108 FX_FLOAT baseline = pText->m_PosY; 459 }
109 CTextBaseLine* pBaseLine = NULL; 460 int end = str.GetLength() - 1;
110 FX_FLOAT topy = pText->m_Top; 461 if (i < count) {
111 FX_FLOAT bottomy = pText->m_Bottom; 462 end = order.GetAt(i) - 1;
112 FX_FLOAT leftx = pText->m_Left; 463 }
113 int cc = 0; 464 j = i - 3;
114 CFX_ByteString segment; 465 for (int n = end; n >= start; n--) {
115 int space_count = 0; 466 NormalizeCompositeChar(str[i], sBuffer);
116 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; 467 }
117 for (int i = 0; i < pText->m_nChars; i ++) { 468 } else {
118 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->m_pCharCodes : pText->m_pCharCodes[i]; 469 int end = start + count1;
119 if (charcode == (FX_DWORD) - 1) { 470 for (int i = start; i < end; i++) {
120 continue; 471 sBuffer += str[i];
121 } 472 }
122 FX_FLOAT char_left = pPosArray[cc * 2]; 473 }
123 FX_FLOAT char_right = pPosArray[cc * 2 + 1]; 474 }
124 cc ++; 475 }
125 if (char_left < last_left || (char_left - last_right) > spacew / 2) { 476 str.Empty();
126 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 477 str += sBuffer;
127 topy, bottomy, spacew, fontsize_v, segment, pFont); 478 BidiChar->Release();
128 segment_left = char_left; 479 }
129 segment = ""; 480 static FX_BOOL IsNumber(CFX_WideString& str) {
130 } 481 for (int i = 0; i < str.GetLength(); i++) {
131 if (space_count > 1) { 482 FX_WCHAR ch = str[i];
132 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 483 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' &&
133 topy, bottomy, spacew, fontsize_v, segment , pFont); 484 ch != ' ') {
134 segment = ""; 485 return FALSE;
135 } else if (space_count == 1) { 486 }
136 pFont->AppendChar(segment, ' '); 487 }
137 } 488 return TRUE;
138 if (segment.GetLength() == 0) { 489 }
139 segment_left = char_left; 490 void CTextPage::FindColumns() {
140 } 491 int i;
141 segment_right = char_right; 492 for (i = 0; i < m_BaseLines.GetSize(); i++) {
142 pFont->AppendChar(segment, charcode); 493 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
143 space_count = 0; 494 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
144 last_left = char_left; 495 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
145 last_right = char_right; 496 CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
146 } 497 if (pColumn == NULL) {
147 if (segment.GetLength()) 498 pColumn = FX_NEW CTextColumn;
148 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, leftx + segment_right, 499 if (pColumn) {
149 topy, bottomy, spacew, fontsize_v, segment, pFont); 500 pColumn->m_Count = 1;
150 FX_Free(pPosArray); 501 pColumn->m_AvgPos = pTextBox->m_Right;
151 } 502 pColumn->m_TextPos = -1;
152 static void ConvertPDFString(CFX_ByteString& result, CFX_ByteString& src, CPDF_Font* pFont); 503 m_TextColumns.Add(pColumn);
153 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey, FX_FLOAT leftx, 504 }
154 FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v, 505 } else {
155 CFX_ByteString& str, CPDF_Font* pFont) 506 pColumn->m_AvgPos =
156 { 507 (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) /
157 if (str.GetLength() == 0) { 508 (pColumn->m_Count + 1);
158 return NULL; 509 pColumn->m_Count++;
159 } 510 }
160 if (pBaseLine == NULL) { 511 }
161 int i; 512 }
162 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 513 int mincount = m_BaseLines.GetSize() / 4;
163 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 514 for (i = 0; i < m_TextColumns.GetSize(); i++) {
164 if (pExistLine->m_BaseLine == basey) { 515 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
165 pBaseLine = pExistLine; 516 if (pTextColumn->m_Count >= mincount) {
166 break; 517 continue;
167 } 518 }
168 if (pExistLine->m_BaseLine < basey) { 519 delete pTextColumn;
169 break; 520 m_TextColumns.RemoveAt(i);
170 } 521 i--;
171 } 522 }
172 if (pBaseLine == NULL) { 523 for (i = 0; i < m_BaseLines.GetSize(); i++) {
173 pBaseLine = FX_NEW CTextBaseLine; 524 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
174 if (NULL == pBaseLine) { 525 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) {
175 return NULL; 526 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
176 } 527 if (IsNumber(pTextBox->m_Text)) {
177 pBaseLine->m_BaseLine = basey; 528 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
178 m_BaseLines.InsertAt(i, pBaseLine); 529 }
179 } 530 }
180 } 531 }
181 CFX_WideString text; 532 }
182 FX_LPCSTR pStr = str; 533 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) {
183 int len = str.GetLength(), offset = 0; 534 for (int i = 0; i < m_TextColumns.GetSize(); i++) {
184 while (offset < len) { 535 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
185 FX_DWORD ch = pFont->GetNextChar(pStr, offset); 536 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
186 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); 537 return pColumn;
187 text += unicode_str; 538 }
188 } 539 }
189 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, t ext); 540 return NULL;
190 return pBaseLine; 541 }
191 } 542 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) {
192 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) 543 }
193 { 544 CTextBaseLine::CTextBaseLine() {
194 FX_FLOAT lastheight = -1; 545 m_Top = -100000;
195 FX_FLOAT lastbaseline = -1; 546 m_Bottom = 100000;
196 FX_FLOAT MinLeftX = 1000000; 547 m_MaxFontSizeV = 0;
197 FX_FLOAT MaxRightX = 0; 548 }
198 int i; 549 CTextBaseLine::~CTextBaseLine() {
199 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 550 for (int i = 0; i < m_TextList.GetSize(); i++) {
200 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 551 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
201 FX_FLOAT leftx, rightx; 552 delete pText;
202 if (pBaseLine->GetWidth(leftx, rightx)) { 553 }
203 if (leftx < MinLeftX) { 554 }
204 MinLeftX = leftx; 555 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx,
205 } 556 FX_FLOAT rightx,
206 if (rightx > MaxRightX) { 557 FX_FLOAT topy,
207 MaxRightX = rightx; 558 FX_FLOAT bottomy,
208 } 559 FX_FLOAT spacew,
209 } 560 FX_FLOAT fontsize_v,
210 } 561 const CFX_WideString& text) {
211 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 562 if (m_Top < topy) {
212 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 563 m_Top = topy;
213 pBaseLine->MergeBoxes(); 564 }
214 } 565 if (m_Bottom > bottomy) {
215 for (i = 1; i < m_BaseLines.GetSize(); i ++) { 566 m_Bottom = bottomy;
216 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 567 }
217 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1); 568 if (m_MaxFontSizeV < fontsize_v) {
218 if (pBaseLine->CanMerge(pPrevLine)) { 569 m_MaxFontSizeV = fontsize_v;
219 pPrevLine->Merge(pBaseLine); 570 }
220 delete pBaseLine; 571 int i;
221 m_BaseLines.RemoveAt(i); 572 for (i = 0; i < m_TextList.GetSize(); i++) {
222 i --; 573 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
223 } 574 if (pText->m_Left > leftx) {
224 } 575 break;
225 if (m_bAutoWidth) { 576 }
226 int* widths = FX_Alloc(int, m_BaseLines.GetSize()); 577 }
227 if (widths) { 578 CTextBox* pText = FX_NEW CTextBox;
228 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 579 if (NULL == pText) {
229 widths[i] = 0; 580 return;
230 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 581 }
231 int TotalChars = 0; 582 pText->m_Text = text;
232 FX_FLOAT TotalWidth = 0; 583 pText->m_Left = leftx;
233 int minchars; 584 pText->m_Right = rightx;
234 pBaseLine->CountChars(TotalChars, TotalWidth, minchars); 585 pText->m_Top = topy;
235 if (TotalChars) { 586 pText->m_Bottom = bottomy;
236 FX_FLOAT charwidth = TotalWidth / TotalChars; 587 pText->m_SpaceWidth = spacew;
237 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); 588 pText->m_FontSizeV = fontsize_v;
238 } 589 pText->m_pColumn = NULL;
239 if (widths[i] > 1000) { 590 m_TextList.InsertAt(i, pText);
240 widths[i] = 1000; 591 }
241 } 592 FX_BOOL GetIntersection(FX_FLOAT low1,
242 if (widths[i] < minchars) { 593 FX_FLOAT high1,
243 widths[i] = minchars; 594 FX_FLOAT low2,
244 } 595 FX_FLOAT high2,
245 } 596 FX_FLOAT& interlow,
246 int AvgWidth = 0, widthcount = 0; 597 FX_FLOAT& interhigh);
247 for (i = 0; i < m_BaseLines.GetSize(); i ++) 598 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) {
248 if (widths[i]) { 599 FX_FLOAT inter_top, inter_bottom;
249 AvgWidth += widths[i]; 600 if (!GetIntersection(m_Bottom,
250 widthcount ++; 601 m_Top,
251 } 602 pOther->m_Bottom,
252 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); 603 pOther->m_Top,
253 int MaxWidth = 0; 604 inter_bottom,
254 for (i = 0; i < m_BaseLines.GetSize(); i ++) 605 inter_top)) {
255 if (MaxWidth < widths[i]) { 606 return FALSE;
256 MaxWidth = widths[i]; 607 }
257 } 608 FX_FLOAT inter_h = inter_top - inter_bottom;
258 if (MaxWidth > AvgWidth * 6 / 5) { 609 if (inter_h < (m_Top - m_Bottom) / 2 &&
259 MaxWidth = AvgWidth * 6 / 5; 610 inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) {
260 } 611 return FALSE;
261 FX_Free(widths); 612 }
262 if (iMinWidth < MaxWidth) { 613 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
263 iMinWidth = MaxWidth; 614 for (int i = 0; i < m_TextList.GetSize(); i++) {
264 } 615 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
265 } 616 for (int j = 0; j < pOther->m_TextList.GetSize(); j++) {
266 } 617 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
267 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 618 FX_FLOAT inter_left, inter_right;
268 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 619 if (!GetIntersection(pText->m_Left,
269 pBaseLine->MergeBoxes(); 620 pText->m_Right,
270 } 621 pOtherText->m_Left,
271 if (m_bKeepColumn) { 622 pOtherText->m_Right,
272 FindColumns(); 623 inter_left,
273 } 624 inter_right)) {
274 for (i = 0; i < m_BaseLines.GetSize(); i ++) { 625 continue;
275 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); 626 }
276 if (lastheight >= 0) { 627 FX_FLOAT inter_w = inter_right - inter_left;
277 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; 628 if (inter_w < pText->m_SpaceWidth / 2 &&
278 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) { 629 inter_w < pOtherText->m_SpaceWidth / 2) {
279 lines.Add(L""); 630 continue;
280 } 631 }
281 } 632 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
282 lastheight = pBaseLine->m_MaxFontSizeV; 633 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
283 lastbaseline = pBaseLine->m_BaseLine; 634 return FALSE;
284 CFX_WideString str; 635 }
285 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); 636 }
286 lines.Add(str); 637 }
287 } 638 return TRUE;
288 } 639 }
289 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) 640 void CTextBaseLine::Merge(CTextBaseLine* pOther) {
290 { 641 for (int i = 0; i < pOther->m_TextList.GetSize(); i++) {
291 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); 642 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
292 FX_LPWSTR pDst = NULL; 643 InsertTextBox(pText->m_Left,
293 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); 644 pText->m_Right,
294 if (nCount < 1 ) { 645 pText->m_Top,
295 sDest += wChar; 646 pText->m_Bottom,
296 return; 647 pText->m_SpaceWidth,
297 } 648 pText->m_FontSizeV,
298 pDst = new FX_WCHAR[nCount]; 649 pText->m_Text);
299 FX_Unicode_GetNormalization(wChar, pDst); 650 }
300 for (int nIndex = 0; nIndex < nCount; nIndex++) { 651 }
301 sDest += pDst[nIndex]; 652 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) {
302 } 653 int i;
303 delete[] pDst; 654 for (i = 0; i < m_TextList.GetSize(); i++) {
304 } 655 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
305 void NormalizeString(CFX_WideString& str) 656 if (pText->m_Text != L" ") {
306 { 657 break;
307 if (str.GetLength() <= 0) { 658 }
308 return; 659 }
309 } 660 if (i == m_TextList.GetSize()) {
310 CFX_WideString sBuffer; 661 return FALSE;
311 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); 662 }
312 if (NULL == BidiChar) { 663 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
313 return; 664 leftx = pText->m_Left;
314 } 665 for (i = m_TextList.GetSize() - 1; i >= 0; i--) {
315 CFX_WordArray order; 666 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
316 FX_BOOL bR2L = FALSE; 667 if (pText->m_Text != L" ") {
317 FX_INT32 start = 0, count = 0, i = 0; 668 break;
318 int nR2L = 0, nL2R = 0; 669 }
319 for (i = 0; i < str.GetLength(); i++) { 670 }
320 if(BidiChar->AppendChar(str.GetAt(i))) { 671 pText = (CTextBox*)m_TextList.GetAt(i);
321 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 672 rightx = pText->m_Right;
322 order.Add(start); 673 return TRUE;
323 order.Add(count); 674 }
324 order.Add(ret); 675 void CTextBaseLine::MergeBoxes() {
325 if(!bR2L) { 676 int i = 0;
326 if(ret == 2) { 677 while (1) {
327 nR2L++; 678 if (i >= m_TextList.GetSize() - 1) {
328 } else if (ret == 1) { 679 break;
329 nL2R++; 680 }
330 } 681 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i);
331 } 682 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1);
332 } 683 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right;
333 } 684 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0)
334 if(BidiChar->EndChar()) { 685 ? pNextText->m_SpaceWidth
335 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); 686 : pThisText->m_SpaceWidth;
336 order.Add(start); 687 if (spacew > 0.0 && dx < spacew * 2) {
337 order.Add(count); 688 pThisText->m_Right = pNextText->m_Right;
338 order.Add(ret); 689 if (dx > spacew * 1.5) {
339 if(!bR2L) { 690 pThisText->m_Text += L" ";
340 if(ret == 2) { 691 } else if (dx > spacew / 3) {
341 nR2L++; 692 pThisText->m_Text += L' ';
342 } else if(ret == 1) { 693 }
343 nL2R++; 694 pThisText->m_Text += pNextText->m_Text;
344 } 695 pThisText->m_SpaceWidth =
345 } 696 pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth;
346 } 697 m_TextList.RemoveAt(i + 1);
347 if(nR2L > 0 && nR2L >= nL2R) { 698 delete pNextText;
348 bR2L = TRUE;
349 }
350 if(bR2L) {
351 int count = order.GetSize();
352 for(int j = count - 1; j > 0; j -= 3) {
353 int ret = order.GetAt(j);
354 int start = order.GetAt(j - 2);
355 int count1 = order.GetAt(j - 1);
356 if(ret == 2 || ret == 0) {
357 for(int i = start + count1 - 1; i >= start; i--) {
358 NormalizeCompositeChar(str[i], sBuffer);
359 }
360 } else {
361 i = j;
362 FX_BOOL bSymbol = FALSE;
363 while(i > 0 && order.GetAt(i) != 2) {
364 bSymbol = !order.GetAt(i);
365 i -= 3;
366 }
367 int end = start + count1 ;
368 int n = 0;
369 if(bSymbol) {
370 n = i + 6;
371 } else {
372 n = i + 3;
373 }
374 if(n >= j) {
375 for(int m = start; m < end; m++) {
376 sBuffer += str[m];
377 }
378 } else {
379 i = j;
380 j = n;
381 for(; n <= i; n += 3) {
382 int start = order.GetAt(n - 2);
383 int count1 = order.GetAt(n - 1);
384 int end = start + count1 ;
385 for(int m = start; m < end; m++) {
386 sBuffer += str[m];
387 }
388 }
389 }
390 }
391 }
392 } else { 699 } else {
393 int count = order.GetSize(); 700 i++;
394 FX_BOOL bL2R = FALSE; 701 }
395 for(int j = 0; j < count; j += 3) { 702 }
396 int ret = order.GetAt(j + 2); 703 }
397 int start = order.GetAt(j); 704 void CTextBaseLine::WriteOutput(CFX_WideString& str,
398 int count1 = order.GetAt(j + 1); 705 FX_FLOAT leftx,
399 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) { 706 FX_FLOAT pagewidth,
400 int i = j + 3; 707 int iTextWidth) {
401 while(bR2L && i < count) { 708 int lastpos = -1;
402 if(order.GetAt(i + 2) == 1) { 709 for (int i = 0; i < m_TextList.GetSize(); i++) {
403 break;
404 } else {
405 i += 3;
406 }
407 }
408 if(i == 3) {
409 j = -3;
410 bL2R = TRUE;
411 continue;
412 }
413 int end = str.GetLength() - 1;
414 if(i < count) {
415 end = order.GetAt(i) - 1;
416 }
417 j = i - 3;
418 for(int n = end; n >= start; n--) {
419 NormalizeCompositeChar(str[i], sBuffer);
420 }
421 } else {
422 int end = start + count1 ;
423 for(int i = start; i < end; i++) {
424 sBuffer += str[i];
425 }
426 }
427 }
428 }
429 str.Empty();
430 str += sBuffer;
431 BidiChar->Release();
432 }
433 static FX_BOOL IsNumber(CFX_WideString& str)
434 {
435 for (int i = 0; i < str.GetLength(); i ++) {
436 FX_WCHAR ch = str[i];
437 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch != ' ') {
438 return FALSE;
439 }
440 }
441 return TRUE;
442 }
443 void CTextPage::FindColumns()
444 {
445 int i;
446 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
447 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
448 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
449 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
450 CTextColumn* pColumn = FindColumn(pTextBox->m_Right);
451 if (pColumn == NULL) {
452 pColumn = FX_NEW CTextColumn;
453 if (pColumn) {
454 pColumn->m_Count = 1;
455 pColumn->m_AvgPos = pTextBox->m_Right;
456 pColumn->m_TextPos = -1;
457 m_TextColumns.Add(pColumn);
458 }
459 } else {
460 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTex tBox->m_Right) /
461 (pColumn->m_Count + 1);
462 pColumn->m_Count ++;
463 }
464 }
465 }
466 int mincount = m_BaseLines.GetSize() / 4;
467 for (i = 0; i < m_TextColumns.GetSize(); i ++) {
468 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i);
469 if (pTextColumn->m_Count >= mincount) {
470 continue;
471 }
472 delete pTextColumn;
473 m_TextColumns.RemoveAt(i);
474 i --;
475 }
476 for (i = 0; i < m_BaseLines.GetSize(); i ++) {
477 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i);
478 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) {
479 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j);
480 if (IsNumber(pTextBox->m_Text)) {
481 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right);
482 }
483 }
484 }
485 }
486 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos)
487 {
488 for (int i = 0; i < m_TextColumns.GetSize(); i ++) {
489 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i);
490 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) {
491 return pColumn;
492 }
493 }
494 return NULL;
495 }
496 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj)
497 {
498 }
499 CTextBaseLine::CTextBaseLine()
500 {
501 m_Top = -100000;
502 m_Bottom = 100000;
503 m_MaxFontSizeV = 0;
504 }
505 CTextBaseLine::~CTextBaseLine()
506 {
507 for (int i = 0; i < m_TextList.GetSize(); i ++) {
508 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
509 delete pText;
510 }
511 }
512 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy , FX_FLOAT bottomy,
513 FX_FLOAT spacew, FX_FLOAT fontsize_v, const CF X_WideString& text)
514 {
515 if (m_Top < topy) {
516 m_Top = topy;
517 }
518 if (m_Bottom > bottomy) {
519 m_Bottom = bottomy;
520 }
521 if (m_MaxFontSizeV < fontsize_v) {
522 m_MaxFontSizeV = fontsize_v;
523 }
524 int i;
525 for (i = 0; i < m_TextList.GetSize(); i ++) {
526 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
527 if (pText->m_Left > leftx) {
528 break;
529 }
530 }
531 CTextBox* pText = FX_NEW CTextBox;
532 if (NULL == pText) {
533 return;
534 }
535 pText->m_Text = text;
536 pText->m_Left = leftx;
537 pText->m_Right = rightx;
538 pText->m_Top = topy;
539 pText->m_Bottom = bottomy;
540 pText->m_SpaceWidth = spacew;
541 pText->m_FontSizeV = fontsize_v;
542 pText->m_pColumn = NULL;
543 m_TextList.InsertAt(i, pText);
544 }
545 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT h igh2,
546 FX_FLOAT& interlow, FX_FLOAT& interhigh);
547 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther)
548 {
549 FX_FLOAT inter_top, inter_bottom;
550 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top,
551 inter_bottom, inter_top)) {
552 return FALSE;
553 }
554 FX_FLOAT inter_h = inter_top - inter_bottom;
555 if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m _Bottom) / 2) {
556 return FALSE;
557 }
558 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine);
559 for (int i = 0; i < m_TextList.GetSize(); i ++) {
560 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
561 for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) {
562 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j);
563 FX_FLOAT inter_left, inter_right;
564 if (!GetIntersection(pText->m_Left, pText->m_Right,
565 pOtherText->m_Left, pOtherText->m_Right, inter_ left, inter_right)) {
566 continue;
567 }
568 FX_FLOAT inter_w = inter_right - inter_left;
569 if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_Spa ceWidth / 2) {
570 continue;
571 }
572 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 ||
573 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) {
574 return FALSE;
575 }
576 }
577 }
578 return TRUE;
579 }
580 void CTextBaseLine::Merge(CTextBaseLine* pOther)
581 {
582 for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) {
583 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i);
584 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bott om,
585 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text);
586 }
587 }
588 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx)
589 {
590 int i;
591 for (i = 0; i < m_TextList.GetSize(); i ++) {
592 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
593 if (pText->m_Text != L" ") {
594 break;
595 }
596 }
597 if (i == m_TextList.GetSize()) {
598 return FALSE;
599 }
600 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 710 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
601 leftx = pText->m_Left; 711 int xpos;
602 for (i = m_TextList.GetSize() - 1; i >= 0; i --) { 712 if (pText->m_pColumn) {
603 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); 713 xpos =
604 if (pText->m_Text != L" ") { 714 (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth +
605 break; 715 0.5);
606 } 716 xpos -= pText->m_Text.GetLength();
607 } 717 } else {
608 pText = (CTextBox*)m_TextList.GetAt(i); 718 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5);
609 rightx = pText->m_Right; 719 }
610 return TRUE; 720 if (xpos <= lastpos) {
611 } 721 xpos = lastpos + 1;
612 void CTextBaseLine::MergeBoxes() 722 }
613 { 723 for (int j = lastpos + 1; j < xpos; j++) {
614 int i = 0; 724 str += ' ';
615 while (1) { 725 }
616 if (i >= m_TextList.GetSize() - 1) { 726 CFX_WideString sSrc(pText->m_Text);
617 break; 727 NormalizeString(sSrc);
618 } 728 str += sSrc;
619 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i); 729 str += ' ';
620 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1); 730 lastpos = xpos + pText->m_Text.GetLength();
621 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; 731 }
622 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ? 732 }
623 pNextText->m_SpaceWidth : pThisText->m_SpaceWidth; 733 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) {
624 if (spacew > 0.0 && dx < spacew * 2) { 734 minchars = 0;
625 pThisText->m_Right = pNextText->m_Right; 735 for (int i = 0; i < m_TextList.GetSize(); i++) {
626 if (dx > spacew * 1.5) { 736 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
627 pThisText->m_Text += L" "; 737 if (pText->m_Right - pText->m_Left < 0.002) {
628 } else if (dx > spacew / 3) { 738 continue;
629 pThisText->m_Text += L' '; 739 }
630 } 740 count += pText->m_Text.GetLength();
631 pThisText->m_Text += pNextText->m_Text; 741 width += pText->m_Right - pText->m_Left;
632 pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ? 742 minchars += pText->m_Text.GetLength() + 1;
633 spacew : pNextText->m_SpaceWidth; 743 }
634 m_TextList.RemoveAt(i + 1);
635 delete pNextText;
636 } else {
637 i ++;
638 }
639 }
640 }
641 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pa gewidth,
642 int iTextWidth)
643 {
644 int lastpos = -1;
645 for (int i = 0; i < m_TextList.GetSize(); i ++) {
646 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
647 int xpos;
648 if (pText->m_pColumn) {
649 xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pag ewidth + 0.5);
650 xpos -= pText->m_Text.GetLength();
651 } else {
652 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5) ;
653 }
654 if (xpos <= lastpos) {
655 xpos = lastpos + 1;
656 }
657 for (int j = lastpos + 1; j < xpos; j ++) {
658 str += ' ';
659 }
660 CFX_WideString sSrc(pText->m_Text);
661 NormalizeString(sSrc);
662 str += sSrc;
663 str += ' ';
664 lastpos = xpos + pText->m_Text.GetLength();
665 }
666 }
667 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars)
668 {
669 minchars = 0;
670 for (int i = 0; i < m_TextList.GetSize(); i ++) {
671 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i);
672 if (pText->m_Right - pText->m_Left < 0.002) {
673 continue;
674 }
675 count += pText->m_Text.GetLength();
676 width += pText->m_Right - pText->m_Left;
677 minchars += pText->m_Text.GetLength() + 1;
678 }
679 } 744 }
680 #define PI 3.1415926535897932384626433832795 745 #define PI 3.1415926535897932384626433832795
681 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) 746 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) {
682 { 747 int total_count = 0, rotated_count[3] = { 0, 0, 0 };
683 int total_count = 0, rotated_count[3] = {0, 0, 0}; 748 FX_POSITION pos = page.GetFirstObjectPosition();
684 FX_POSITION pos = page.GetFirstObjectPosition(); 749 while (pos) {
685 while (pos) { 750 CPDF_PageObject* pObj = page.GetNextObject(pos);
686 CPDF_PageObject* pObj = page.GetNextObject(pos); 751 if (pObj->m_Type != PDFPAGE_TEXT) {
687 if (pObj->m_Type != PDFPAGE_TEXT) { 752 continue;
688 continue; 753 }
689 } 754 total_count++;
690 total_count ++; 755 CPDF_TextObject* pText = (CPDF_TextObject*)pObj;
691 CPDF_TextObject* pText = (CPDF_TextObject*)pObj; 756 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle();
692 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); 757 if (angle == 0.0) {
693 if (angle == 0.0) { 758 continue;
694 continue; 759 }
695 } 760 int degree = (int)(angle * 180 / PI + 0.5);
696 int degree = (int)(angle * 180 / PI + 0.5); 761 if (degree % 90) {
697 if (degree % 90) { 762 continue;
698 continue; 763 }
699 } 764 if (degree < 0) {
700 if (degree < 0) { 765 degree += 360;
701 degree += 360; 766 }
702 } 767 int index = degree / 90 % 3 - 1;
703 int index = degree / 90 % 3 - 1; 768 if (index < 0) {
704 if (index < 0) { 769 continue;
705 continue; 770 }
706 } 771 rotated_count[index]++;
707 rotated_count[index] ++; 772 }
708 } 773 if (total_count == 0) {
709 if (total_count == 0) { 774 return;
710 return; 775 }
711 } 776 CFX_AffineMatrix matrix;
712 CFX_AffineMatrix matrix; 777 if (rotated_count[0] > total_count * 2 / 3) {
713 if (rotated_count[0] > total_count * 2 / 3) { 778 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight());
714 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); 779 } else if (rotated_count[1] > total_count * 2 / 3) {
715 } else if (rotated_count[1] > total_count * 2 / 3) { 780 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight());
716 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); 781 } else if (rotated_count[2] > total_count * 2 / 3) {
717 } else if (rotated_count[2] > total_count * 2 / 3) { 782 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0);
718 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); 783 } else {
719 } else { 784 return;
720 return; 785 }
721 } 786 page.Transform(matrix);
722 page.Transform(matrix); 787 page_bbox.Transform(&matrix);
723 page_bbox.Transform(&matrix); 788 }
724 } 789 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines,
725 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CP DF_Dictionary* pPage, 790 CPDF_Document* pDoc,
726 int iMinWidth, FX_DWORD flags) 791 CPDF_Dictionary* pPage,
727 { 792 int iMinWidth,
728 lines.RemoveAll(); 793 FX_DWORD flags) {
729 if (pPage == NULL) { 794 lines.RemoveAll();
730 return; 795 if (pPage == NULL) {
731 } 796 return;
732 CPDF_Page page; 797 }
733 page.Load(pDoc, pPage); 798 CPDF_Page page;
734 CPDF_ParseOptions options; 799 page.Load(pDoc, pPage);
735 options.m_bTextOnly = TRUE; 800 CPDF_ParseOptions options;
736 options.m_bSeparateForm = FALSE; 801 options.m_bTextOnly = TRUE;
737 page.ParseContent(&options); 802 options.m_bSeparateForm = FALSE;
738 CFX_FloatRect page_bbox = page.GetPageBBox(); 803 page.ParseContent(&options);
739 if (flags & PDF2TXT_AUTO_ROTATE) { 804 CFX_FloatRect page_bbox = page.GetPageBBox();
740 CheckRotate(page, page_bbox); 805 if (flags & PDF2TXT_AUTO_ROTATE) {
741 } 806 CheckRotate(page, page_bbox);
742 CTextPage texts; 807 }
743 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; 808 CTextPage texts;
744 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; 809 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH;
745 texts.m_bBreakSpace = TRUE; 810 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN;
746 FX_POSITION pos = page.GetFirstObjectPosition(); 811 texts.m_bBreakSpace = TRUE;
747 while (pos) { 812 FX_POSITION pos = page.GetFirstObjectPosition();
748 CPDF_PageObject* pObject = page.GetNextObject(pos); 813 while (pos) {
749 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { 814 CPDF_PageObject* pObject = page.GetNextObject(pos);
750 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Ri ght, pObject->m_Top); 815 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) {
751 if (!page_bbox.Contains(rect)) { 816 CFX_FloatRect rect(
752 continue; 817 pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top);
753 } 818 if (!page_bbox.Contains(rect)) {
754 } 819 continue;
755 texts.ProcessObject(pObject); 820 }
756 } 821 }
757 texts.WriteOutput(lines, iMinWidth); 822 texts.ProcessObject(pObject);
758 } 823 }
759 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dicti onary* pPage, 824 texts.WriteOutput(lines, iMinWidth);
760 int iMinWidth, FX_DWORD flags) 825 }
761 { 826 void PDF_GetPageText(CFX_ByteStringArray& lines,
762 lines.RemoveAll(); 827 CPDF_Document* pDoc,
763 CFX_WideStringArray wlines; 828 CPDF_Dictionary* pPage,
764 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); 829 int iMinWidth,
765 for (int i = 0; i < wlines.GetSize(); i ++) { 830 FX_DWORD flags) {
766 CFX_WideString wstr = wlines[i]; 831 lines.RemoveAll();
767 CFX_ByteString str; 832 CFX_WideStringArray wlines;
768 for (int c = 0; c < wstr.GetLength(); c ++) { 833 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags);
769 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); 834 for (int i = 0; i < wlines.GetSize(); i++) {
770 } 835 CFX_WideString wstr = wlines[i];
771 lines.Add(str); 836 CFX_ByteString str;
772 } 837 for (int c = 0; c < wstr.GetLength(); c++) {
838 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?");
839 }
840 lines.Add(str);
841 }
773 } 842 }
774 #endif 843 #endif
775 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects * pPage, FX_BOOL bUseLF, 844 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
845 CPDF_PageObjects* pPage,
846 FX_BOOL bUseLF,
776 CFX_PtrArray* pObjArray); 847 CFX_PtrArray* pObjArray);
777 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPD F_Dictionary* pPage, FX_DWORD flags) 848 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer,
778 { 849 CPDF_Document* pDoc,
779 buffer.EstimateSize(0, 10240); 850 CPDF_Dictionary* pPage,
780 CPDF_Page page; 851 FX_DWORD flags) {
781 page.Load(pDoc, pPage); 852 buffer.EstimateSize(0, 10240);
782 CPDF_ParseOptions options; 853 CPDF_Page page;
783 options.m_bTextOnly = TRUE; 854 page.Load(pDoc, pPage);
784 options.m_bSeparateForm = FALSE; 855 CPDF_ParseOptions options;
785 page.ParseContent(&options); 856 options.m_bTextOnly = TRUE;
786 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL); 857 options.m_bSeparateForm = FALSE;
787 } 858 page.ParseContent(&options);
859 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL);
860 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698