| OLD | NEW |
| 1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| 6 | 6 |
| 7 #include "../../include/fpdfapi/fpdf_page.h" | 7 #include "../../include/fpdfapi/fpdf_page.h" |
| 8 #include "../../include/fpdfapi/fpdf_pageobj.h" | 8 #include "../../include/fpdfapi/fpdf_pageobj.h" |
| 9 #include "../../include/fpdftext/fpdf_text.h" | 9 #include "../../include/fpdftext/fpdf_text.h" |
| 10 #include "txtproc.h" | 10 #include "txtproc.h" |
| 11 #include "text_int.h" | 11 #include "text_int.h" |
| 12 #if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_) | 12 #if !defined(_FPDFAPI_MINI_) || defined(_FXCORE_FEATURE_ALL_) |
| 13 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR); | 13 extern FX_LPCSTR FCS_GetAltStr(FX_WCHAR); |
| 14 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, int destcp, FX_LPCSTR defcha
r) | 14 CFX_ByteString CharFromUnicodeAlt(FX_WCHAR unicode, |
| 15 { | 15 int destcp, |
| 16 if (destcp == 0) { | 16 FX_LPCSTR defchar) { |
| 17 if (unicode < 0x80) { | 17 if (destcp == 0) { |
| 18 return CFX_ByteString((char)unicode); | 18 if (unicode < 0x80) { |
| 19 } | 19 return CFX_ByteString((char)unicode); |
| 20 FX_LPCSTR altstr = FCS_GetAltStr(unicode); | |
| 21 if (altstr) { | |
| 22 return CFX_ByteString(altstr, -1); | |
| 23 } | |
| 24 return CFX_ByteString(defchar, -1); | |
| 25 } | |
| 26 FX_BOOL bDef = FALSE; | |
| 27 char buf[10]; | |
| 28 int ret = FXSYS_WideCharToMultiByte(destcp, 0, (wchar_t*)&unicode, 1, buf, 1
0, NULL, &bDef); | |
| 29 if (ret && !bDef) { | |
| 30 return CFX_ByteString(buf, ret); | |
| 31 } | 20 } |
| 32 FX_LPCSTR altstr = FCS_GetAltStr(unicode); | 21 FX_LPCSTR altstr = FCS_GetAltStr(unicode); |
| 33 if (altstr) { | 22 if (altstr) { |
| 34 return CFX_ByteString(altstr, -1); | 23 return CFX_ByteString(altstr, -1); |
| 35 } | 24 } |
| 36 return CFX_ByteString(defchar, -1); | 25 return CFX_ByteString(defchar, -1); |
| 37 } | 26 } |
| 38 CTextPage::CTextPage() | 27 FX_BOOL bDef = FALSE; |
| 39 { | 28 char buf[10]; |
| 40 } | 29 int ret = FXSYS_WideCharToMultiByte( |
| 41 CTextPage::~CTextPage() | 30 destcp, 0, (wchar_t*)&unicode, 1, buf, 10, NULL, &bDef); |
| 42 { | 31 if (ret && !bDef) { |
| 32 return CFX_ByteString(buf, ret); |
| 33 } |
| 34 FX_LPCSTR altstr = FCS_GetAltStr(unicode); |
| 35 if (altstr) { |
| 36 return CFX_ByteString(altstr, -1); |
| 37 } |
| 38 return CFX_ByteString(defchar, -1); |
| 39 } |
| 40 CTextPage::CTextPage() { |
| 41 } |
| 42 CTextPage::~CTextPage() { |
| 43 int i; |
| 44 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 45 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 46 delete pBaseLine; |
| 47 } |
| 48 for (i = 0; i < m_TextColumns.GetSize(); i++) { |
| 49 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); |
| 50 delete pTextColumn; |
| 51 } |
| 52 } |
| 53 void CTextPage::ProcessObject(CPDF_PageObject* pObject) { |
| 54 if (pObject->m_Type != PDFPAGE_TEXT) { |
| 55 return; |
| 56 } |
| 57 CPDF_TextObject* pText = (CPDF_TextObject*)pObject; |
| 58 CPDF_Font* pFont = pText->m_TextState.GetFont(); |
| 59 int count = pText->CountItems(); |
| 60 FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2); |
| 61 if (pPosArray) { |
| 62 pText->CalcCharPos(pPosArray); |
| 63 } |
| 64 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); |
| 65 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); |
| 66 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); |
| 67 FX_FLOAT spacew = 0; |
| 68 if (space_charcode != -1) { |
| 69 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; |
| 70 } |
| 71 if (spacew == 0) { |
| 72 spacew = fontsize_h / 4; |
| 73 } |
| 74 if (pText->m_TextState.GetBaselineAngle() != 0) { |
| 75 int cc = 0; |
| 76 CFX_AffineMatrix matrix; |
| 77 pText->GetTextMatrix(&matrix); |
| 78 for (int i = 0; i < pText->m_nChars; i++) { |
| 79 FX_DWORD charcode = pText->m_nChars == 1 |
| 80 ? (FX_DWORD)(FX_UINTPTR) pText->m_pCharCodes |
| 81 : pText->m_pCharCodes[i]; |
| 82 if (charcode == (FX_DWORD)-1) { |
| 83 continue; |
| 84 } |
| 85 FX_RECT char_box; |
| 86 pFont->GetCharBBox(charcode, char_box); |
| 87 FX_FLOAT char_left = |
| 88 pPosArray ? pPosArray[cc * 2] |
| 89 : char_box.left * pText->m_TextState.GetFontSize() / 1000; |
| 90 FX_FLOAT char_right = |
| 91 pPosArray ? pPosArray[cc * 2 + 1] |
| 92 : char_box.right * pText->m_TextState.GetFontSize() / 1000; |
| 93 FX_FLOAT char_top = |
| 94 char_box.top * pText->m_TextState.GetFontSize() / 1000; |
| 95 FX_FLOAT char_bottom = |
| 96 char_box.bottom * pText->m_TextState.GetFontSize() / 1000; |
| 97 cc++; |
| 98 FX_FLOAT char_origx, char_origy; |
| 99 matrix.Transform(char_left, 0, char_origx, char_origy); |
| 100 matrix.TransformRect(char_left, char_right, char_top, char_bottom); |
| 101 CFX_ByteString str; |
| 102 pFont->AppendChar(str, charcode); |
| 103 InsertTextBox(NULL, |
| 104 char_origy, |
| 105 char_left, |
| 106 char_right, |
| 107 char_top, |
| 108 char_bottom, |
| 109 spacew, |
| 110 fontsize_v, |
| 111 str, |
| 112 pFont); |
| 113 } |
| 114 if (pPosArray) { |
| 115 FX_Free(pPosArray); |
| 116 } |
| 117 return; |
| 118 } |
| 119 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); |
| 120 for (int ii = 0; ii < count * 2; ii++) { |
| 121 pPosArray[ii] *= ratio_h; |
| 122 } |
| 123 FX_FLOAT baseline = pText->m_PosY; |
| 124 CTextBaseLine* pBaseLine = NULL; |
| 125 FX_FLOAT topy = pText->m_Top; |
| 126 FX_FLOAT bottomy = pText->m_Bottom; |
| 127 FX_FLOAT leftx = pText->m_Left; |
| 128 int cc = 0; |
| 129 CFX_ByteString segment; |
| 130 int space_count = 0; |
| 131 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; |
| 132 for (int i = 0; i < pText->m_nChars; i++) { |
| 133 FX_DWORD charcode = pText->m_nChars == 1 |
| 134 ? (FX_DWORD)(FX_UINTPTR) pText->m_pCharCodes |
| 135 : pText->m_pCharCodes[i]; |
| 136 if (charcode == (FX_DWORD)-1) { |
| 137 continue; |
| 138 } |
| 139 FX_FLOAT char_left = pPosArray[cc * 2]; |
| 140 FX_FLOAT char_right = pPosArray[cc * 2 + 1]; |
| 141 cc++; |
| 142 if (char_left < last_left || (char_left - last_right) > spacew / 2) { |
| 143 pBaseLine = InsertTextBox(pBaseLine, |
| 144 baseline, |
| 145 leftx + segment_left, |
| 146 leftx + segment_right, |
| 147 topy, |
| 148 bottomy, |
| 149 spacew, |
| 150 fontsize_v, |
| 151 segment, |
| 152 pFont); |
| 153 segment_left = char_left; |
| 154 segment = ""; |
| 155 } |
| 156 if (space_count > 1) { |
| 157 pBaseLine = InsertTextBox(pBaseLine, |
| 158 baseline, |
| 159 leftx + segment_left, |
| 160 leftx + segment_right, |
| 161 topy, |
| 162 bottomy, |
| 163 spacew, |
| 164 fontsize_v, |
| 165 segment, |
| 166 pFont); |
| 167 segment = ""; |
| 168 } else if (space_count == 1) { |
| 169 pFont->AppendChar(segment, ' '); |
| 170 } |
| 171 if (segment.GetLength() == 0) { |
| 172 segment_left = char_left; |
| 173 } |
| 174 segment_right = char_right; |
| 175 pFont->AppendChar(segment, charcode); |
| 176 space_count = 0; |
| 177 last_left = char_left; |
| 178 last_right = char_right; |
| 179 } |
| 180 if (segment.GetLength()) |
| 181 pBaseLine = InsertTextBox(pBaseLine, |
| 182 baseline, |
| 183 leftx + segment_left, |
| 184 leftx + segment_right, |
| 185 topy, |
| 186 bottomy, |
| 187 spacew, |
| 188 fontsize_v, |
| 189 segment, |
| 190 pFont); |
| 191 FX_Free(pPosArray); |
| 192 } |
| 193 static void ConvertPDFString(CFX_ByteString& result, |
| 194 CFX_ByteString& src, |
| 195 CPDF_Font* pFont); |
| 196 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, |
| 197 FX_FLOAT basey, |
| 198 FX_FLOAT leftx, |
| 199 FX_FLOAT rightx, |
| 200 FX_FLOAT topy, |
| 201 FX_FLOAT bottomy, |
| 202 FX_FLOAT spacew, |
| 203 FX_FLOAT fontsize_v, |
| 204 CFX_ByteString& str, |
| 205 CPDF_Font* pFont) { |
| 206 if (str.GetLength() == 0) { |
| 207 return NULL; |
| 208 } |
| 209 if (pBaseLine == NULL) { |
| 43 int i; | 210 int i; |
| 44 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | 211 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 212 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 213 if (pExistLine->m_BaseLine == basey) { |
| 214 pBaseLine = pExistLine; |
| 215 break; |
| 216 } |
| 217 if (pExistLine->m_BaseLine < basey) { |
| 218 break; |
| 219 } |
| 220 } |
| 221 if (pBaseLine == NULL) { |
| 222 pBaseLine = FX_NEW CTextBaseLine; |
| 223 if (NULL == pBaseLine) { |
| 224 return NULL; |
| 225 } |
| 226 pBaseLine->m_BaseLine = basey; |
| 227 m_BaseLines.InsertAt(i, pBaseLine); |
| 228 } |
| 229 } |
| 230 CFX_WideString text; |
| 231 FX_LPCSTR pStr = str; |
| 232 int len = str.GetLength(), offset = 0; |
| 233 while (offset < len) { |
| 234 FX_DWORD ch = pFont->GetNextChar(pStr, offset); |
| 235 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); |
| 236 text += unicode_str; |
| 237 } |
| 238 pBaseLine->InsertTextBox( |
| 239 leftx, rightx, topy, bottomy, spacew, fontsize_v, text); |
| 240 return pBaseLine; |
| 241 } |
| 242 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) { |
| 243 FX_FLOAT lastheight = -1; |
| 244 FX_FLOAT lastbaseline = -1; |
| 245 FX_FLOAT MinLeftX = 1000000; |
| 246 FX_FLOAT MaxRightX = 0; |
| 247 int i; |
| 248 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 249 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 250 FX_FLOAT leftx, rightx; |
| 251 if (pBaseLine->GetWidth(leftx, rightx)) { |
| 252 if (leftx < MinLeftX) { |
| 253 MinLeftX = leftx; |
| 254 } |
| 255 if (rightx > MaxRightX) { |
| 256 MaxRightX = rightx; |
| 257 } |
| 258 } |
| 259 } |
| 260 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 261 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 262 pBaseLine->MergeBoxes(); |
| 263 } |
| 264 for (i = 1; i < m_BaseLines.GetSize(); i++) { |
| 265 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 266 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1); |
| 267 if (pBaseLine->CanMerge(pPrevLine)) { |
| 268 pPrevLine->Merge(pBaseLine); |
| 269 delete pBaseLine; |
| 270 m_BaseLines.RemoveAt(i); |
| 271 i--; |
| 272 } |
| 273 } |
| 274 if (m_bAutoWidth) { |
| 275 int* widths = FX_Alloc(int, m_BaseLines.GetSize()); |
| 276 if (widths) { |
| 277 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 278 widths[i] = 0; |
| 45 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | 279 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 46 delete pBaseLine; | 280 int TotalChars = 0; |
| 47 } | 281 FX_FLOAT TotalWidth = 0; |
| 48 for (i = 0; i < m_TextColumns.GetSize(); i ++) { | 282 int minchars; |
| 49 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); | 283 pBaseLine->CountChars(TotalChars, TotalWidth, minchars); |
| 50 delete pTextColumn; | 284 if (TotalChars) { |
| 51 } | 285 FX_FLOAT charwidth = TotalWidth / TotalChars; |
| 52 } | 286 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); |
| 53 void CTextPage::ProcessObject(CPDF_PageObject* pObject) | 287 } |
| 54 { | 288 if (widths[i] > 1000) { |
| 55 if (pObject->m_Type != PDFPAGE_TEXT) { | 289 widths[i] = 1000; |
| 56 return; | 290 } |
| 57 } | 291 if (widths[i] < minchars) { |
| 58 CPDF_TextObject* pText = (CPDF_TextObject*)pObject; | 292 widths[i] = minchars; |
| 59 CPDF_Font* pFont = pText->m_TextState.GetFont(); | 293 } |
| 60 int count = pText->CountItems(); | 294 } |
| 61 FX_FLOAT* pPosArray = FX_Alloc(FX_FLOAT, count * 2); | 295 int AvgWidth = 0, widthcount = 0; |
| 62 if (pPosArray) { | 296 for (i = 0; i < m_BaseLines.GetSize(); i++) |
| 63 pText->CalcCharPos(pPosArray); | 297 if (widths[i]) { |
| 64 } | 298 AvgWidth += widths[i]; |
| 65 FX_FLOAT fontsize_h = pText->m_TextState.GetFontSizeH(); | 299 widthcount++; |
| 66 FX_FLOAT fontsize_v = pText->m_TextState.GetFontSizeV(); | 300 } |
| 67 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); | 301 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); |
| 68 FX_FLOAT spacew = 0; | 302 int MaxWidth = 0; |
| 69 if (space_charcode != -1) { | 303 for (i = 0; i < m_BaseLines.GetSize(); i++) |
| 70 spacew = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000; | 304 if (MaxWidth < widths[i]) { |
| 71 } | 305 MaxWidth = widths[i]; |
| 72 if (spacew == 0) { | 306 } |
| 73 spacew = fontsize_h / 4; | 307 if (MaxWidth > AvgWidth * 6 / 5) { |
| 74 } | 308 MaxWidth = AvgWidth * 6 / 5; |
| 75 if (pText->m_TextState.GetBaselineAngle() != 0) { | 309 } |
| 76 int cc = 0; | 310 FX_Free(widths); |
| 77 CFX_AffineMatrix matrix; | 311 if (iMinWidth < MaxWidth) { |
| 78 pText->GetTextMatrix(&matrix); | 312 iMinWidth = MaxWidth; |
| 79 for (int i = 0; i < pText->m_nChars; i ++) { | 313 } |
| 80 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pTe
xt->m_pCharCodes : pText->m_pCharCodes[i]; | 314 } |
| 81 if (charcode == (FX_DWORD) - 1) { | 315 } |
| 82 continue; | 316 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 317 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 318 pBaseLine->MergeBoxes(); |
| 319 } |
| 320 if (m_bKeepColumn) { |
| 321 FindColumns(); |
| 322 } |
| 323 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 324 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 325 if (lastheight >= 0) { |
| 326 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; |
| 327 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.5) { |
| 328 lines.Add(L""); |
| 329 } |
| 330 } |
| 331 lastheight = pBaseLine->m_MaxFontSizeV; |
| 332 lastbaseline = pBaseLine->m_BaseLine; |
| 333 CFX_WideString str; |
| 334 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); |
| 335 lines.Add(str); |
| 336 } |
| 337 } |
| 338 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) { |
| 339 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); |
| 340 FX_LPWSTR pDst = NULL; |
| 341 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); |
| 342 if (nCount < 1) { |
| 343 sDest += wChar; |
| 344 return; |
| 345 } |
| 346 pDst = new FX_WCHAR[nCount]; |
| 347 FX_Unicode_GetNormalization(wChar, pDst); |
| 348 for (int nIndex = 0; nIndex < nCount; nIndex++) { |
| 349 sDest += pDst[nIndex]; |
| 350 } |
| 351 delete[] pDst; |
| 352 } |
| 353 void NormalizeString(CFX_WideString& str) { |
| 354 if (str.GetLength() <= 0) { |
| 355 return; |
| 356 } |
| 357 CFX_WideString sBuffer; |
| 358 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); |
| 359 if (NULL == BidiChar) { |
| 360 return; |
| 361 } |
| 362 CFX_WordArray order; |
| 363 FX_BOOL bR2L = FALSE; |
| 364 FX_INT32 start = 0, count = 0, i = 0; |
| 365 int nR2L = 0, nL2R = 0; |
| 366 for (i = 0; i < str.GetLength(); i++) { |
| 367 if (BidiChar->AppendChar(str.GetAt(i))) { |
| 368 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); |
| 369 order.Add(start); |
| 370 order.Add(count); |
| 371 order.Add(ret); |
| 372 if (!bR2L) { |
| 373 if (ret == 2) { |
| 374 nR2L++; |
| 375 } else if (ret == 1) { |
| 376 nL2R++; |
| 377 } |
| 378 } |
| 379 } |
| 380 } |
| 381 if (BidiChar->EndChar()) { |
| 382 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); |
| 383 order.Add(start); |
| 384 order.Add(count); |
| 385 order.Add(ret); |
| 386 if (!bR2L) { |
| 387 if (ret == 2) { |
| 388 nR2L++; |
| 389 } else if (ret == 1) { |
| 390 nL2R++; |
| 391 } |
| 392 } |
| 393 } |
| 394 if (nR2L > 0 && nR2L >= nL2R) { |
| 395 bR2L = TRUE; |
| 396 } |
| 397 if (bR2L) { |
| 398 int count = order.GetSize(); |
| 399 for (int j = count - 1; j > 0; j -= 3) { |
| 400 int ret = order.GetAt(j); |
| 401 int start = order.GetAt(j - 2); |
| 402 int count1 = order.GetAt(j - 1); |
| 403 if (ret == 2 || ret == 0) { |
| 404 for (int i = start + count1 - 1; i >= start; i--) { |
| 405 NormalizeCompositeChar(str[i], sBuffer); |
| 406 } |
| 407 } else { |
| 408 i = j; |
| 409 FX_BOOL bSymbol = FALSE; |
| 410 while (i > 0 && order.GetAt(i) != 2) { |
| 411 bSymbol = !order.GetAt(i); |
| 412 i -= 3; |
| 413 } |
| 414 int end = start + count1; |
| 415 int n = 0; |
| 416 if (bSymbol) { |
| 417 n = i + 6; |
| 418 } else { |
| 419 n = i + 3; |
| 420 } |
| 421 if (n >= j) { |
| 422 for (int m = start; m < end; m++) { |
| 423 sBuffer += str[m]; |
| 424 } |
| 425 } else { |
| 426 i = j; |
| 427 j = n; |
| 428 for (; n <= i; n += 3) { |
| 429 int start = order.GetAt(n - 2); |
| 430 int count1 = order.GetAt(n - 1); |
| 431 int end = start + count1; |
| 432 for (int m = start; m < end; m++) { |
| 433 sBuffer += str[m]; |
| 83 } | 434 } |
| 84 FX_RECT char_box; | 435 } |
| 85 pFont->GetCharBBox(charcode, char_box); | 436 } |
| 86 FX_FLOAT char_left = pPosArray ? pPosArray[cc * 2] : char_box.left *
pText->m_TextState.GetFontSize() / 1000; | 437 } |
| 87 FX_FLOAT char_right = pPosArray ? pPosArray[cc * 2 + 1] : char_box.r
ight * pText->m_TextState.GetFontSize() / 1000; | 438 } |
| 88 FX_FLOAT char_top = char_box.top * pText->m_TextState.GetFontSize()
/ 1000; | 439 } else { |
| 89 FX_FLOAT char_bottom = char_box.bottom * pText->m_TextState.GetFontS
ize() / 1000; | 440 int count = order.GetSize(); |
| 90 cc ++; | 441 FX_BOOL bL2R = FALSE; |
| 91 FX_FLOAT char_origx, char_origy; | 442 for (int j = 0; j < count; j += 3) { |
| 92 matrix.Transform(char_left, 0, char_origx, char_origy); | 443 int ret = order.GetAt(j + 2); |
| 93 matrix.TransformRect(char_left, char_right, char_top, char_bottom); | 444 int start = order.GetAt(j); |
| 94 CFX_ByteString str; | 445 int count1 = order.GetAt(j + 1); |
| 95 pFont->AppendChar(str, charcode); | 446 if (ret == 2 || (j == 0 && ret == 0 && !bL2R)) { |
| 96 InsertTextBox(NULL, char_origy, char_left, char_right, char_top, | 447 int i = j + 3; |
| 97 char_bottom, spacew, fontsize_v, str, pFont); | 448 while (bR2L && i < count) { |
| 98 } | 449 if (order.GetAt(i + 2) == 1) { |
| 99 if (pPosArray) { | 450 break; |
| 100 FX_Free(pPosArray); | 451 } else { |
| 101 } | 452 i += 3; |
| 102 return; | 453 } |
| 103 } | 454 } |
| 104 FX_FLOAT ratio_h = fontsize_h / pText->m_TextState.GetFontSize(); | 455 if (i == 3) { |
| 105 for (int ii = 0; ii < count * 2; ii ++) { | 456 j = -3; |
| 106 pPosArray[ii] *= ratio_h; | 457 bL2R = TRUE; |
| 107 } | 458 continue; |
| 108 FX_FLOAT baseline = pText->m_PosY; | 459 } |
| 109 CTextBaseLine* pBaseLine = NULL; | 460 int end = str.GetLength() - 1; |
| 110 FX_FLOAT topy = pText->m_Top; | 461 if (i < count) { |
| 111 FX_FLOAT bottomy = pText->m_Bottom; | 462 end = order.GetAt(i) - 1; |
| 112 FX_FLOAT leftx = pText->m_Left; | 463 } |
| 113 int cc = 0; | 464 j = i - 3; |
| 114 CFX_ByteString segment; | 465 for (int n = end; n >= start; n--) { |
| 115 int space_count = 0; | 466 NormalizeCompositeChar(str[i], sBuffer); |
| 116 FX_FLOAT last_left = 0, last_right = 0, segment_left = 0, segment_right = 0; | 467 } |
| 117 for (int i = 0; i < pText->m_nChars; i ++) { | 468 } else { |
| 118 FX_DWORD charcode = pText->m_nChars == 1 ? (FX_DWORD)(FX_UINTPTR)pText->
m_pCharCodes : pText->m_pCharCodes[i]; | 469 int end = start + count1; |
| 119 if (charcode == (FX_DWORD) - 1) { | 470 for (int i = start; i < end; i++) { |
| 120 continue; | 471 sBuffer += str[i]; |
| 121 } | 472 } |
| 122 FX_FLOAT char_left = pPosArray[cc * 2]; | 473 } |
| 123 FX_FLOAT char_right = pPosArray[cc * 2 + 1]; | 474 } |
| 124 cc ++; | 475 } |
| 125 if (char_left < last_left || (char_left - last_right) > spacew / 2) { | 476 str.Empty(); |
| 126 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
leftx + segment_right, | 477 str += sBuffer; |
| 127 topy, bottomy, spacew, fontsize_v, segment
, pFont); | 478 BidiChar->Release(); |
| 128 segment_left = char_left; | 479 } |
| 129 segment = ""; | 480 static FX_BOOL IsNumber(CFX_WideString& str) { |
| 130 } | 481 for (int i = 0; i < str.GetLength(); i++) { |
| 131 if (space_count > 1) { | 482 FX_WCHAR ch = str[i]; |
| 132 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left,
leftx + segment_right, | 483 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && |
| 133 topy, bottomy, spacew, fontsize_v, segment
, pFont); | 484 ch != ' ') { |
| 134 segment = ""; | 485 return FALSE; |
| 135 } else if (space_count == 1) { | 486 } |
| 136 pFont->AppendChar(segment, ' '); | 487 } |
| 137 } | 488 return TRUE; |
| 138 if (segment.GetLength() == 0) { | 489 } |
| 139 segment_left = char_left; | 490 void CTextPage::FindColumns() { |
| 140 } | 491 int i; |
| 141 segment_right = char_right; | 492 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 142 pFont->AppendChar(segment, charcode); | 493 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 143 space_count = 0; | 494 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { |
| 144 last_left = char_left; | 495 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j); |
| 145 last_right = char_right; | 496 CTextColumn* pColumn = FindColumn(pTextBox->m_Right); |
| 146 } | 497 if (pColumn == NULL) { |
| 147 if (segment.GetLength()) | 498 pColumn = FX_NEW CTextColumn; |
| 148 pBaseLine = InsertTextBox(pBaseLine, baseline, leftx + segment_left, lef
tx + segment_right, | 499 if (pColumn) { |
| 149 topy, bottomy, spacew, fontsize_v, segment, pF
ont); | 500 pColumn->m_Count = 1; |
| 150 FX_Free(pPosArray); | 501 pColumn->m_AvgPos = pTextBox->m_Right; |
| 151 } | 502 pColumn->m_TextPos = -1; |
| 152 static void ConvertPDFString(CFX_ByteString& result, CFX_ByteString& src, CPDF_F
ont* pFont); | 503 m_TextColumns.Add(pColumn); |
| 153 CTextBaseLine* CTextPage::InsertTextBox(CTextBaseLine* pBaseLine, FX_FLOAT basey
, FX_FLOAT leftx, | 504 } |
| 154 FX_FLOAT rightx, FX_FLOAT topy, FX_FLOAT
bottomy, FX_FLOAT spacew, FX_FLOAT fontsize_v, | 505 } else { |
| 155 CFX_ByteString& str, CPDF_Font* pFont) | 506 pColumn->m_AvgPos = |
| 156 { | 507 (pColumn->m_Count * pColumn->m_AvgPos + pTextBox->m_Right) / |
| 157 if (str.GetLength() == 0) { | 508 (pColumn->m_Count + 1); |
| 158 return NULL; | 509 pColumn->m_Count++; |
| 159 } | 510 } |
| 160 if (pBaseLine == NULL) { | 511 } |
| 161 int i; | 512 } |
| 162 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | 513 int mincount = m_BaseLines.GetSize() / 4; |
| 163 CTextBaseLine* pExistLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | 514 for (i = 0; i < m_TextColumns.GetSize(); i++) { |
| 164 if (pExistLine->m_BaseLine == basey) { | 515 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); |
| 165 pBaseLine = pExistLine; | 516 if (pTextColumn->m_Count >= mincount) { |
| 166 break; | 517 continue; |
| 167 } | 518 } |
| 168 if (pExistLine->m_BaseLine < basey) { | 519 delete pTextColumn; |
| 169 break; | 520 m_TextColumns.RemoveAt(i); |
| 170 } | 521 i--; |
| 171 } | 522 } |
| 172 if (pBaseLine == NULL) { | 523 for (i = 0; i < m_BaseLines.GetSize(); i++) { |
| 173 pBaseLine = FX_NEW CTextBaseLine; | 524 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); |
| 174 if (NULL == pBaseLine) { | 525 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j++) { |
| 175 return NULL; | 526 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j); |
| 176 } | 527 if (IsNumber(pTextBox->m_Text)) { |
| 177 pBaseLine->m_BaseLine = basey; | 528 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right); |
| 178 m_BaseLines.InsertAt(i, pBaseLine); | 529 } |
| 179 } | 530 } |
| 180 } | 531 } |
| 181 CFX_WideString text; | 532 } |
| 182 FX_LPCSTR pStr = str; | 533 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) { |
| 183 int len = str.GetLength(), offset = 0; | 534 for (int i = 0; i < m_TextColumns.GetSize(); i++) { |
| 184 while (offset < len) { | 535 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i); |
| 185 FX_DWORD ch = pFont->GetNextChar(pStr, offset); | 536 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) { |
| 186 CFX_WideString unicode_str = pFont->UnicodeFromCharCode(ch); | 537 return pColumn; |
| 187 text += unicode_str; | 538 } |
| 188 } | 539 } |
| 189 pBaseLine->InsertTextBox(leftx, rightx, topy, bottomy, spacew, fontsize_v, t
ext); | 540 return NULL; |
| 190 return pBaseLine; | 541 } |
| 191 } | 542 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) { |
| 192 void CTextPage::WriteOutput(CFX_WideStringArray& lines, int iMinWidth) | 543 } |
| 193 { | 544 CTextBaseLine::CTextBaseLine() { |
| 194 FX_FLOAT lastheight = -1; | 545 m_Top = -100000; |
| 195 FX_FLOAT lastbaseline = -1; | 546 m_Bottom = 100000; |
| 196 FX_FLOAT MinLeftX = 1000000; | 547 m_MaxFontSizeV = 0; |
| 197 FX_FLOAT MaxRightX = 0; | 548 } |
| 198 int i; | 549 CTextBaseLine::~CTextBaseLine() { |
| 199 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | 550 for (int i = 0; i < m_TextList.GetSize(); i++) { |
| 200 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | 551 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); |
| 201 FX_FLOAT leftx, rightx; | 552 delete pText; |
| 202 if (pBaseLine->GetWidth(leftx, rightx)) { | 553 } |
| 203 if (leftx < MinLeftX) { | 554 } |
| 204 MinLeftX = leftx; | 555 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, |
| 205 } | 556 FX_FLOAT rightx, |
| 206 if (rightx > MaxRightX) { | 557 FX_FLOAT topy, |
| 207 MaxRightX = rightx; | 558 FX_FLOAT bottomy, |
| 208 } | 559 FX_FLOAT spacew, |
| 209 } | 560 FX_FLOAT fontsize_v, |
| 210 } | 561 const CFX_WideString& text) { |
| 211 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | 562 if (m_Top < topy) { |
| 212 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | 563 m_Top = topy; |
| 213 pBaseLine->MergeBoxes(); | 564 } |
| 214 } | 565 if (m_Bottom > bottomy) { |
| 215 for (i = 1; i < m_BaseLines.GetSize(); i ++) { | 566 m_Bottom = bottomy; |
| 216 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | 567 } |
| 217 CTextBaseLine* pPrevLine = (CTextBaseLine*)m_BaseLines.GetAt(i - 1); | 568 if (m_MaxFontSizeV < fontsize_v) { |
| 218 if (pBaseLine->CanMerge(pPrevLine)) { | 569 m_MaxFontSizeV = fontsize_v; |
| 219 pPrevLine->Merge(pBaseLine); | 570 } |
| 220 delete pBaseLine; | 571 int i; |
| 221 m_BaseLines.RemoveAt(i); | 572 for (i = 0; i < m_TextList.GetSize(); i++) { |
| 222 i --; | 573 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); |
| 223 } | 574 if (pText->m_Left > leftx) { |
| 224 } | 575 break; |
| 225 if (m_bAutoWidth) { | 576 } |
| 226 int* widths = FX_Alloc(int, m_BaseLines.GetSize()); | 577 } |
| 227 if (widths) { | 578 CTextBox* pText = FX_NEW CTextBox; |
| 228 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | 579 if (NULL == pText) { |
| 229 widths[i] = 0; | 580 return; |
| 230 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | 581 } |
| 231 int TotalChars = 0; | 582 pText->m_Text = text; |
| 232 FX_FLOAT TotalWidth = 0; | 583 pText->m_Left = leftx; |
| 233 int minchars; | 584 pText->m_Right = rightx; |
| 234 pBaseLine->CountChars(TotalChars, TotalWidth, minchars); | 585 pText->m_Top = topy; |
| 235 if (TotalChars) { | 586 pText->m_Bottom = bottomy; |
| 236 FX_FLOAT charwidth = TotalWidth / TotalChars; | 587 pText->m_SpaceWidth = spacew; |
| 237 widths[i] = (int)((MaxRightX - MinLeftX) / charwidth); | 588 pText->m_FontSizeV = fontsize_v; |
| 238 } | 589 pText->m_pColumn = NULL; |
| 239 if (widths[i] > 1000) { | 590 m_TextList.InsertAt(i, pText); |
| 240 widths[i] = 1000; | 591 } |
| 241 } | 592 FX_BOOL GetIntersection(FX_FLOAT low1, |
| 242 if (widths[i] < minchars) { | 593 FX_FLOAT high1, |
| 243 widths[i] = minchars; | 594 FX_FLOAT low2, |
| 244 } | 595 FX_FLOAT high2, |
| 245 } | 596 FX_FLOAT& interlow, |
| 246 int AvgWidth = 0, widthcount = 0; | 597 FX_FLOAT& interhigh); |
| 247 for (i = 0; i < m_BaseLines.GetSize(); i ++) | 598 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) { |
| 248 if (widths[i]) { | 599 FX_FLOAT inter_top, inter_bottom; |
| 249 AvgWidth += widths[i]; | 600 if (!GetIntersection(m_Bottom, |
| 250 widthcount ++; | 601 m_Top, |
| 251 } | 602 pOther->m_Bottom, |
| 252 AvgWidth = int((FX_FLOAT)AvgWidth / widthcount + 0.5); | 603 pOther->m_Top, |
| 253 int MaxWidth = 0; | 604 inter_bottom, |
| 254 for (i = 0; i < m_BaseLines.GetSize(); i ++) | 605 inter_top)) { |
| 255 if (MaxWidth < widths[i]) { | 606 return FALSE; |
| 256 MaxWidth = widths[i]; | 607 } |
| 257 } | 608 FX_FLOAT inter_h = inter_top - inter_bottom; |
| 258 if (MaxWidth > AvgWidth * 6 / 5) { | 609 if (inter_h < (m_Top - m_Bottom) / 2 && |
| 259 MaxWidth = AvgWidth * 6 / 5; | 610 inter_h < (pOther->m_Top - pOther->m_Bottom) / 2) { |
| 260 } | 611 return FALSE; |
| 261 FX_Free(widths); | 612 } |
| 262 if (iMinWidth < MaxWidth) { | 613 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine); |
| 263 iMinWidth = MaxWidth; | 614 for (int i = 0; i < m_TextList.GetSize(); i++) { |
| 264 } | 615 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); |
| 265 } | 616 for (int j = 0; j < pOther->m_TextList.GetSize(); j++) { |
| 266 } | 617 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j); |
| 267 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | 618 FX_FLOAT inter_left, inter_right; |
| 268 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | 619 if (!GetIntersection(pText->m_Left, |
| 269 pBaseLine->MergeBoxes(); | 620 pText->m_Right, |
| 270 } | 621 pOtherText->m_Left, |
| 271 if (m_bKeepColumn) { | 622 pOtherText->m_Right, |
| 272 FindColumns(); | 623 inter_left, |
| 273 } | 624 inter_right)) { |
| 274 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | 625 continue; |
| 275 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | 626 } |
| 276 if (lastheight >= 0) { | 627 FX_FLOAT inter_w = inter_right - inter_left; |
| 277 FX_FLOAT dy = lastbaseline - pBaseLine->m_BaseLine; | 628 if (inter_w < pText->m_SpaceWidth / 2 && |
| 278 if (dy >= (pBaseLine->m_MaxFontSizeV) * 1.5 || dy >= lastheight * 1.
5) { | 629 inter_w < pOtherText->m_SpaceWidth / 2) { |
| 279 lines.Add(L""); | 630 continue; |
| 280 } | 631 } |
| 281 } | 632 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 || |
| 282 lastheight = pBaseLine->m_MaxFontSizeV; | 633 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) { |
| 283 lastbaseline = pBaseLine->m_BaseLine; | 634 return FALSE; |
| 284 CFX_WideString str; | 635 } |
| 285 pBaseLine->WriteOutput(str, MinLeftX, MaxRightX - MinLeftX, iMinWidth); | 636 } |
| 286 lines.Add(str); | 637 } |
| 287 } | 638 return TRUE; |
| 288 } | 639 } |
| 289 void NormalizeCompositeChar(FX_WCHAR wChar, CFX_WideString& sDest) | 640 void CTextBaseLine::Merge(CTextBaseLine* pOther) { |
| 290 { | 641 for (int i = 0; i < pOther->m_TextList.GetSize(); i++) { |
| 291 wChar = FX_GetMirrorChar(wChar, TRUE, FALSE); | 642 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i); |
| 292 FX_LPWSTR pDst = NULL; | 643 InsertTextBox(pText->m_Left, |
| 293 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst); | 644 pText->m_Right, |
| 294 if (nCount < 1 ) { | 645 pText->m_Top, |
| 295 sDest += wChar; | 646 pText->m_Bottom, |
| 296 return; | 647 pText->m_SpaceWidth, |
| 297 } | 648 pText->m_FontSizeV, |
| 298 pDst = new FX_WCHAR[nCount]; | 649 pText->m_Text); |
| 299 FX_Unicode_GetNormalization(wChar, pDst); | 650 } |
| 300 for (int nIndex = 0; nIndex < nCount; nIndex++) { | 651 } |
| 301 sDest += pDst[nIndex]; | 652 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) { |
| 302 } | 653 int i; |
| 303 delete[] pDst; | 654 for (i = 0; i < m_TextList.GetSize(); i++) { |
| 304 } | 655 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); |
| 305 void NormalizeString(CFX_WideString& str) | 656 if (pText->m_Text != L" ") { |
| 306 { | 657 break; |
| 307 if (str.GetLength() <= 0) { | 658 } |
| 308 return; | 659 } |
| 309 } | 660 if (i == m_TextList.GetSize()) { |
| 310 CFX_WideString sBuffer; | 661 return FALSE; |
| 311 IFX_BidiChar* BidiChar = IFX_BidiChar::Create(); | 662 } |
| 312 if (NULL == BidiChar) { | 663 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); |
| 313 return; | 664 leftx = pText->m_Left; |
| 314 } | 665 for (i = m_TextList.GetSize() - 1; i >= 0; i--) { |
| 315 CFX_WordArray order; | 666 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); |
| 316 FX_BOOL bR2L = FALSE; | 667 if (pText->m_Text != L" ") { |
| 317 FX_INT32 start = 0, count = 0, i = 0; | 668 break; |
| 318 int nR2L = 0, nL2R = 0; | 669 } |
| 319 for (i = 0; i < str.GetLength(); i++) { | 670 } |
| 320 if(BidiChar->AppendChar(str.GetAt(i))) { | 671 pText = (CTextBox*)m_TextList.GetAt(i); |
| 321 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); | 672 rightx = pText->m_Right; |
| 322 order.Add(start); | 673 return TRUE; |
| 323 order.Add(count); | 674 } |
| 324 order.Add(ret); | 675 void CTextBaseLine::MergeBoxes() { |
| 325 if(!bR2L) { | 676 int i = 0; |
| 326 if(ret == 2) { | 677 while (1) { |
| 327 nR2L++; | 678 if (i >= m_TextList.GetSize() - 1) { |
| 328 } else if (ret == 1) { | 679 break; |
| 329 nL2R++; | 680 } |
| 330 } | 681 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i); |
| 331 } | 682 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1); |
| 332 } | 683 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; |
| 333 } | 684 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) |
| 334 if(BidiChar->EndChar()) { | 685 ? pNextText->m_SpaceWidth |
| 335 FX_INT32 ret = BidiChar->GetBidiInfo(start, count); | 686 : pThisText->m_SpaceWidth; |
| 336 order.Add(start); | 687 if (spacew > 0.0 && dx < spacew * 2) { |
| 337 order.Add(count); | 688 pThisText->m_Right = pNextText->m_Right; |
| 338 order.Add(ret); | 689 if (dx > spacew * 1.5) { |
| 339 if(!bR2L) { | 690 pThisText->m_Text += L" "; |
| 340 if(ret == 2) { | 691 } else if (dx > spacew / 3) { |
| 341 nR2L++; | 692 pThisText->m_Text += L' '; |
| 342 } else if(ret == 1) { | 693 } |
| 343 nL2R++; | 694 pThisText->m_Text += pNextText->m_Text; |
| 344 } | 695 pThisText->m_SpaceWidth = |
| 345 } | 696 pNextText->m_SpaceWidth == 0.0 ? spacew : pNextText->m_SpaceWidth; |
| 346 } | 697 m_TextList.RemoveAt(i + 1); |
| 347 if(nR2L > 0 && nR2L >= nL2R) { | 698 delete pNextText; |
| 348 bR2L = TRUE; | |
| 349 } | |
| 350 if(bR2L) { | |
| 351 int count = order.GetSize(); | |
| 352 for(int j = count - 1; j > 0; j -= 3) { | |
| 353 int ret = order.GetAt(j); | |
| 354 int start = order.GetAt(j - 2); | |
| 355 int count1 = order.GetAt(j - 1); | |
| 356 if(ret == 2 || ret == 0) { | |
| 357 for(int i = start + count1 - 1; i >= start; i--) { | |
| 358 NormalizeCompositeChar(str[i], sBuffer); | |
| 359 } | |
| 360 } else { | |
| 361 i = j; | |
| 362 FX_BOOL bSymbol = FALSE; | |
| 363 while(i > 0 && order.GetAt(i) != 2) { | |
| 364 bSymbol = !order.GetAt(i); | |
| 365 i -= 3; | |
| 366 } | |
| 367 int end = start + count1 ; | |
| 368 int n = 0; | |
| 369 if(bSymbol) { | |
| 370 n = i + 6; | |
| 371 } else { | |
| 372 n = i + 3; | |
| 373 } | |
| 374 if(n >= j) { | |
| 375 for(int m = start; m < end; m++) { | |
| 376 sBuffer += str[m]; | |
| 377 } | |
| 378 } else { | |
| 379 i = j; | |
| 380 j = n; | |
| 381 for(; n <= i; n += 3) { | |
| 382 int start = order.GetAt(n - 2); | |
| 383 int count1 = order.GetAt(n - 1); | |
| 384 int end = start + count1 ; | |
| 385 for(int m = start; m < end; m++) { | |
| 386 sBuffer += str[m]; | |
| 387 } | |
| 388 } | |
| 389 } | |
| 390 } | |
| 391 } | |
| 392 } else { | 699 } else { |
| 393 int count = order.GetSize(); | 700 i++; |
| 394 FX_BOOL bL2R = FALSE; | 701 } |
| 395 for(int j = 0; j < count; j += 3) { | 702 } |
| 396 int ret = order.GetAt(j + 2); | 703 } |
| 397 int start = order.GetAt(j); | 704 void CTextBaseLine::WriteOutput(CFX_WideString& str, |
| 398 int count1 = order.GetAt(j + 1); | 705 FX_FLOAT leftx, |
| 399 if(ret == 2 || (j == 0 && ret == 0 && !bL2R)) { | 706 FX_FLOAT pagewidth, |
| 400 int i = j + 3; | 707 int iTextWidth) { |
| 401 while(bR2L && i < count) { | 708 int lastpos = -1; |
| 402 if(order.GetAt(i + 2) == 1) { | 709 for (int i = 0; i < m_TextList.GetSize(); i++) { |
| 403 break; | |
| 404 } else { | |
| 405 i += 3; | |
| 406 } | |
| 407 } | |
| 408 if(i == 3) { | |
| 409 j = -3; | |
| 410 bL2R = TRUE; | |
| 411 continue; | |
| 412 } | |
| 413 int end = str.GetLength() - 1; | |
| 414 if(i < count) { | |
| 415 end = order.GetAt(i) - 1; | |
| 416 } | |
| 417 j = i - 3; | |
| 418 for(int n = end; n >= start; n--) { | |
| 419 NormalizeCompositeChar(str[i], sBuffer); | |
| 420 } | |
| 421 } else { | |
| 422 int end = start + count1 ; | |
| 423 for(int i = start; i < end; i++) { | |
| 424 sBuffer += str[i]; | |
| 425 } | |
| 426 } | |
| 427 } | |
| 428 } | |
| 429 str.Empty(); | |
| 430 str += sBuffer; | |
| 431 BidiChar->Release(); | |
| 432 } | |
| 433 static FX_BOOL IsNumber(CFX_WideString& str) | |
| 434 { | |
| 435 for (int i = 0; i < str.GetLength(); i ++) { | |
| 436 FX_WCHAR ch = str[i]; | |
| 437 if ((ch < '0' || ch > '9') && ch != '-' && ch != '+' && ch != '.' && ch
!= ' ') { | |
| 438 return FALSE; | |
| 439 } | |
| 440 } | |
| 441 return TRUE; | |
| 442 } | |
| 443 void CTextPage::FindColumns() | |
| 444 { | |
| 445 int i; | |
| 446 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | |
| 447 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | |
| 448 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) { | |
| 449 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j); | |
| 450 CTextColumn* pColumn = FindColumn(pTextBox->m_Right); | |
| 451 if (pColumn == NULL) { | |
| 452 pColumn = FX_NEW CTextColumn; | |
| 453 if (pColumn) { | |
| 454 pColumn->m_Count = 1; | |
| 455 pColumn->m_AvgPos = pTextBox->m_Right; | |
| 456 pColumn->m_TextPos = -1; | |
| 457 m_TextColumns.Add(pColumn); | |
| 458 } | |
| 459 } else { | |
| 460 pColumn->m_AvgPos = (pColumn->m_Count * pColumn->m_AvgPos + pTex
tBox->m_Right) / | |
| 461 (pColumn->m_Count + 1); | |
| 462 pColumn->m_Count ++; | |
| 463 } | |
| 464 } | |
| 465 } | |
| 466 int mincount = m_BaseLines.GetSize() / 4; | |
| 467 for (i = 0; i < m_TextColumns.GetSize(); i ++) { | |
| 468 CTextColumn* pTextColumn = (CTextColumn*)m_TextColumns.GetAt(i); | |
| 469 if (pTextColumn->m_Count >= mincount) { | |
| 470 continue; | |
| 471 } | |
| 472 delete pTextColumn; | |
| 473 m_TextColumns.RemoveAt(i); | |
| 474 i --; | |
| 475 } | |
| 476 for (i = 0; i < m_BaseLines.GetSize(); i ++) { | |
| 477 CTextBaseLine* pBaseLine = (CTextBaseLine*)m_BaseLines.GetAt(i); | |
| 478 for (int j = 0; j < pBaseLine->m_TextList.GetSize(); j ++) { | |
| 479 CTextBox* pTextBox = (CTextBox*)pBaseLine->m_TextList.GetAt(j); | |
| 480 if (IsNumber(pTextBox->m_Text)) { | |
| 481 pTextBox->m_pColumn = FindColumn(pTextBox->m_Right); | |
| 482 } | |
| 483 } | |
| 484 } | |
| 485 } | |
| 486 CTextColumn* CTextPage::FindColumn(FX_FLOAT xpos) | |
| 487 { | |
| 488 for (int i = 0; i < m_TextColumns.GetSize(); i ++) { | |
| 489 CTextColumn* pColumn = (CTextColumn*)m_TextColumns.GetAt(i); | |
| 490 if (pColumn->m_AvgPos < xpos + 1 && pColumn->m_AvgPos > xpos - 1) { | |
| 491 return pColumn; | |
| 492 } | |
| 493 } | |
| 494 return NULL; | |
| 495 } | |
| 496 void CTextPage::BreakSpace(CPDF_TextObject* pTextObj) | |
| 497 { | |
| 498 } | |
| 499 CTextBaseLine::CTextBaseLine() | |
| 500 { | |
| 501 m_Top = -100000; | |
| 502 m_Bottom = 100000; | |
| 503 m_MaxFontSizeV = 0; | |
| 504 } | |
| 505 CTextBaseLine::~CTextBaseLine() | |
| 506 { | |
| 507 for (int i = 0; i < m_TextList.GetSize(); i ++) { | |
| 508 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); | |
| 509 delete pText; | |
| 510 } | |
| 511 } | |
| 512 void CTextBaseLine::InsertTextBox(FX_FLOAT leftx, FX_FLOAT rightx, FX_FLOAT topy
, FX_FLOAT bottomy, | |
| 513 FX_FLOAT spacew, FX_FLOAT fontsize_v, const CF
X_WideString& text) | |
| 514 { | |
| 515 if (m_Top < topy) { | |
| 516 m_Top = topy; | |
| 517 } | |
| 518 if (m_Bottom > bottomy) { | |
| 519 m_Bottom = bottomy; | |
| 520 } | |
| 521 if (m_MaxFontSizeV < fontsize_v) { | |
| 522 m_MaxFontSizeV = fontsize_v; | |
| 523 } | |
| 524 int i; | |
| 525 for (i = 0; i < m_TextList.GetSize(); i ++) { | |
| 526 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); | |
| 527 if (pText->m_Left > leftx) { | |
| 528 break; | |
| 529 } | |
| 530 } | |
| 531 CTextBox* pText = FX_NEW CTextBox; | |
| 532 if (NULL == pText) { | |
| 533 return; | |
| 534 } | |
| 535 pText->m_Text = text; | |
| 536 pText->m_Left = leftx; | |
| 537 pText->m_Right = rightx; | |
| 538 pText->m_Top = topy; | |
| 539 pText->m_Bottom = bottomy; | |
| 540 pText->m_SpaceWidth = spacew; | |
| 541 pText->m_FontSizeV = fontsize_v; | |
| 542 pText->m_pColumn = NULL; | |
| 543 m_TextList.InsertAt(i, pText); | |
| 544 } | |
| 545 FX_BOOL GetIntersection(FX_FLOAT low1, FX_FLOAT high1, FX_FLOAT low2, FX_FLOAT h
igh2, | |
| 546 FX_FLOAT& interlow, FX_FLOAT& interhigh); | |
| 547 FX_BOOL CTextBaseLine::CanMerge(CTextBaseLine* pOther) | |
| 548 { | |
| 549 FX_FLOAT inter_top, inter_bottom; | |
| 550 if (!GetIntersection(m_Bottom, m_Top, pOther->m_Bottom, pOther->m_Top, | |
| 551 inter_bottom, inter_top)) { | |
| 552 return FALSE; | |
| 553 } | |
| 554 FX_FLOAT inter_h = inter_top - inter_bottom; | |
| 555 if (inter_h < (m_Top - m_Bottom) / 2 && inter_h < (pOther->m_Top - pOther->m
_Bottom) / 2) { | |
| 556 return FALSE; | |
| 557 } | |
| 558 FX_FLOAT dy = (FX_FLOAT)FXSYS_fabs(m_BaseLine - pOther->m_BaseLine); | |
| 559 for (int i = 0; i < m_TextList.GetSize(); i ++) { | |
| 560 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); | |
| 561 for (int j = 0; j < pOther->m_TextList.GetSize(); j ++) { | |
| 562 CTextBox* pOtherText = (CTextBox*)pOther->m_TextList.GetAt(j); | |
| 563 FX_FLOAT inter_left, inter_right; | |
| 564 if (!GetIntersection(pText->m_Left, pText->m_Right, | |
| 565 pOtherText->m_Left, pOtherText->m_Right, inter_
left, inter_right)) { | |
| 566 continue; | |
| 567 } | |
| 568 FX_FLOAT inter_w = inter_right - inter_left; | |
| 569 if (inter_w < pText->m_SpaceWidth / 2 && inter_w < pOtherText->m_Spa
ceWidth / 2) { | |
| 570 continue; | |
| 571 } | |
| 572 if (dy >= (pText->m_Bottom - pText->m_Top) / 2 || | |
| 573 dy >= (pOtherText->m_Bottom - pOtherText->m_Top) / 2) { | |
| 574 return FALSE; | |
| 575 } | |
| 576 } | |
| 577 } | |
| 578 return TRUE; | |
| 579 } | |
| 580 void CTextBaseLine::Merge(CTextBaseLine* pOther) | |
| 581 { | |
| 582 for (int i = 0; i < pOther->m_TextList.GetSize(); i ++) { | |
| 583 CTextBox* pText = (CTextBox*)pOther->m_TextList.GetAt(i); | |
| 584 InsertTextBox(pText->m_Left, pText->m_Right, pText->m_Top, pText->m_Bott
om, | |
| 585 pText->m_SpaceWidth, pText->m_FontSizeV, pText->m_Text); | |
| 586 } | |
| 587 } | |
| 588 FX_BOOL CTextBaseLine::GetWidth(FX_FLOAT& leftx, FX_FLOAT& rightx) | |
| 589 { | |
| 590 int i; | |
| 591 for (i = 0; i < m_TextList.GetSize(); i ++) { | |
| 592 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); | |
| 593 if (pText->m_Text != L" ") { | |
| 594 break; | |
| 595 } | |
| 596 } | |
| 597 if (i == m_TextList.GetSize()) { | |
| 598 return FALSE; | |
| 599 } | |
| 600 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); | 710 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); |
| 601 leftx = pText->m_Left; | 711 int xpos; |
| 602 for (i = m_TextList.GetSize() - 1; i >= 0; i --) { | 712 if (pText->m_pColumn) { |
| 603 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); | 713 xpos = |
| 604 if (pText->m_Text != L" ") { | 714 (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pagewidth + |
| 605 break; | 715 0.5); |
| 606 } | 716 xpos -= pText->m_Text.GetLength(); |
| 607 } | 717 } else { |
| 608 pText = (CTextBox*)m_TextList.GetAt(i); | 718 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5); |
| 609 rightx = pText->m_Right; | 719 } |
| 610 return TRUE; | 720 if (xpos <= lastpos) { |
| 611 } | 721 xpos = lastpos + 1; |
| 612 void CTextBaseLine::MergeBoxes() | 722 } |
| 613 { | 723 for (int j = lastpos + 1; j < xpos; j++) { |
| 614 int i = 0; | 724 str += ' '; |
| 615 while (1) { | 725 } |
| 616 if (i >= m_TextList.GetSize() - 1) { | 726 CFX_WideString sSrc(pText->m_Text); |
| 617 break; | 727 NormalizeString(sSrc); |
| 618 } | 728 str += sSrc; |
| 619 CTextBox* pThisText = (CTextBox*)m_TextList.GetAt(i); | 729 str += ' '; |
| 620 CTextBox* pNextText = (CTextBox*)m_TextList.GetAt(i + 1); | 730 lastpos = xpos + pText->m_Text.GetLength(); |
| 621 FX_FLOAT dx = pNextText->m_Left - pThisText->m_Right; | 731 } |
| 622 FX_FLOAT spacew = (pThisText->m_SpaceWidth == 0.0) ? | 732 } |
| 623 pNextText->m_SpaceWidth : pThisText->m_SpaceWidth; | 733 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) { |
| 624 if (spacew > 0.0 && dx < spacew * 2) { | 734 minchars = 0; |
| 625 pThisText->m_Right = pNextText->m_Right; | 735 for (int i = 0; i < m_TextList.GetSize(); i++) { |
| 626 if (dx > spacew * 1.5) { | 736 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); |
| 627 pThisText->m_Text += L" "; | 737 if (pText->m_Right - pText->m_Left < 0.002) { |
| 628 } else if (dx > spacew / 3) { | 738 continue; |
| 629 pThisText->m_Text += L' '; | 739 } |
| 630 } | 740 count += pText->m_Text.GetLength(); |
| 631 pThisText->m_Text += pNextText->m_Text; | 741 width += pText->m_Right - pText->m_Left; |
| 632 pThisText->m_SpaceWidth = pNextText->m_SpaceWidth == 0.0 ? | 742 minchars += pText->m_Text.GetLength() + 1; |
| 633 spacew : pNextText->m_SpaceWidth; | 743 } |
| 634 m_TextList.RemoveAt(i + 1); | |
| 635 delete pNextText; | |
| 636 } else { | |
| 637 i ++; | |
| 638 } | |
| 639 } | |
| 640 } | |
| 641 void CTextBaseLine::WriteOutput(CFX_WideString& str, FX_FLOAT leftx, FX_FLOAT pa
gewidth, | |
| 642 int iTextWidth) | |
| 643 { | |
| 644 int lastpos = -1; | |
| 645 for (int i = 0; i < m_TextList.GetSize(); i ++) { | |
| 646 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); | |
| 647 int xpos; | |
| 648 if (pText->m_pColumn) { | |
| 649 xpos = (int)((pText->m_pColumn->m_AvgPos - leftx) * iTextWidth / pag
ewidth + 0.5); | |
| 650 xpos -= pText->m_Text.GetLength(); | |
| 651 } else { | |
| 652 xpos = (int)((pText->m_Left - leftx) * iTextWidth / pagewidth + 0.5)
; | |
| 653 } | |
| 654 if (xpos <= lastpos) { | |
| 655 xpos = lastpos + 1; | |
| 656 } | |
| 657 for (int j = lastpos + 1; j < xpos; j ++) { | |
| 658 str += ' '; | |
| 659 } | |
| 660 CFX_WideString sSrc(pText->m_Text); | |
| 661 NormalizeString(sSrc); | |
| 662 str += sSrc; | |
| 663 str += ' '; | |
| 664 lastpos = xpos + pText->m_Text.GetLength(); | |
| 665 } | |
| 666 } | |
| 667 void CTextBaseLine::CountChars(int& count, FX_FLOAT& width, int& minchars) | |
| 668 { | |
| 669 minchars = 0; | |
| 670 for (int i = 0; i < m_TextList.GetSize(); i ++) { | |
| 671 CTextBox* pText = (CTextBox*)m_TextList.GetAt(i); | |
| 672 if (pText->m_Right - pText->m_Left < 0.002) { | |
| 673 continue; | |
| 674 } | |
| 675 count += pText->m_Text.GetLength(); | |
| 676 width += pText->m_Right - pText->m_Left; | |
| 677 minchars += pText->m_Text.GetLength() + 1; | |
| 678 } | |
| 679 } | 744 } |
| 680 #define PI 3.1415926535897932384626433832795 | 745 #define PI 3.1415926535897932384626433832795 |
| 681 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) | 746 static void CheckRotate(CPDF_Page& page, CFX_FloatRect& page_bbox) { |
| 682 { | 747 int total_count = 0, rotated_count[3] = { 0, 0, 0 }; |
| 683 int total_count = 0, rotated_count[3] = {0, 0, 0}; | 748 FX_POSITION pos = page.GetFirstObjectPosition(); |
| 684 FX_POSITION pos = page.GetFirstObjectPosition(); | 749 while (pos) { |
| 685 while (pos) { | 750 CPDF_PageObject* pObj = page.GetNextObject(pos); |
| 686 CPDF_PageObject* pObj = page.GetNextObject(pos); | 751 if (pObj->m_Type != PDFPAGE_TEXT) { |
| 687 if (pObj->m_Type != PDFPAGE_TEXT) { | 752 continue; |
| 688 continue; | 753 } |
| 689 } | 754 total_count++; |
| 690 total_count ++; | 755 CPDF_TextObject* pText = (CPDF_TextObject*)pObj; |
| 691 CPDF_TextObject* pText = (CPDF_TextObject*)pObj; | 756 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); |
| 692 FX_FLOAT angle = pText->m_TextState.GetBaselineAngle(); | 757 if (angle == 0.0) { |
| 693 if (angle == 0.0) { | 758 continue; |
| 694 continue; | 759 } |
| 695 } | 760 int degree = (int)(angle * 180 / PI + 0.5); |
| 696 int degree = (int)(angle * 180 / PI + 0.5); | 761 if (degree % 90) { |
| 697 if (degree % 90) { | 762 continue; |
| 698 continue; | 763 } |
| 699 } | 764 if (degree < 0) { |
| 700 if (degree < 0) { | 765 degree += 360; |
| 701 degree += 360; | 766 } |
| 702 } | 767 int index = degree / 90 % 3 - 1; |
| 703 int index = degree / 90 % 3 - 1; | 768 if (index < 0) { |
| 704 if (index < 0) { | 769 continue; |
| 705 continue; | 770 } |
| 706 } | 771 rotated_count[index]++; |
| 707 rotated_count[index] ++; | 772 } |
| 708 } | 773 if (total_count == 0) { |
| 709 if (total_count == 0) { | 774 return; |
| 710 return; | 775 } |
| 711 } | 776 CFX_AffineMatrix matrix; |
| 712 CFX_AffineMatrix matrix; | 777 if (rotated_count[0] > total_count * 2 / 3) { |
| 713 if (rotated_count[0] > total_count * 2 / 3) { | 778 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); |
| 714 matrix.Set(0, -1, 1, 0, 0, page.GetPageHeight()); | 779 } else if (rotated_count[1] > total_count * 2 / 3) { |
| 715 } else if (rotated_count[1] > total_count * 2 / 3) { | 780 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); |
| 716 matrix.Set(-1, 0, 0, -1, page.GetPageWidth(), page.GetPageHeight()); | 781 } else if (rotated_count[2] > total_count * 2 / 3) { |
| 717 } else if (rotated_count[2] > total_count * 2 / 3) { | 782 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); |
| 718 matrix.Set(0, 1, -1, 0, page.GetPageWidth(), 0); | 783 } else { |
| 719 } else { | 784 return; |
| 720 return; | 785 } |
| 721 } | 786 page.Transform(matrix); |
| 722 page.Transform(matrix); | 787 page_bbox.Transform(&matrix); |
| 723 page_bbox.Transform(&matrix); | 788 } |
| 724 } | 789 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, |
| 725 void PDF_GetPageText_Unicode(CFX_WideStringArray& lines, CPDF_Document* pDoc, CP
DF_Dictionary* pPage, | 790 CPDF_Document* pDoc, |
| 726 int iMinWidth, FX_DWORD flags) | 791 CPDF_Dictionary* pPage, |
| 727 { | 792 int iMinWidth, |
| 728 lines.RemoveAll(); | 793 FX_DWORD flags) { |
| 729 if (pPage == NULL) { | 794 lines.RemoveAll(); |
| 730 return; | 795 if (pPage == NULL) { |
| 731 } | 796 return; |
| 732 CPDF_Page page; | 797 } |
| 733 page.Load(pDoc, pPage); | 798 CPDF_Page page; |
| 734 CPDF_ParseOptions options; | 799 page.Load(pDoc, pPage); |
| 735 options.m_bTextOnly = TRUE; | 800 CPDF_ParseOptions options; |
| 736 options.m_bSeparateForm = FALSE; | 801 options.m_bTextOnly = TRUE; |
| 737 page.ParseContent(&options); | 802 options.m_bSeparateForm = FALSE; |
| 738 CFX_FloatRect page_bbox = page.GetPageBBox(); | 803 page.ParseContent(&options); |
| 739 if (flags & PDF2TXT_AUTO_ROTATE) { | 804 CFX_FloatRect page_bbox = page.GetPageBBox(); |
| 740 CheckRotate(page, page_bbox); | 805 if (flags & PDF2TXT_AUTO_ROTATE) { |
| 741 } | 806 CheckRotate(page, page_bbox); |
| 742 CTextPage texts; | 807 } |
| 743 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; | 808 CTextPage texts; |
| 744 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; | 809 texts.m_bAutoWidth = flags & PDF2TXT_AUTO_WIDTH; |
| 745 texts.m_bBreakSpace = TRUE; | 810 texts.m_bKeepColumn = flags & PDF2TXT_KEEP_COLUMN; |
| 746 FX_POSITION pos = page.GetFirstObjectPosition(); | 811 texts.m_bBreakSpace = TRUE; |
| 747 while (pos) { | 812 FX_POSITION pos = page.GetFirstObjectPosition(); |
| 748 CPDF_PageObject* pObject = page.GetNextObject(pos); | 813 while (pos) { |
| 749 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { | 814 CPDF_PageObject* pObject = page.GetNextObject(pos); |
| 750 CFX_FloatRect rect(pObject->m_Left, pObject->m_Bottom, pObject->m_Ri
ght, pObject->m_Top); | 815 if (!(flags & PDF2TXT_INCLUDE_INVISIBLE)) { |
| 751 if (!page_bbox.Contains(rect)) { | 816 CFX_FloatRect rect( |
| 752 continue; | 817 pObject->m_Left, pObject->m_Bottom, pObject->m_Right, pObject->m_Top); |
| 753 } | 818 if (!page_bbox.Contains(rect)) { |
| 754 } | 819 continue; |
| 755 texts.ProcessObject(pObject); | 820 } |
| 756 } | 821 } |
| 757 texts.WriteOutput(lines, iMinWidth); | 822 texts.ProcessObject(pObject); |
| 758 } | 823 } |
| 759 void PDF_GetPageText(CFX_ByteStringArray& lines, CPDF_Document* pDoc, CPDF_Dicti
onary* pPage, | 824 texts.WriteOutput(lines, iMinWidth); |
| 760 int iMinWidth, FX_DWORD flags) | 825 } |
| 761 { | 826 void PDF_GetPageText(CFX_ByteStringArray& lines, |
| 762 lines.RemoveAll(); | 827 CPDF_Document* pDoc, |
| 763 CFX_WideStringArray wlines; | 828 CPDF_Dictionary* pPage, |
| 764 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); | 829 int iMinWidth, |
| 765 for (int i = 0; i < wlines.GetSize(); i ++) { | 830 FX_DWORD flags) { |
| 766 CFX_WideString wstr = wlines[i]; | 831 lines.RemoveAll(); |
| 767 CFX_ByteString str; | 832 CFX_WideStringArray wlines; |
| 768 for (int c = 0; c < wstr.GetLength(); c ++) { | 833 PDF_GetPageText_Unicode(wlines, pDoc, pPage, iMinWidth, flags); |
| 769 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); | 834 for (int i = 0; i < wlines.GetSize(); i++) { |
| 770 } | 835 CFX_WideString wstr = wlines[i]; |
| 771 lines.Add(str); | 836 CFX_ByteString str; |
| 772 } | 837 for (int c = 0; c < wstr.GetLength(); c++) { |
| 838 str += CharFromUnicodeAlt(wstr[c], FXSYS_GetACP(), "?"); |
| 839 } |
| 840 lines.Add(str); |
| 841 } |
| 773 } | 842 } |
| 774 #endif | 843 #endif |
| 775 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_PageObjects
* pPage, FX_BOOL bUseLF, | 844 extern void _PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, |
| 845 CPDF_PageObjects* pPage, |
| 846 FX_BOOL bUseLF, |
| 776 CFX_PtrArray* pObjArray); | 847 CFX_PtrArray* pObjArray); |
| 777 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, CPDF_Document* pDoc, CPD
F_Dictionary* pPage, FX_DWORD flags) | 848 void PDF_GetTextStream_Unicode(CFX_WideTextBuf& buffer, |
| 778 { | 849 CPDF_Document* pDoc, |
| 779 buffer.EstimateSize(0, 10240); | 850 CPDF_Dictionary* pPage, |
| 780 CPDF_Page page; | 851 FX_DWORD flags) { |
| 781 page.Load(pDoc, pPage); | 852 buffer.EstimateSize(0, 10240); |
| 782 CPDF_ParseOptions options; | 853 CPDF_Page page; |
| 783 options.m_bTextOnly = TRUE; | 854 page.Load(pDoc, pPage); |
| 784 options.m_bSeparateForm = FALSE; | 855 CPDF_ParseOptions options; |
| 785 page.ParseContent(&options); | 856 options.m_bTextOnly = TRUE; |
| 786 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL); | 857 options.m_bSeparateForm = FALSE; |
| 787 } | 858 page.ParseContent(&options); |
| 859 _PDF_GetTextStream_Unicode(buffer, &page, TRUE, NULL); |
| 860 } |
| OLD | NEW |