Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(613)

Side by Side Diff: core/src/fpdftext/fpdf_text_int.cpp

Issue 1265503005: clang-format all pdfium code. (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master
Patch Set: sigh Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 PDFium Authors. All rights reserved. 1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 6
7 #include <ctype.h> 7 #include <ctype.h>
8 #include <algorithm> 8 #include <algorithm>
9 9
10 #include "../../../third_party/base/nonstd_unique_ptr.h" 10 #include "../../../third_party/base/nonstd_unique_ptr.h"
11 #include "../../include/fpdfapi/fpdf_module.h" 11 #include "../../include/fpdfapi/fpdf_module.h"
12 #include "../../include/fpdfapi/fpdf_page.h" 12 #include "../../include/fpdfapi/fpdf_page.h"
13 #include "../../include/fpdfapi/fpdf_pageobj.h" 13 #include "../../include/fpdfapi/fpdf_pageobj.h"
14 #include "../../include/fpdfapi/fpdf_resource.h" 14 #include "../../include/fpdfapi/fpdf_resource.h"
15 #include "../../include/fpdftext/fpdf_text.h" 15 #include "../../include/fpdftext/fpdf_text.h"
16 #include "../../include/fxcrt/fx_arb.h" 16 #include "../../include/fxcrt/fx_arb.h"
17 #include "../../include/fxcrt/fx_ucd.h" 17 #include "../../include/fxcrt/fx_ucd.h"
18 #include "text_int.h" 18 #include "text_int.h"
19 19
20 namespace { 20 namespace {
21 21
22 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) 22 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
23 { 23 if (curChar < 255) {
24 if(curChar < 255 ) { 24 return FALSE;
25 return FALSE; 25 }
26 } 26 if ((curChar >= 0x0600 && curChar <= 0x06FF) ||
27 if ( (curChar >= 0x0600 && curChar <= 0x06FF) 27 (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
28 || (curChar >= 0xFE70 && curChar <= 0xFEFF) 28 (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
29 || (curChar >= 0xFB50 && curChar <= 0xFDFF) 29 (curChar >= 0x0400 && curChar <= 0x04FF) ||
30 || (curChar >= 0x0400 && curChar <= 0x04FF) 30 (curChar >= 0x0500 && curChar <= 0x052F) ||
31 || (curChar >= 0x0500 && curChar <= 0x052F) 31 (curChar >= 0xA640 && curChar <= 0xA69F) ||
32 || (curChar >= 0xA640 && curChar <= 0xA69F) 32 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
33 || (curChar >= 0x2DE0 && curChar <= 0x2DFF) 33 (curChar >= 0x2000 && curChar <= 0x206F)) {
34 || curChar == 8467 34 return FALSE;
35 || (curChar >= 0x2000 && curChar <= 0x206F)) { 35 }
36 return FALSE; 36 return TRUE;
37 }
38 return TRUE;
39 } 37 }
40 38
41 FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) 39 FX_FLOAT _NormalizeThreshold(FX_FLOAT threshold) {
42 { 40 if (threshold < 300) {
43 if (threshold < 300) { 41 return threshold / 2.0f;
44 return threshold / 2.0f; 42 }
45 } 43 if (threshold < 500) {
46 if (threshold < 500) { 44 return threshold / 4.0f;
47 return threshold / 4.0f; 45 }
48 } 46 if (threshold < 700) {
49 if (threshold < 700) { 47 return threshold / 5.0f;
50 return threshold / 5.0f; 48 }
51 } 49 return threshold / 6.0f;
52 return threshold / 6.0f;
53 } 50 }
54 51
55 FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj, 52 FX_FLOAT _CalculateBaseSpace(const CPDF_TextObject* pTextObj,
56 const CFX_AffineMatrix& matrix) 53 const CFX_AffineMatrix& matrix) {
57 { 54 FX_FLOAT baseSpace = 0.0;
58 FX_FLOAT baseSpace = 0.0; 55 const int nItems = pTextObj->CountItems();
59 const int nItems = pTextObj->CountItems(); 56 if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) {
60 if (pTextObj->m_TextState.GetObject()->m_CharSpace && nItems >= 3) { 57 FX_BOOL bAllChar = TRUE;
61 FX_BOOL bAllChar = TRUE; 58 FX_FLOAT spacing = matrix.TransformDistance(
62 FX_FLOAT spacing = matrix.TransformDistance( 59 pTextObj->m_TextState.GetObject()->m_CharSpace);
63 pTextObj->m_TextState.GetObject()->m_CharSpace); 60 baseSpace = spacing;
64 baseSpace = spacing; 61 for (int i = 0; i < nItems; i++) {
65 for (int i = 0; i < nItems; i++) { 62 CPDF_TextObjectItem item;
66 CPDF_TextObjectItem item; 63 pTextObj->GetItemInfo(i, &item);
67 pTextObj->GetItemInfo(i, &item); 64 if (item.m_CharCode == (FX_DWORD)-1) {
68 if (item.m_CharCode == (FX_DWORD) - 1) { 65 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
69 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 66 FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000;
70 FX_FLOAT kerning = -fontsize_h * item.m_OriginX / 1000; 67 baseSpace = std::min(baseSpace, kerning + spacing);
71 baseSpace = std::min(baseSpace, kerning + spacing); 68 bAllChar = FALSE;
72 bAllChar = FALSE; 69 }
73 }
74 }
75 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) {
76 baseSpace = 0.0;
77 }
78 } 70 }
79 return baseSpace; 71 if (baseSpace < 0.0 || (nItems == 3 && !bAllChar)) {
72 baseSpace = 0.0;
73 }
74 }
75 return baseSpace;
80 } 76 }
81 77
82 } // namespace 78 } // namespace
83 79
84 CPDFText_ParseOptions::CPDFText_ParseOptions() 80 CPDFText_ParseOptions::CPDFText_ParseOptions()
85 : m_bGetCharCodeOnly(FALSE), m_bNormalizeObjs(TRUE), m_bOutputHyphen(FALSE) 81 : m_bGetCharCodeOnly(FALSE),
86 { 82 m_bNormalizeObjs(TRUE),
83 m_bOutputHyphen(FALSE) {}
84 IPDF_TextPage* IPDF_TextPage::CreateTextPage(
85 const CPDF_Page* pPage,
86 CPDFText_ParseOptions ParserOptions) {
87 return new CPDF_TextPage(pPage, ParserOptions);
87 } 88 }
88 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, CPDFText_Pa rseOptions ParserOptions) 89 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage,
89 { 90 int flags) {
90 return new CPDF_TextPage(pPage, ParserOptions); 91 return new CPDF_TextPage(pPage, flags);
91 } 92 }
92 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_Page* pPage, int flags) 93 IPDF_TextPage* IPDF_TextPage::CreateTextPage(const CPDF_PageObjects* pObjs,
93 { 94 int flags) {
94 return new CPDF_TextPage(pPage, flags); 95 return new CPDF_TextPage(pObjs, flags);
95 } 96 }
96 IPDF_TextPage*» IPDF_TextPage::CreateTextPage(const CPDF_PageObjects* pObjs, int flags) 97 IPDF_TextPageFind* IPDF_TextPageFind::CreatePageFind(
97 { 98 const IPDF_TextPage* pTextPage) {
98 return new CPDF_TextPage(pObjs, flags); 99 if (!pTextPage) {
100 return NULL;
101 }
102 return new CPDF_TextPageFind(pTextPage);
99 } 103 }
100 IPDF_TextPageFind*» IPDF_TextPageFind::CreatePageFind(const IPDF_TextPage* p TextPage) 104 IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() {
101 { 105 return new CPDF_LinkExtract();
102 if (!pTextPage) {
103 return NULL;
104 }
105 return new CPDF_TextPageFind(pTextPage);
106 } 106 }
107 IPDF_LinkExtract* IPDF_LinkExtract::CreateLinkExtract() 107 #define TEXT_BLANK_CHAR L' '
108 { 108 #define TEXT_LINEFEED_CHAR L'\n'
109 return new CPDF_LinkExtract(); 109 #define TEXT_RETURN_CHAR L'\r'
110 } 110 #define TEXT_EMPTY L""
111 #define TEXT_BLANK_CHAR» » L' ' 111 #define TEXT_BLANK L" "
112 #define TEXT_LINEFEED_CHAR» » L'\n' 112 #define TEXT_RETURN_LINEFEED L"\r\n"
113 #define» TEXT_RETURN_CHAR» » L'\r' 113 #define TEXT_LINEFEED L"\n"
114 #define TEXT_EMPTY» » » » L"" 114 #define TEXT_CHARRATIO_GAPDELTA 0.070
115 #define TEXT_BLANK» » » » L" "
116 #define TEXT_RETURN_LINEFEED» L"\r\n"
117 #define TEXT_LINEFEED» » » L"\n"
118 #define» TEXT_CHARRATIO_GAPDELTA» 0.070
119 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags) 115 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, int flags)
120 : m_charList(512), 116 : m_charList(512),
121 m_TempCharList(50), 117 m_TempCharList(50),
122 m_pPreTextObj(NULL), 118 m_pPreTextObj(NULL),
123 m_IsParsered(FALSE), 119 m_IsParsered(FALSE),
124 m_TextlineDir(-1), 120 m_TextlineDir(-1),
125 m_CurlineRect(0, 0, 0, 0) 121 m_CurlineRect(0, 0, 0, 0) {
126 { 122 m_pPage = pPage;
127 m_pPage = pPage; 123 m_parserflag = flags;
128 m_parserflag = flags; 124 m_TextBuf.EstimateSize(0, 10240);
129 m_TextBuf.EstimateSize(0, 10240); 125 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(),
130 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int) pPage->GetPageWidth(), (int)pPage->GetPageHeight(), 0); 126 (int)pPage->GetPageHeight(), 0);
131 } 127 }
132 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage, CPDFText_ParseOptions Parse rOptions) 128 CPDF_TextPage::CPDF_TextPage(const CPDF_Page* pPage,
133 : m_ParseOptions(ParserOptions) 129 CPDFText_ParseOptions ParserOptions)
134 , m_charList(512) 130 : m_ParseOptions(ParserOptions),
135 , m_TempCharList(50) 131 m_charList(512),
136 , m_pPreTextObj(NULL) 132 m_TempCharList(50),
137 , m_IsParsered(FALSE) 133 m_pPreTextObj(NULL),
138 , m_TextlineDir(-1) 134 m_IsParsered(FALSE),
139 , m_CurlineRect(0, 0, 0, 0) 135 m_TextlineDir(-1),
140 { 136 m_CurlineRect(0, 0, 0, 0) {
141 m_pPage = pPage; 137 m_pPage = pPage;
142 m_parserflag = 0; 138 m_parserflag = 0;
143 m_TextBuf.EstimateSize(0, 10240); 139 m_TextBuf.EstimateSize(0, 10240);
144 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int) pPage->GetPageWidth(), (int)pPage->GetPageHeight(), 0); 140 pPage->GetDisplayMatrix(m_DisplayMatrix, 0, 0, (int)pPage->GetPageWidth(),
141 (int)pPage->GetPageHeight(), 0);
145 } 142 }
146 CPDF_TextPage::CPDF_TextPage(const CPDF_PageObjects* pPage, int flags) 143 CPDF_TextPage::CPDF_TextPage(const CPDF_PageObjects* pPage, int flags)
147 : m_charList(512), 144 : m_charList(512),
148 m_TempCharList(50), 145 m_TempCharList(50),
149 m_pPreTextObj(NULL), 146 m_pPreTextObj(NULL),
150 m_IsParsered(FALSE), 147 m_IsParsered(FALSE),
151 m_TextlineDir(-1), 148 m_TextlineDir(-1),
152 m_CurlineRect(0, 0, 0, 0) 149 m_CurlineRect(0, 0, 0, 0) {
153 { 150 m_pPage = pPage;
154 m_pPage = pPage; 151 m_parserflag = flags;
155 m_parserflag = flags; 152 m_TextBuf.EstimateSize(0, 10240);
156 m_TextBuf.EstimateSize(0, 10240); 153 CFX_FloatRect pageRect = pPage->CalcBoundingBox();
157 CFX_FloatRect pageRect = pPage->CalcBoundingBox(); 154 m_DisplayMatrix = CFX_AffineMatrix(1, 0, 0, -1, pageRect.right, pageRect.top);
158 m_DisplayMatrix = CFX_AffineMatrix(1, 0, 0, -1, pageRect.right, pageRect.top ); 155 }
159 } 156 void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) {
160 void CPDF_TextPage::NormalizeObjects(FX_BOOL bNormalize) 157 m_ParseOptions.m_bNormalizeObjs = bNormalize;
161 { 158 }
162 m_ParseOptions.m_bNormalizeObjs = bNormalize; 159 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) {
163 } 160 switch (charInfo.m_Unicode) {
164 bool CPDF_TextPage::IsControlChar(const PAGECHAR_INFO& charInfo) 161 case 0x2:
165 { 162 case 0x3:
166 switch (charInfo.m_Unicode) { 163 case 0x93:
167 case 0x2: 164 case 0x94:
168 case 0x3: 165 case 0x96:
169 case 0x93: 166 case 0x97:
170 case 0x94: 167 case 0x98:
171 case 0x96: 168 case 0xfffe:
172 case 0x97: 169 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN;
173 case 0x98: 170 default:
174 case 0xfffe: 171 return false;
175 return charInfo.m_Flag != FPDFTEXT_CHAR_HYPHEN; 172 }
176 default: 173 }
177 return false; 174 FX_BOOL CPDF_TextPage::ParseTextPage() {
178 } 175 if (!m_pPage) {
179 }
180 FX_BOOL CPDF_TextPage::ParseTextPage()
181 {
182 if (!m_pPage) {
183 m_IsParsered = FALSE;
184 return FALSE;
185 }
186 m_IsParsered = FALSE; 176 m_IsParsered = FALSE;
187 m_TextBuf.Clear(); 177 return FALSE;
188 m_charList.RemoveAll(); 178 }
189 m_pPreTextObj = NULL; 179 m_IsParsered = FALSE;
190 ProcessObject(); 180 m_TextBuf.Clear();
191 m_IsParsered = TRUE; 181 m_charList.RemoveAll();
192 if(!m_ParseOptions.m_bGetCharCodeOnly) { 182 m_pPreTextObj = NULL;
193 m_CharIndex.RemoveAll(); 183 ProcessObject();
194 int nCount = m_charList.GetSize(); 184 m_IsParsered = TRUE;
195 if(nCount) { 185 if (!m_ParseOptions.m_bGetCharCodeOnly) {
196 m_CharIndex.Add(0); 186 m_CharIndex.RemoveAll();
197 } 187 int nCount = m_charList.GetSize();
198 for(int i = 0; i < nCount; i++) { 188 if (nCount) {
199 int indexSize = m_CharIndex.GetSize(); 189 m_CharIndex.Add(0);
200 FX_BOOL bNormal = FALSE; 190 }
201 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i); 191 for (int i = 0; i < nCount; i++) {
202 if(charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) { 192 int indexSize = m_CharIndex.GetSize();
203 bNormal = TRUE; 193 FX_BOOL bNormal = FALSE;
204 } 194 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(i);
205 else if(charinfo.m_Unicode == 0 || IsControlChar(charinfo)) 195 if (charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
206 bNormal = FALSE; 196 bNormal = TRUE;
207 else { 197 } else if (charinfo.m_Unicode == 0 || IsControlChar(charinfo))
208 bNormal = TRUE; 198 bNormal = FALSE;
209 } 199 else {
210 if(bNormal) { 200 bNormal = TRUE;
211 if(indexSize % 2) { 201 }
212 m_CharIndex.Add(1); 202 if (bNormal) {
213 } else { 203 if (indexSize % 2) {
214 if(indexSize <= 0) { 204 m_CharIndex.Add(1);
215 continue; 205 } else {
216 } 206 if (indexSize <= 0) {
217 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1); 207 continue;
218 } 208 }
219 } else { 209 m_CharIndex.SetAt(indexSize - 1,
220 if(indexSize % 2) { 210 m_CharIndex.GetAt(indexSize - 1) + 1);
221 if(indexSize <= 0) { 211 }
222 continue; 212 } else {
223 } 213 if (indexSize % 2) {
224 m_CharIndex.SetAt(indexSize - 1, i + 1); 214 if (indexSize <= 0) {
225 } else { 215 continue;
226 m_CharIndex.Add(i + 1); 216 }
227 } 217 m_CharIndex.SetAt(indexSize - 1, i + 1);
228 } 218 } else {
229 } 219 m_CharIndex.Add(i + 1);
230 int indexSize = m_CharIndex.GetSize(); 220 }
231 if(indexSize % 2) { 221 }
232 m_CharIndex.RemoveAt(indexSize - 1); 222 }
233 }
234 }
235 return TRUE;
236 }
237 int» CPDF_TextPage::CountChars() const
238 {
239 if(m_ParseOptions.m_bGetCharCodeOnly) {
240 return m_TextBuf.GetSize();
241 }
242 return m_charList.GetSize();
243 }
244 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const
245 {
246 int indexSize = m_CharIndex.GetSize(); 223 int indexSize = m_CharIndex.GetSize();
247 int count = 0; 224 if (indexSize % 2) {
248 for(int i = 0; i < indexSize; i += 2) { 225 m_CharIndex.RemoveAt(indexSize - 1);
249 count += m_CharIndex.GetAt(i + 1); 226 }
250 if(count > TextIndex) { 227 }
251 return TextIndex - count + m_CharIndex.GetAt(i + 1) + m_CharInd ex.GetAt(i); 228 return TRUE;
252 } 229 }
253 } 230 int CPDF_TextPage::CountChars() const {
231 if (m_ParseOptions.m_bGetCharCodeOnly) {
232 return m_TextBuf.GetSize();
233 }
234 return m_charList.GetSize();
235 }
236 int CPDF_TextPage::CharIndexFromTextIndex(int TextIndex) const {
237 int indexSize = m_CharIndex.GetSize();
238 int count = 0;
239 for (int i = 0; i < indexSize; i += 2) {
240 count += m_CharIndex.GetAt(i + 1);
241 if (count > TextIndex) {
242 return TextIndex - count + m_CharIndex.GetAt(i + 1) +
243 m_CharIndex.GetAt(i);
244 }
245 }
246 return -1;
247 }
248 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const {
249 int indexSize = m_CharIndex.GetSize();
250 int count = 0;
251 for (int i = 0; i < indexSize; i += 2) {
252 count += m_CharIndex.GetAt(i + 1);
253 if (m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) {
254 if (CharIndex - m_CharIndex.GetAt(i) < 0) {
255 return -1;
256 }
257 return CharIndex - m_CharIndex.GetAt(i) + count -
258 m_CharIndex.GetAt(i + 1);
259 }
260 }
261 return -1;
262 }
263 void CPDF_TextPage::GetRectArray(int start,
264 int nCount,
265 CFX_RectArray& rectArray) const {
266 if (m_ParseOptions.m_bGetCharCodeOnly) {
267 return;
268 }
269 if (start < 0 || nCount == 0) {
270 return;
271 }
272 if (!m_IsParsered) {
273 return;
274 }
275 PAGECHAR_INFO info_curchar;
276 CPDF_TextObject* pCurObj = NULL;
277 CFX_FloatRect rect;
278 int curPos = start;
279 FX_BOOL flagNewRect = TRUE;
280 if (nCount + start > m_charList.GetSize() || nCount == -1) {
281 nCount = m_charList.GetSize() - start;
282 }
283 while (nCount--) {
284 info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++);
285 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
286 continue;
287 }
288 if (info_curchar.m_CharBox.Width() < 0.01 ||
289 info_curchar.m_CharBox.Height() < 0.01) {
290 continue;
291 }
292 if (!pCurObj) {
293 pCurObj = info_curchar.m_pTextObj;
294 }
295 if (pCurObj != info_curchar.m_pTextObj) {
296 rectArray.Add(rect);
297 pCurObj = info_curchar.m_pTextObj;
298 flagNewRect = TRUE;
299 }
300 if (flagNewRect) {
301 FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_OriginY;
302 CFX_AffineMatrix matrix, matrix_reverse;
303 info_curchar.m_pTextObj->GetTextMatrix(&matrix);
304 matrix.Concat(info_curchar.m_Matrix);
305 matrix_reverse.SetReverse(matrix);
306 matrix_reverse.Transform(orgX, orgY);
307 rect.left = info_curchar.m_CharBox.left;
308 rect.right = info_curchar.m_CharBox.right;
309 if (pCurObj->GetFont()->GetTypeDescent()) {
310 rect.bottom = orgY +
311 pCurObj->GetFont()->GetTypeDescent() *
312 pCurObj->GetFontSize() / 1000;
313 FX_FLOAT xPosTemp = orgX;
314 matrix.Transform(xPosTemp, rect.bottom);
315 } else {
316 rect.bottom = info_curchar.m_CharBox.bottom;
317 }
318 if (pCurObj->GetFont()->GetTypeAscent()) {
319 rect.top =
320 orgY +
321 pCurObj->GetFont()->GetTypeAscent() * pCurObj->GetFontSize() / 1000;
322 FX_FLOAT xPosTemp =
323 orgX +
324 GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) *
325 pCurObj->GetFontSize() / 1000;
326 matrix.Transform(xPosTemp, rect.top);
327 } else {
328 rect.top = info_curchar.m_CharBox.top;
329 }
330 flagNewRect = FALSE;
331 rect = info_curchar.m_CharBox;
332 rect.Normalize();
333 } else {
334 info_curchar.m_CharBox.Normalize();
335 if (rect.left > info_curchar.m_CharBox.left) {
336 rect.left = info_curchar.m_CharBox.left;
337 }
338 if (rect.right < info_curchar.m_CharBox.right) {
339 rect.right = info_curchar.m_CharBox.right;
340 }
341 if (rect.top < info_curchar.m_CharBox.top) {
342 rect.top = info_curchar.m_CharBox.top;
343 }
344 if (rect.bottom > info_curchar.m_CharBox.bottom) {
345 rect.bottom = info_curchar.m_CharBox.bottom;
346 }
347 }
348 }
349 rectArray.Add(rect);
350 return;
351 }
352 int CPDF_TextPage::GetIndexAtPos(CPDF_Point point,
353 FX_FLOAT xTorelance,
354 FX_FLOAT yTorelance) const {
355 if (m_ParseOptions.m_bGetCharCodeOnly) {
356 return -3;
357 }
358 if (!m_IsParsered) {
359 return -3;
360 }
361 int pos = 0;
362 int NearPos = -1;
363 double xdif = 5000, ydif = 5000;
364 while (pos < m_charList.GetSize()) {
365 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos));
366 CFX_FloatRect charrect = charinfo.m_CharBox;
367 if (charrect.Contains(point.x, point.y)) {
368 break;
369 }
370 if (xTorelance > 0 || yTorelance > 0) {
371 CFX_FloatRect charRectExt;
372 charrect.Normalize();
373 charRectExt.left = charrect.left - xTorelance / 2;
374 charRectExt.right = charrect.right + xTorelance / 2;
375 charRectExt.top = charrect.top + yTorelance / 2;
376 charRectExt.bottom = charrect.bottom - yTorelance / 2;
377 if (charRectExt.Contains(point.x, point.y)) {
378 double curXdif, curYdif;
379 curXdif = FXSYS_fabs(point.x - charrect.left) <
380 FXSYS_fabs(point.x - charrect.right)
381 ? FXSYS_fabs(point.x - charrect.left)
382 : FXSYS_fabs(point.x - charrect.right);
383 curYdif = FXSYS_fabs(point.y - charrect.bottom) <
384 FXSYS_fabs(point.y - charrect.top)
385 ? FXSYS_fabs(point.y - charrect.bottom)
386 : FXSYS_fabs(point.y - charrect.top);
387 if (curYdif + curXdif < xdif + ydif) {
388 ydif = curYdif;
389 xdif = curXdif;
390 NearPos = pos;
391 }
392 }
393 }
394 ++pos;
395 }
396 if (pos >= m_charList.GetSize()) {
397 pos = NearPos;
398 }
399 return pos;
400 }
401 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const {
402 CFX_WideString strText;
403 if (m_ParseOptions.m_bGetCharCodeOnly || !m_IsParsered) {
404 return strText;
405 }
406 int nCount = m_charList.GetSize();
407 int pos = 0;
408 FX_FLOAT posy = 0;
409 FX_BOOL IsContainPreChar = FALSE;
410 FX_BOOL ISAddLineFeed = FALSE;
411 while (pos < nCount) {
412 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
413 if (IsRectIntersect(rect, charinfo.m_CharBox)) {
414 if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar &&
415 ISAddLineFeed) {
416 posy = charinfo.m_OriginY;
417 if (strText.GetLength() > 0) {
418 strText += L"\r\n";
419 }
420 }
421 IsContainPreChar = TRUE;
422 ISAddLineFeed = FALSE;
423 if (charinfo.m_Unicode) {
424 strText += charinfo.m_Unicode;
425 }
426 } else if (charinfo.m_Unicode == 32) {
427 if (IsContainPreChar && charinfo.m_Unicode) {
428 strText += charinfo.m_Unicode;
429 IsContainPreChar = FALSE;
430 ISAddLineFeed = FALSE;
431 }
432 } else {
433 IsContainPreChar = FALSE;
434 ISAddLineFeed = TRUE;
435 }
436 }
437 return strText;
438 }
439 void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect,
440 CFX_RectArray& resRectArray) const {
441 if (m_ParseOptions.m_bGetCharCodeOnly) {
442 return;
443 }
444 if (!m_IsParsered) {
445 return;
446 }
447 CFX_FloatRect curRect;
448 FX_BOOL flagNewRect = TRUE;
449 CPDF_TextObject* pCurObj = NULL;
450 int nCount = m_charList.GetSize();
451 int pos = 0;
452 while (pos < nCount) {
453 PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
454 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
455 continue;
456 }
457 if (IsRectIntersect(rect, info_curchar.m_CharBox)) {
458 if (!pCurObj) {
459 pCurObj = info_curchar.m_pTextObj;
460 }
461 if (pCurObj != info_curchar.m_pTextObj) {
462 resRectArray.Add(curRect);
463 pCurObj = info_curchar.m_pTextObj;
464 flagNewRect = TRUE;
465 }
466 if (flagNewRect) {
467 curRect = info_curchar.m_CharBox;
468 flagNewRect = FALSE;
469 curRect.Normalize();
470 } else {
471 info_curchar.m_CharBox.Normalize();
472 if (curRect.left > info_curchar.m_CharBox.left) {
473 curRect.left = info_curchar.m_CharBox.left;
474 }
475 if (curRect.right < info_curchar.m_CharBox.right) {
476 curRect.right = info_curchar.m_CharBox.right;
477 }
478 if (curRect.top < info_curchar.m_CharBox.top) {
479 curRect.top = info_curchar.m_CharBox.top;
480 }
481 if (curRect.bottom > info_curchar.m_CharBox.bottom) {
482 curRect.bottom = info_curchar.m_CharBox.bottom;
483 }
484 }
485 }
486 }
487 resRectArray.Add(curRect);
488 return;
489 }
490 int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x,
491 FX_FLOAT y,
492 FX_FLOAT xTorelance,
493 FX_FLOAT yTorelance) const {
494 if (m_ParseOptions.m_bGetCharCodeOnly) {
495 return -3;
496 }
497 CPDF_Point point(x, y);
498 return GetIndexAtPos(point, xTorelance, yTorelance);
499 }
500 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO& info) const {
501 if (m_ParseOptions.m_bGetCharCodeOnly) {
502 return;
503 }
504 if (!m_IsParsered) {
505 return;
506 }
507 if (index < 0 || index >= m_charList.GetSize()) {
508 return;
509 }
510 PAGECHAR_INFO charinfo;
511 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
512 info.m_Charcode = charinfo.m_CharCode;
513 info.m_OriginX = charinfo.m_OriginX;
514 info.m_OriginY = charinfo.m_OriginY;
515 info.m_Unicode = charinfo.m_Unicode;
516 info.m_Flag = charinfo.m_Flag;
517 info.m_CharBox = charinfo.m_CharBox;
518 info.m_pTextObj = charinfo.m_pTextObj;
519 if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont()) {
520 info.m_FontSize = charinfo.m_pTextObj->GetFontSize();
521 }
522 info.m_Matrix.Copy(charinfo.m_Matrix);
523 return;
524 }
525 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start,
526 int32_t& nCount) const {
527 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
528 PAGECHAR_INFO charinfo2 =
529 *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
530 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag &&
531 FPDFTEXT_CHAR_PIECE != charinfo2.m_Flag) {
532 return;
533 }
534 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
535 PAGECHAR_INFO charinfo1 = charinfo;
536 int startIndex = start;
537 while (FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag &&
538 charinfo1.m_Index == charinfo.m_Index) {
539 startIndex--;
540 if (startIndex < 0) {
541 break;
542 }
543 charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex);
544 }
545 startIndex++;
546 start = startIndex;
547 }
548 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
549 PAGECHAR_INFO charinfo3 = charinfo2;
550 int endIndex = start + nCount - 1;
551 while (FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag &&
552 charinfo3.m_Index == charinfo2.m_Index) {
553 endIndex++;
554 if (endIndex >= m_charList.GetSize()) {
555 break;
556 }
557 charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex);
558 }
559 endIndex--;
560 nCount = endIndex - start + 1;
561 }
562 }
563 CFX_WideString CPDF_TextPage::GetPageText(int start, int nCount) const {
564 if (!m_IsParsered || nCount == 0) {
565 return L"";
566 }
567 if (start < 0) {
568 start = 0;
569 }
570 if (nCount == -1) {
571 nCount = m_charList.GetSize() - start;
572 return m_TextBuf.GetWideString().Mid(start,
573 m_TextBuf.GetWideString().GetLength());
574 }
575 if (nCount <= 0 || m_charList.GetSize() <= 0) {
576 return L"";
577 }
578 if (nCount + start > m_charList.GetSize() - 1) {
579 nCount = m_charList.GetSize() - start;
580 }
581 if (nCount <= 0) {
582 return L"";
583 }
584 CheckMarkedContentObject(start, nCount);
585 int startindex = 0;
586 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
587 int startOffset = 0;
588 while (charinfo.m_Index == -1) {
589 startOffset++;
590 if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) {
591 return L"";
592 }
593 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset);
594 }
595 startindex = charinfo.m_Index;
596 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
597 int nCountOffset = 0;
598 while (charinfo.m_Index == -1) {
599 nCountOffset++;
600 if (nCountOffset >= nCount) {
601 return L"";
602 }
603 charinfo =
604 *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffset - 1);
605 }
606 nCount = start + nCount - nCountOffset - startindex;
607 if (nCount <= 0) {
608 return L"";
609 }
610 return m_TextBuf.GetWideString().Mid(startindex, nCount);
611 }
612 int CPDF_TextPage::CountRects(int start, int nCount) {
613 if (m_ParseOptions.m_bGetCharCodeOnly) {
254 return -1; 614 return -1;
255 } 615 }
256 int CPDF_TextPage::TextIndexFromCharIndex(int CharIndex) const 616 if (!m_IsParsered) {
257 {
258 int indexSize = m_CharIndex.GetSize();
259 int count = 0;
260 for(int i = 0; i < indexSize; i += 2) {
261 count += m_CharIndex.GetAt(i + 1);
262 if(m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i) > CharIndex) {
263 if(CharIndex - m_CharIndex.GetAt(i) < 0) {
264 return -1;
265 }
266 return » CharIndex - m_CharIndex.GetAt(i) + count - m_CharIndex.G etAt(i + 1);
267 }
268 }
269 return -1; 617 return -1;
270 } 618 }
271 void CPDF_TextPage::GetRectArray(int start, int nCount, CFX_RectArray& rectArray ) const 619 if (start < 0) {
272 { 620 return -1;
273 if(m_ParseOptions.m_bGetCharCodeOnly) { 621 }
274 return; 622 if (nCount == -1 || nCount + start > m_charList.GetSize()) {
275 } 623 nCount = m_charList.GetSize() - start;
276 if(start < 0 || nCount == 0) { 624 }
277 return; 625 m_SelRects.RemoveAll();
278 } 626 GetRectArray(start, nCount, m_SelRects);
279 if (!m_IsParsered) { 627 return m_SelRects.GetSize();
280 return; 628 }
281 } 629 void CPDF_TextPage::GetRect(int rectIndex,
282 PAGECHAR_INFO info_curchar; 630 FX_FLOAT& left,
283 CPDF_TextObject* pCurObj = NULL; 631 FX_FLOAT& top,
284 CFX_FloatRect rect; 632 FX_FLOAT& right,
285 int curPos = start; 633 FX_FLOAT& bottom) const {
286 FX_BOOL flagNewRect = TRUE; 634 if (m_ParseOptions.m_bGetCharCodeOnly) {
287 if (nCount + start > m_charList.GetSize() || nCount == -1) { 635 return;
288 nCount = m_charList.GetSize() - start; 636 }
289 } 637 if (!m_IsParsered || rectIndex < 0 || rectIndex >= m_SelRects.GetSize()) {
290 while (nCount--) { 638 return;
291 info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(curPos++); 639 }
292 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) { 640 left = m_SelRects.GetAt(rectIndex).left;
293 continue; 641 top = m_SelRects.GetAt(rectIndex).top;
294 } 642 right = m_SelRects.GetAt(rectIndex).right;
295 if(info_curchar.m_CharBox.Width() < 0.01 || info_curchar.m_CharBox.Heigh t() < 0.01) { 643 bottom = m_SelRects.GetAt(rectIndex).bottom;
296 continue; 644 }
297 } 645 FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate) {
298 if(!pCurObj) { 646 if (m_ParseOptions.m_bGetCharCodeOnly) {
299 pCurObj = info_curchar.m_pTextObj; 647 return FALSE;
300 } 648 }
301 if (pCurObj != info_curchar.m_pTextObj) { 649 if (end == start) {
302 rectArray.Add(rect); 650 return FALSE;
303 pCurObj = info_curchar.m_pTextObj; 651 }
304 flagNewRect = TRUE; 652 FX_FLOAT dx, dy;
305 } 653 FPDF_CHAR_INFO info1, info2;
306 if (flagNewRect) { 654 GetCharInfo(start, info1);
307 FX_FLOAT orgX = info_curchar.m_OriginX, orgY = info_curchar.m_Origin Y; 655 GetCharInfo(end, info2);
308 CFX_AffineMatrix matrix, matrix_reverse; 656 while (info2.m_CharBox.Width() == 0 || info2.m_CharBox.Height() == 0) {
309 info_curchar.m_pTextObj->GetTextMatrix(&matrix); 657 end--;
310 matrix.Concat(info_curchar.m_Matrix); 658 if (end <= start) {
311 matrix_reverse.SetReverse(matrix); 659 return FALSE;
312 matrix_reverse.Transform(orgX, orgY); 660 }
313 rect.left = info_curchar.m_CharBox.left;
314 rect.right = info_curchar.m_CharBox.right;
315 if (pCurObj->GetFont()->GetTypeDescent()) {
316 rect.bottom = orgY + pCurObj->GetFont()->GetTypeDescent() * pCur Obj->GetFontSize() / 1000;
317 FX_FLOAT xPosTemp = orgX;
318 matrix.Transform(xPosTemp, rect.bottom);
319 } else {
320 rect.bottom = info_curchar.m_CharBox.bottom;
321 }
322 if (pCurObj->GetFont()->GetTypeAscent()) {
323 rect.top = orgY + pCurObj->GetFont()->GetTypeAscent() * pCurObj- >GetFontSize() / 1000;
324 FX_FLOAT xPosTemp = orgX + GetCharWidth(info_curchar.m_CharCode, pCurObj->GetFont()) * pCurObj->GetFontSize() / 1000;
325 matrix.Transform(xPosTemp, rect.top);
326 } else {
327 rect.top = info_curchar.m_CharBox.top;
328 }
329 flagNewRect = FALSE;
330 rect = info_curchar.m_CharBox;
331 rect.Normalize();
332 } else {
333 info_curchar.m_CharBox.Normalize();
334 if (rect.left > info_curchar.m_CharBox.left) {
335 rect.left = info_curchar.m_CharBox.left;
336 }
337 if (rect.right < info_curchar.m_CharBox.right) {
338 rect.right = info_curchar.m_CharBox.right;
339 }
340 if ( rect.top < info_curchar.m_CharBox.top) {
341 rect.top = info_curchar.m_CharBox.top;
342 }
343 if (rect.bottom > info_curchar.m_CharBox.bottom) {
344 rect.bottom = info_curchar.m_CharBox.bottom;
345 }
346 }
347 }
348 rectArray.Add(rect);
349 return;
350 }
351 int CPDF_TextPage::GetIndexAtPos(CPDF_Point point , FX_FLOAT xTorelance, FX_FLOA T yTorelance) const
352 {
353 if(m_ParseOptions.m_bGetCharCodeOnly) {
354 return -3;
355 }
356 if (!m_IsParsered) {
357 return -3;
358 }
359 int pos = 0;
360 int NearPos = -1;
361 double xdif = 5000, ydif = 5000;
362 while(pos < m_charList.GetSize()) {
363 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)(m_charList.GetAt(pos));
364 CFX_FloatRect charrect = charinfo.m_CharBox;
365 if (charrect.Contains(point.x, point.y)) {
366 break;
367 }
368 if (xTorelance > 0 || yTorelance > 0) {
369 CFX_FloatRect charRectExt;
370 charrect.Normalize();
371 charRectExt.left = charrect.left - xTorelance / 2;
372 charRectExt.right = charrect.right + xTorelance / 2;
373 charRectExt.top = charrect.top + yTorelance / 2;
374 charRectExt.bottom = charrect.bottom - yTorelance / 2;
375 if (charRectExt.Contains(point.x, point.y)) {
376 double curXdif, curYdif;
377 curXdif = FXSYS_fabs(point.x - charrect.left) < FXSYS_fabs(point .x - charrect.right) ? FXSYS_fabs(point.x - charrect.left) : FXSYS_fabs(point.x - charrect.right);
378 curYdif = FXSYS_fabs(point.y - charrect.bottom) < FXSYS_fabs(poi nt.y - charrect.top ) ? FXSYS_fabs(point.y - charrect.bottom) : FXSYS_fabs(p oint.y - charrect.top);
379 if (curYdif + curXdif < xdif + ydif) {
380 ydif = curYdif;
381 xdif = curXdif;
382 NearPos = pos;
383 }
384 }
385 }
386 ++pos;
387 }
388 if (pos >= m_charList.GetSize()) {
389 pos = NearPos;
390 }
391 return pos;
392 }
393 CFX_WideString CPDF_TextPage::GetTextByRect(const CFX_FloatRect& rect) const
394 {
395 CFX_WideString strText;
396 if(m_ParseOptions.m_bGetCharCodeOnly || !m_IsParsered) {
397 return strText;
398 }
399 int nCount = m_charList.GetSize();
400 int pos = 0;
401 FX_FLOAT posy = 0;
402 FX_BOOL IsContainPreChar = FALSE;
403 FX_BOOL ISAddLineFeed = FALSE;
404 while (pos < nCount) {
405 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
406 if (IsRectIntersect(rect, charinfo.m_CharBox)) {
407 if (FXSYS_fabs(posy - charinfo.m_OriginY) > 0 && !IsContainPreChar & & ISAddLineFeed) {
408 posy = charinfo.m_OriginY;
409 if (strText.GetLength() > 0) {
410 strText += L"\r\n";
411 }
412 }
413 IsContainPreChar = TRUE;
414 ISAddLineFeed = FALSE;
415 if (charinfo.m_Unicode) {
416 strText += charinfo.m_Unicode;
417 }
418 } else if (charinfo.m_Unicode == 32) {
419 if (IsContainPreChar && charinfo.m_Unicode) {
420 strText += charinfo.m_Unicode;
421 IsContainPreChar = FALSE;
422 ISAddLineFeed = FALSE;
423 }
424 } else {
425 IsContainPreChar = FALSE;
426 ISAddLineFeed = TRUE;
427 }
428 }
429 return strText;
430 }
431 void CPDF_TextPage::GetRectsArrayByRect(const CFX_FloatRect& rect, CFX_RectArray & resRectArray) const
432 {
433 if(m_ParseOptions.m_bGetCharCodeOnly) {
434 return;
435 }
436 if (!m_IsParsered) {
437 return;
438 }
439 CFX_FloatRect curRect;
440 FX_BOOL flagNewRect = TRUE;
441 CPDF_TextObject* pCurObj = NULL;
442 int nCount = m_charList.GetSize();
443 int pos = 0;
444 while (pos < nCount) {
445 PAGECHAR_INFO info_curchar = *(PAGECHAR_INFO*)m_charList.GetAt(pos++);
446 if (info_curchar.m_Flag == FPDFTEXT_CHAR_GENERATED) {
447 continue;
448 }
449 if (IsRectIntersect(rect, info_curchar.m_CharBox)) {
450 if(!pCurObj) {
451 pCurObj = info_curchar.m_pTextObj;
452 }
453 if (pCurObj != info_curchar.m_pTextObj) {
454 resRectArray.Add(curRect);
455 pCurObj = info_curchar.m_pTextObj;
456 flagNewRect = TRUE;
457 }
458 if (flagNewRect) {
459 curRect = info_curchar.m_CharBox;
460 flagNewRect = FALSE;
461 curRect.Normalize();
462 } else {
463 info_curchar.m_CharBox.Normalize();
464 if (curRect.left > info_curchar.m_CharBox.left) {
465 curRect.left = info_curchar.m_CharBox.left;
466 }
467 if (curRect.right < info_curchar.m_CharBox.right) {
468 curRect.right = info_curchar.m_CharBox.right;
469 }
470 if ( curRect.top < info_curchar.m_CharBox.top) {
471 curRect.top = info_curchar.m_CharBox.top;
472 }
473 if (curRect.bottom > info_curchar.m_CharBox.bottom) {
474 curRect.bottom = info_curchar.m_CharBox.bottom;
475 }
476 }
477 }
478 }
479 resRectArray.Add(curRect);
480 return;
481 }
482 int CPDF_TextPage::GetIndexAtPos(FX_FLOAT x, FX_FLOAT y, FX_FLOAT xTorelance , FX_FLOAT yTorelance) const
483 {
484 if(m_ParseOptions.m_bGetCharCodeOnly) {
485 return -3;
486 }
487 CPDF_Point point(x, y);
488 return GetIndexAtPos(point, xTorelance, yTorelance);
489 }
490 void CPDF_TextPage::GetCharInfo(int index, FPDF_CHAR_INFO & info) const
491 {
492 if(m_ParseOptions.m_bGetCharCodeOnly) {
493 return;
494 }
495 if (!m_IsParsered) {
496 return;
497 }
498 if (index < 0 || index >= m_charList.GetSize()) {
499 return;
500 }
501 PAGECHAR_INFO charinfo;
502 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
503 info.m_Charcode = charinfo.m_CharCode;
504 info.m_OriginX = charinfo.m_OriginX;
505 info.m_OriginY = charinfo.m_OriginY;
506 info.m_Unicode = charinfo.m_Unicode;
507 info.m_Flag = charinfo.m_Flag;
508 info.m_CharBox = charinfo.m_CharBox;
509 info.m_pTextObj = charinfo.m_pTextObj;
510 if (charinfo.m_pTextObj && charinfo.m_pTextObj->GetFont()) {
511 info.m_FontSize = charinfo.m_pTextObj->GetFontSize();
512 }
513 info.m_Matrix.Copy(charinfo.m_Matrix);
514 return;
515 }
516 void CPDF_TextPage::CheckMarkedContentObject(int32_t& start, int32_t& nCount) co nst
517 {
518 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
519 PAGECHAR_INFO charinfo2 = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
520 if (FPDFTEXT_CHAR_PIECE != charinfo.m_Flag && FPDFTEXT_CHAR_PIECE != charinf o2.m_Flag) {
521 return;
522 }
523 if (FPDFTEXT_CHAR_PIECE == charinfo.m_Flag) {
524 PAGECHAR_INFO charinfo1 = charinfo;
525 int startIndex = start;
526 while(FPDFTEXT_CHAR_PIECE == charinfo1.m_Flag && charinfo1.m_Index == ch arinfo.m_Index) {
527 startIndex--;
528 if (startIndex < 0) {
529 break;
530 }
531 charinfo1 = *(PAGECHAR_INFO*)m_charList.GetAt(startIndex);
532 }
533 startIndex++;
534 start = startIndex;
535 }
536 if (FPDFTEXT_CHAR_PIECE == charinfo2.m_Flag) {
537 PAGECHAR_INFO charinfo3 = charinfo2;
538 int endIndex = start + nCount - 1;
539 while(FPDFTEXT_CHAR_PIECE == charinfo3.m_Flag && charinfo3.m_Index == ch arinfo2.m_Index) {
540 endIndex++;
541 if (endIndex >= m_charList.GetSize()) {
542 break;
543 }
544 charinfo3 = *(PAGECHAR_INFO*)m_charList.GetAt(endIndex);
545 }
546 endIndex--;
547 nCount = endIndex - start + 1;
548 }
549 }
550 CFX_WideString CPDF_TextPage::GetPageText(int start , int nCount) const
551 {
552 if (!m_IsParsered || nCount == 0) {
553 return L"";
554 }
555 if (start < 0) {
556 start = 0;
557 }
558 if (nCount == -1) {
559 nCount = m_charList.GetSize() - start;
560 return m_TextBuf.GetWideString().Mid(start, m_TextBuf.GetWideString().Ge tLength());
561 }
562 if(nCount <= 0 || m_charList.GetSize() <= 0) {
563 return L"";
564 }
565 if(nCount + start > m_charList.GetSize() - 1) {
566 nCount = m_charList.GetSize() - start;
567 }
568 if (nCount <= 0) {
569 return L"";
570 }
571 CheckMarkedContentObject(start, nCount);
572 int startindex = 0;
573 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start);
574 int startOffset = 0;
575 while(charinfo.m_Index == -1) {
576 startOffset++;
577 if (startOffset > nCount || start + startOffset >= m_charList.GetSize()) {
578 return L"";
579 }
580 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + startOffset);
581 }
582 startindex = charinfo.m_Index;
583 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - 1);
584 int nCountOffset = 0;
585 while (charinfo.m_Index == -1) {
586 nCountOffset++;
587 if (nCountOffset >= nCount) {
588 return L"";
589 }
590 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(start + nCount - nCountOffs et - 1);
591 }
592 nCount = start + nCount - nCountOffset - startindex;
593 if(nCount <= 0) {
594 return L"";
595 }
596 return m_TextBuf.GetWideString().Mid(startindex, nCount);
597 }
598 int CPDF_TextPage::CountRects(int start, int nCount)
599 {
600 if(m_ParseOptions.m_bGetCharCodeOnly) {
601 return -1;
602 }
603 if (!m_IsParsered) {
604 return -1;
605 }
606 if (start < 0) {
607 return -1;
608 }
609 if (nCount == -1 || nCount + start > m_charList.GetSize() ) {
610 nCount = m_charList.GetSize() - start;
611 }
612 m_SelRects.RemoveAll();
613 GetRectArray(start, nCount, m_SelRects);
614 return m_SelRects.GetSize();
615 }
616 void CPDF_TextPage::GetRect(int rectIndex, FX_FLOAT& left, FX_FLOAT& top, FX_FLO AT& right, FX_FLOAT &bottom) const
617 {
618 if(m_ParseOptions.m_bGetCharCodeOnly) {
619 return ;
620 }
621 if (!m_IsParsered || rectIndex < 0 || rectIndex >= m_SelRects.GetSize()) {
622 return;
623 }
624 left = m_SelRects.GetAt(rectIndex).left;
625 top = m_SelRects.GetAt(rectIndex).top;
626 right = m_SelRects.GetAt(rectIndex).right;
627 bottom = m_SelRects.GetAt(rectIndex).bottom;
628 }
629 FX_BOOL CPDF_TextPage::GetBaselineRotate(int start, int end, int& Rotate)
630 {
631 if(m_ParseOptions.m_bGetCharCodeOnly) {
632 return FALSE;
633 }
634 if(end == start) {
635 return FALSE;
636 }
637 FX_FLOAT dx, dy;
638 FPDF_CHAR_INFO info1, info2;
639 GetCharInfo(start, info1);
640 GetCharInfo(end, info2); 661 GetCharInfo(end, info2);
641 while(info2.m_CharBox.Width() == 0 || info2.m_CharBox.Height() == 0) { 662 }
642 end--; 663 dx = (info2.m_OriginX - info1.m_OriginX);
643 if(end <= start) { 664 dy = (info2.m_OriginY - info1.m_OriginY);
644 return FALSE; 665 if (dx == 0) {
645 } 666 if (dy > 0) {
646 GetCharInfo(end, info2); 667 Rotate = 90;
647 } 668 } else if (dy < 0) {
648 dx = (info2.m_OriginX - info1.m_OriginX); 669 Rotate = 270;
649 dy = (info2.m_OriginY - info1.m_OriginY);
650 if(dx == 0) {
651 if(dy > 0) {
652 Rotate = 90;
653 } else if (dy < 0) {
654 Rotate = 270;
655 } else {
656 Rotate = 0;
657 }
658 } else { 670 } else {
659 float a = FXSYS_atan2(dy, dx); 671 Rotate = 0;
660 Rotate = (int)(a * 180 / FX_PI + 0.5); 672 }
661 } 673 } else {
662 if(Rotate < 0) { 674 float a = FXSYS_atan2(dy, dx);
663 Rotate = -Rotate; 675 Rotate = (int)(a * 180 / FX_PI + 0.5);
664 } else if(Rotate > 0) { 676 }
665 Rotate = 360 - Rotate; 677 if (Rotate < 0) {
666 } 678 Rotate = -Rotate;
667 return TRUE; 679 } else if (Rotate > 0) {
668 } 680 Rotate = 360 - Rotate;
669 FX_BOOL»CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect , int& Rotate ) 681 }
670 { 682 return TRUE;
671 if(m_ParseOptions.m_bGetCharCodeOnly) { 683 }
672 return FALSE; 684 FX_BOOL CPDF_TextPage::GetBaselineRotate(const CFX_FloatRect& rect,
673 } 685 int& Rotate) {
674 int start, end, count, n = CountBoundedSegments(rect.left, rect.top, rect.ri ght, rect.bottom, TRUE); 686 if (m_ParseOptions.m_bGetCharCodeOnly) {
675 if(n < 1) { 687 return FALSE;
676 return FALSE; 688 }
677 } 689 int start, end, count,
678 if(n > 1) { 690 n = CountBoundedSegments(rect.left, rect.top, rect.right, rect.bottom,
679 GetBoundedSegment(n - 1, start, count); 691 TRUE);
680 end = start + count - 1; 692 if (n < 1) {
681 GetBoundedSegment(0, start, count); 693 return FALSE;
694 }
695 if (n > 1) {
696 GetBoundedSegment(n - 1, start, count);
697 end = start + count - 1;
698 GetBoundedSegment(0, start, count);
699 } else {
700 GetBoundedSegment(0, start, count);
701 end = start + count - 1;
702 }
703 return GetBaselineRotate(start, end, Rotate);
704 }
705 FX_BOOL CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate) {
706 if (m_ParseOptions.m_bGetCharCodeOnly) {
707 return FALSE;
708 }
709 if (!m_IsParsered || rectIndex < 0 || rectIndex > m_SelRects.GetSize()) {
710 return FALSE;
711 }
712 CFX_FloatRect rect = m_SelRects.GetAt(rectIndex);
713 return GetBaselineRotate(rect, Rotate);
714 }
715 int CPDF_TextPage::CountBoundedSegments(FX_FLOAT left,
716 FX_FLOAT top,
717 FX_FLOAT right,
718 FX_FLOAT bottom,
719 FX_BOOL bContains) {
720 if (m_ParseOptions.m_bGetCharCodeOnly) {
721 return -1;
722 }
723 m_Segment.RemoveAll();
724 if (!m_IsParsered) {
725 return -1;
726 }
727 CFX_FloatRect rect(left, bottom, right, top);
728 rect.Normalize();
729 int nCount = m_charList.GetSize();
730 int pos = 0;
731 FPDF_SEGMENT segment;
732 segment.m_Start = 0;
733 segment.m_nCount = 0;
734 int segmentStatus = 0;
735 FX_BOOL IsContainPreChar = FALSE;
736 while (pos < nCount) {
737 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos);
738 if (bContains && rect.Contains(charinfo.m_CharBox)) {
739 if (segmentStatus == 0 || segmentStatus == 2) {
740 segment.m_Start = pos;
741 segment.m_nCount = 1;
742 segmentStatus = 1;
743 } else if (segmentStatus == 1) {
744 segment.m_nCount++;
745 }
746 IsContainPreChar = TRUE;
747 } else if (!bContains &&
748 (IsRectIntersect(rect, charinfo.m_CharBox) ||
749 rect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) {
750 if (segmentStatus == 0 || segmentStatus == 2) {
751 segment.m_Start = pos;
752 segment.m_nCount = 1;
753 segmentStatus = 1;
754 } else if (segmentStatus == 1) {
755 segment.m_nCount++;
756 }
757 IsContainPreChar = TRUE;
758 } else if (charinfo.m_Unicode == 32) {
759 if (IsContainPreChar == TRUE) {
760 if (segmentStatus == 0 || segmentStatus == 2) {
761 segment.m_Start = pos;
762 segment.m_nCount = 1;
763 segmentStatus = 1;
764 } else if (segmentStatus == 1) {
765 segment.m_nCount++;
766 }
767 IsContainPreChar = FALSE;
768 } else {
769 if (segmentStatus == 1) {
770 segmentStatus = 2;
771 m_Segment.Add(segment);
772 segment.m_Start = 0;
773 segment.m_nCount = 0;
774 }
775 }
682 } else { 776 } else {
683 GetBoundedSegment(0, start, count); 777 if (segmentStatus == 1) {
684 end = start + count - 1;
685 }
686 return GetBaselineRotate(start, end, Rotate);
687 }
688 FX_BOOL»CPDF_TextPage::GetBaselineRotate(int rectIndex, int& Rotate)
689 {
690 if(m_ParseOptions.m_bGetCharCodeOnly) {
691 return FALSE;
692 }
693 if (!m_IsParsered || rectIndex < 0 || rectIndex > m_SelRects.GetSize()) {
694 return FALSE;
695 }
696 CFX_FloatRect rect = m_SelRects.GetAt(rectIndex);
697 return GetBaselineRotate(rect , Rotate);
698 }
699 int» CPDF_TextPage::CountBoundedSegments(FX_FLOAT left, FX_FLOAT top, FX_FLOA T right, FX_FLOAT bottom, FX_BOOL bContains )
700 {
701 if(m_ParseOptions.m_bGetCharCodeOnly) {
702 return -1;
703 }
704 m_Segment.RemoveAll();
705 if (!m_IsParsered)» {
706 return -1;
707 }
708 CFX_FloatRect rect(left, bottom, right, top);
709 rect.Normalize();
710 int nCount = m_charList.GetSize();
711 int pos = 0;
712 FPDF_SEGMENT» segment;
713 segment.m_Start = 0;
714 segment.m_nCount = 0;
715 int » » segmentStatus = 0;
716 FX_BOOL» » IsContainPreChar = FALSE;
717 while (pos < nCount) {
718 PAGECHAR_INFO charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(pos);
719 if(bContains && rect.Contains(charinfo.m_CharBox)) {
720 if (segmentStatus == 0 || segmentStatus == 2) {
721 segment.m_Start = pos;
722 segment.m_nCount = 1;
723 segmentStatus = 1;
724 } else if (segmentStatus == 1) {
725 segment.m_nCount++;
726 }
727 IsContainPreChar = TRUE;
728 } else if (!bContains && (IsRectIntersect(rect, charinfo.m_CharBox) || r ect.Contains(charinfo.m_OriginX, charinfo.m_OriginY))) {
729 if (segmentStatus == 0 || segmentStatus == 2) {
730 segment.m_Start = pos;
731 segment.m_nCount = 1;
732 segmentStatus = 1;
733 } else if (segmentStatus == 1) {
734 segment.m_nCount++;
735 }
736 IsContainPreChar = TRUE;
737 } else if (charinfo.m_Unicode == 32) {
738 if (IsContainPreChar == TRUE) {
739 if (segmentStatus == 0 || segmentStatus == 2) {
740 segment.m_Start = pos;
741 segment.m_nCount = 1;
742 segmentStatus = 1;
743 } else if (segmentStatus == 1) {
744 segment.m_nCount++;
745 }
746 IsContainPreChar = FALSE;
747 } else {
748 if (segmentStatus == 1) {
749 segmentStatus = 2;
750 m_Segment.Add(segment);
751 segment.m_Start = 0;
752 segment.m_nCount = 0;
753 }
754 }
755 } else {
756 if (segmentStatus == 1) {
757 segmentStatus = 2;
758 m_Segment.Add(segment);
759 segment.m_Start = 0;
760 segment.m_nCount = 0;
761 }
762 IsContainPreChar = FALSE;
763 }
764 pos++;
765 }
766 if (segmentStatus == 1) {
767 segmentStatus = 2; 778 segmentStatus = 2;
768 m_Segment.Add(segment); 779 m_Segment.Add(segment);
769 segment.m_Start = 0; 780 segment.m_Start = 0;
770 segment.m_nCount = 0; 781 segment.m_nCount = 0;
771 } 782 }
772 return m_Segment.GetSize(); 783 IsContainPreChar = FALSE;
773 } 784 }
774 void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const 785 pos++;
775 { 786 }
776 if(m_ParseOptions.m_bGetCharCodeOnly) { 787 if (segmentStatus == 1) {
777 return ; 788 segmentStatus = 2;
778 } 789 m_Segment.Add(segment);
779 if (index < 0 || index >= m_Segment.GetSize()) { 790 segment.m_Start = 0;
791 segment.m_nCount = 0;
792 }
793 return m_Segment.GetSize();
794 }
795 void CPDF_TextPage::GetBoundedSegment(int index, int& start, int& count) const {
796 if (m_ParseOptions.m_bGetCharCodeOnly) {
797 return;
798 }
799 if (index < 0 || index >= m_Segment.GetSize()) {
800 return;
801 }
802 start = m_Segment.GetAt(index).m_Start;
803 count = m_Segment.GetAt(index).m_nCount;
804 }
805 int CPDF_TextPage::GetWordBreak(int index, int direction) const {
806 if (m_ParseOptions.m_bGetCharCodeOnly) {
807 return -1;
808 }
809 if (!m_IsParsered) {
810 return -1;
811 }
812 if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT) {
813 return -1;
814 }
815 if (index < 0 || index >= m_charList.GetSize()) {
816 return -1;
817 }
818 PAGECHAR_INFO charinfo;
819 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index);
820 if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) {
821 return index;
822 }
823 if (!IsLetter(charinfo.m_Unicode)) {
824 return index;
825 }
826 int breakPos = index;
827 if (direction == FPDFTEXT_LEFT) {
828 while (--breakPos > 0) {
829 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
830 if (!IsLetter(charinfo.m_Unicode)) {
831 return breakPos;
832 }
833 }
834 } else if (direction == FPDFTEXT_RIGHT) {
835 while (++breakPos < m_charList.GetSize()) {
836 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos);
837 if (!IsLetter(charinfo.m_Unicode)) {
838 return breakPos;
839 }
840 }
841 }
842 return breakPos;
843 }
844 int32_t CPDF_TextPage::FindTextlineFlowDirection() {
845 if (!m_pPage) {
846 return -1;
847 }
848 const int32_t nPageWidth = (int32_t)((CPDF_Page*)m_pPage)->GetPageWidth();
849 const int32_t nPageHeight = (int32_t)((CPDF_Page*)m_pPage)->GetPageHeight();
850 CFX_ByteArray nHorizontalMask;
851 if (!nHorizontalMask.SetSize(nPageWidth)) {
852 return -1;
853 }
854 uint8_t* pDataH = nHorizontalMask.GetData();
855 CFX_ByteArray nVerticalMask;
856 if (!nVerticalMask.SetSize(nPageHeight)) {
857 return -1;
858 }
859 uint8_t* pDataV = nVerticalMask.GetData();
860 int32_t index = 0;
861 FX_FLOAT fLineHeight = 0.0f;
862 CPDF_PageObject* pPageObj = NULL;
863 FX_POSITION pos = NULL;
864 pos = m_pPage->GetFirstObjectPosition();
865 if (!pos) {
866 return -1;
867 }
868 while (pos) {
869 pPageObj = m_pPage->GetNextObject(pos);
870 if (NULL == pPageObj) {
871 continue;
872 }
873 if (PDFPAGE_TEXT != pPageObj->m_Type) {
874 continue;
875 }
876 int32_t minH =
877 (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_Left;
878 int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth
879 ? nPageWidth
880 : (int32_t)pPageObj->m_Right;
881 int32_t minV =
882 (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj->m_Bottom;
883 int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight
884 ? nPageHeight
885 : (int32_t)pPageObj->m_Top;
886 if (minH >= maxH || minV >= maxV) {
887 continue;
888 }
889 FXSYS_memset(pDataH + minH, 1, maxH - minH);
890 FXSYS_memset(pDataV + minV, 1, maxV - minV);
891 if (fLineHeight <= 0.0f) {
892 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
893 }
894 pPageObj = NULL;
895 }
896 int32_t nStartH = 0;
897 int32_t nEndH = 0;
898 FX_FLOAT nSumH = 0.0f;
899 for (index = 0; index < nPageWidth; index++)
900 if (1 == nHorizontalMask[index]) {
901 break;
902 }
903 nStartH = index;
904 for (index = nPageWidth; index > 0; index--)
905 if (1 == nHorizontalMask[index - 1]) {
906 break;
907 }
908 nEndH = index;
909 for (index = nStartH; index < nEndH; index++) {
910 nSumH += nHorizontalMask[index];
911 }
912 nSumH /= nEndH - nStartH;
913 int32_t nStartV = 0;
914 int32_t nEndV = 0;
915 FX_FLOAT nSumV = 0.0f;
916 for (index = 0; index < nPageHeight; index++)
917 if (1 == nVerticalMask[index]) {
918 break;
919 }
920 nStartV = index;
921 for (index = nPageHeight; index > 0; index--)
922 if (1 == nVerticalMask[index - 1]) {
923 break;
924 }
925 nEndV = index;
926 for (index = nStartV; index < nEndV; index++) {
927 nSumV += nVerticalMask[index];
928 }
929 nSumV /= nEndV - nStartV;
930 if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) {
931 return 0;
932 }
933 if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) {
934 return 1;
935 }
936 if (nSumH > 0.8f) {
937 return 0;
938 }
939 if (nSumH - nSumV > 0.0f) {
940 return 0;
941 }
942 if (nSumV - nSumH > 0.0f) {
943 return 1;
944 }
945 return -1;
946 }
947 void CPDF_TextPage::ProcessObject() {
948 CPDF_PageObject* pPageObj = NULL;
949 if (!m_pPage) {
950 return;
951 }
952 FX_POSITION pos;
953 pos = m_pPage->GetFirstObjectPosition();
954 if (!pos) {
955 return;
956 }
957 m_TextlineDir = FindTextlineFlowDirection();
958 int nCount = 0;
959 while (pos) {
960 pPageObj = m_pPage->GetNextObject(pos);
961 if (pPageObj) {
962 if (pPageObj->m_Type == PDFPAGE_TEXT) {
963 CFX_AffineMatrix matrix;
964 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos);
965 nCount++;
966 } else if (pPageObj->m_Type == PDFPAGE_FORM) {
967 CFX_AffineMatrix formMatrix(1, 0, 0, 1, 0, 0);
968 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix);
969 }
970 }
971 pPageObj = NULL;
972 }
973 int count = m_LineObj.GetSize();
974 for (int i = 0; i < count; i++) {
975 ProcessTextObject(m_LineObj.GetAt(i));
976 }
977 m_LineObj.RemoveAll();
978 CloseTempLine();
979 }
980 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj,
981 const CFX_AffineMatrix& formMatrix) {
982 CPDF_PageObject* pPageObj = NULL;
983 FX_POSITION pos;
984 if (!pFormObj) {
985 return;
986 }
987 pos = pFormObj->m_pForm->GetFirstObjectPosition();
988 if (!pos) {
989 return;
990 }
991 CFX_AffineMatrix curFormMatrix;
992 curFormMatrix.Copy(pFormObj->m_FormMatrix);
993 curFormMatrix.Concat(formMatrix);
994 while (pos) {
995 pPageObj = pFormObj->m_pForm->GetNextObject(pos);
996 if (pPageObj) {
997 if (pPageObj->m_Type == PDFPAGE_TEXT) {
998 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos);
999 } else if (pPageObj->m_Type == PDFPAGE_FORM) {
1000 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix);
1001 }
1002 }
1003 pPageObj = NULL;
1004 }
1005 }
1006 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const {
1007 if (charCode == -1) {
1008 return 0;
1009 }
1010 int w = pFont->GetCharWidthF(charCode);
1011 if (w == 0) {
1012 CFX_ByteString str;
1013 pFont->AppendChar(str, charCode);
1014 w = pFont->GetStringWidth(str, 1);
1015 if (w == 0) {
1016 FX_RECT BBox;
1017 pFont->GetCharBBox(charCode, BBox);
1018 w = BBox.right - BBox.left;
1019 }
1020 }
1021 return w;
1022 }
1023 void CPDF_TextPage::OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str) {
1024 int32_t start, count;
1025 int32_t ret = pBidi->GetBidiInfo(start, count);
1026 if (ret == 2) {
1027 for (int i = start + count - 1; i >= start; i--) {
1028 m_TextBuf.AppendChar(str.GetAt(i));
1029 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
1030 }
1031 } else {
1032 int end = start + count;
1033 for (int i = start; i < end; i++) {
1034 m_TextBuf.AppendChar(str.GetAt(i));
1035 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
1036 }
1037 }
1038 }
1039 void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i) {
1040 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
1041 FX_WCHAR wChar = str.GetAt(i);
1042 if (!IsControlChar(Info)) {
1043 Info.m_Index = m_TextBuf.GetLength();
1044 if (wChar >= 0xFB00 && wChar <= 0xFB06) {
1045 FX_WCHAR* pDst = NULL;
1046 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1047 if (nCount >= 1) {
1048 pDst = FX_Alloc(FX_WCHAR, nCount);
1049 FX_Unicode_GetNormalization(wChar, pDst);
1050 for (int nIndex = 0; nIndex < nCount; nIndex++) {
1051 PAGECHAR_INFO Info2 = Info;
1052 Info2.m_Unicode = pDst[nIndex];
1053 Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1054 m_TextBuf.AppendChar(Info2.m_Unicode);
1055 if (!m_ParseOptions.m_bGetCharCodeOnly) {
1056 m_charList.Add(Info2);
1057 }
1058 }
1059 FX_Free(pDst);
780 return; 1060 return;
781 } 1061 }
782 start = m_Segment.GetAt(index).m_Start; 1062 }
783 count = m_Segment.GetAt(index).m_nCount; 1063 m_TextBuf.AppendChar(wChar);
784 } 1064 } else {
785 int CPDF_TextPage::GetWordBreak(int index, int direction) const 1065 Info.m_Index = -1;
786 { 1066 }
787 if(m_ParseOptions.m_bGetCharCodeOnly) { 1067 if (!m_ParseOptions.m_bGetCharCodeOnly) {
788 return -1; 1068 m_charList.Add(Info);
789 } 1069 }
790 if (!m_IsParsered) { 1070 }
791 return -1; 1071 void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i) {
792 } 1072 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
793 if (direction != FPDFTEXT_LEFT && direction != FPDFTEXT_RIGHT) { 1073 if (!IsControlChar(Info)) {
794 return -1; 1074 Info.m_Index = m_TextBuf.GetLength();
795 } 1075 FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE);
796 if (index < 0 || index >= m_charList.GetSize()) { 1076 FX_WCHAR* pDst = NULL;
797 return -1; 1077 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1078 if (nCount >= 1) {
1079 pDst = FX_Alloc(FX_WCHAR, nCount);
1080 FX_Unicode_GetNormalization(wChar, pDst);
1081 for (int nIndex = 0; nIndex < nCount; nIndex++) {
1082 PAGECHAR_INFO Info2 = Info;
1083 Info2.m_Unicode = pDst[nIndex];
1084 Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1085 m_TextBuf.AppendChar(Info2.m_Unicode);
1086 if (!m_ParseOptions.m_bGetCharCodeOnly) {
1087 m_charList.Add(Info2);
1088 }
1089 }
1090 FX_Free(pDst);
1091 return;
1092 }
1093 Info.m_Unicode = wChar;
1094 m_TextBuf.AppendChar(Info.m_Unicode);
1095 } else {
1096 Info.m_Index = -1;
1097 }
1098 if (!m_ParseOptions.m_bGetCharCodeOnly) {
1099 m_charList.Add(Info);
1100 }
1101 }
// Flushes the pending line held in m_TempCharList / m_TempTextBuf.
// Steps:
//   1. Collapse consecutive spaces (U+0020) in the temp buffers.
//   2. Feed the characters to an IFX_BidiChar and record each reported run
//      in |order| as a (start, count, ret) triple.
//   3. Emit the runs through AddCharInfoByRLDirection /
//      AddCharInfoByLRDirection, in an order that depends on m_parserflag
//      and on whether runs with ret == 2 are at least as numerous as runs
//      with ret == 1.
//   4. Clear |order| and both temp buffers.
// NOTE(review): ret == 2 / 1 / 0 presumably mean right-to-left /
// left-to-right / neutral, going by the nR2L and nL2R counter names --
// confirm against IFX_BidiChar.
1102 void CPDF_TextPage::CloseTempLine() {
1103 int count1 = m_TempCharList.GetSize();
1104 if (count1 <= 0) {
1105 return;
1106 }
1107 nonstd::unique_ptr<IFX_BidiChar> pBidiChar(IFX_BidiChar::Create());
1108 CFX_WideString str = m_TempTextBuf.GetWideString();
// Flat list of triples: start, count, ret for each run.
1109 CFX_WordArray order;
1110 FX_BOOL bR2L = FALSE;
1111 int32_t start = 0, count = 0;
1112 int nR2L = 0, nL2R = 0;
1113 FX_BOOL bPrevSpace = FALSE;
1114 for (int i = 0; i < str.GetLength(); i++) {
1115 if (str.GetAt(i) == 32) {
1116 if (bPrevSpace) {
// Second space in a row: drop it from all three parallel containers and
// re-examine the same index.
1117 m_TempTextBuf.Delete(i, 1);
1118 m_TempCharList.Delete(i);
1119 str.Delete(i);
1120 count1--;
1121 i--;
1122 continue;
1123 }
1124 bPrevSpace = TRUE;
1125 } else {
1126 bPrevSpace = FALSE;
1127 }
// AppendChar() returning true is followed by reading the run via GetBidiInfo().
1128 if (pBidiChar->AppendChar(str.GetAt(i))) {
1129 int32_t ret = pBidiChar->GetBidiInfo(start, count);
1130 order.Add(start);
1131 order.Add(count);
1132 order.Add(ret);
// bR2L is still FALSE here (it is only assigned after both scans), so the
// counters are always updated.
1133 if (!bR2L) {
1134 if (ret == 2) {
1135 nR2L++;
1136 } else if (ret == 1) {
1137 nL2R++;
1138 }
1139 }
1140 }
1141 }
// Record the final run, if EndChar() reports one.
1142 if (pBidiChar->EndChar()) {
1143 int32_t ret = pBidiChar->GetBidiInfo(start, count);
1144 order.Add(start);
1145 order.Add(count);
1146 order.Add(ret);
1147 if (!bR2L) {
1148 if (ret == 2) {
1149 nR2L++;
1150 } else if (ret == 1) {
1151 nL2R++;
1152 }
1153 }
1154 }
// The line is treated as right-to-left when ret == 2 runs exist and are at
// least as numerous as ret == 1 runs.
1155 if (nR2L > 0 && nR2L >= nL2R) {
1156 bR2L = TRUE;
1157 }
1158 if (m_parserflag == FPDFTEXT_RLTB || bR2L) {
// Walk the triples from last to first; |i| indexes the ret slot of a triple.
// The locals below shadow the outer start / count / count1.
1159 int count = order.GetSize();
1160 for (int i = count - 1; i > 0; i -= 3) {
1161 int ret = order.GetAt(i);
1162 int start = order.GetAt(i - 2);
1163 int count1 = order.GetAt(i - 1);
1164 if (ret == 2 || ret == 0) {
// Emit this run's characters back to front.
1165 for (int j = start + count1 - 1; j >= start; j--) {
1166 AddCharInfoByRLDirection(str, j);
1167 }
1168 } else {
// Scan backwards to the nearest earlier triple with ret == 2; bSymbol
// records whether the last triple visited by the scan had ret == 0.
1169 int j = i;
1170 FX_BOOL bSymbol = FALSE;
1171 while (j > 0 && order.GetAt(j) != 2) {
1172 bSymbol = !order.GetAt(j);
1173 j -= 3;
1174 }
1175 int end = start + count1;
1176 int n = 0;
1177 if (bSymbol) {
1178 n = j + 6;
1179 } else {
1180 n = j + 3;
1181 }
1182 if (n >= i) {
1183 for (int m = start; m < end; m++) {
1184 AddCharInfoByLRDirection(str, m);
1185 }
1186 } else {
// Emit triples n..j in forward order, and move |i| back to n so the outer
// loop continues below the group just emitted.
1187 j = i;
1188 i = n;
1189 for (; n <= j; n += 3) {
1190 int start = order.GetAt(n - 2);
1191 int count1 = order.GetAt(n - 1);
1192 int end = start + count1;
1193 for (int m = start; m < end; m++) {
1194 AddCharInfoByLRDirection(str, m);
1195 }
1196 }
1197 }
1198 }
1199 }
1200 } else {
// Walk the triples first to last; here |i| indexes the start slot of a triple.
1201 int count = order.GetSize();
1202 FX_BOOL bL2R = FALSE;
1203 for (int i = 0; i < count; i += 3) {
1204 int ret = order.GetAt(i + 2);
1205 int start = order.GetAt(i);
1206 int count1 = order.GetAt(i + 1);
1207 if (ret == 2 || (i == 0 && ret == 0 && !bL2R)) {
// bR2L is FALSE on this path, so the scan loop below never runs and j stays
// at i + 3.
1208 int j = i + 3;
1209 while (bR2L && j < count) {
1210 if (order.GetAt(j + 2) == 1) {
1211 break;
1212 } else {
1213 j += 3;
1214 }
1215 }
// Only true when i == 0: restart from the first triple with bL2R set, which
// disables the leading ret == 0 case above.
1216 if (j == 3) {
1217 i = -3;
1218 bL2R = TRUE;
1219 continue;
1220 }
1221 int end = m_TempCharList.GetSize() - 1;
1222 if (j < count) {
1223 end = order.GetAt(j) - 1;
1224 }
1225 i = j - 3;
1226 for (int n = end; n >= start; n--) {
1227 AddCharInfoByRLDirection(str, n);
1228 }
1229 } else {
1230 int end = start + count1;
1231 for (int n = start; n < end; n++) {
1232 AddCharInfoByLRDirection(str, n);
1233 }
1234 }
1235 }
1236 }
// Reset the temporary state for the next line.
1237 order.RemoveAll();
1238 m_TempCharList.RemoveAll();
1239 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
1240 }
// Queues |pTextObj| (with the form matrix it was found under) in m_LineObj,
// the list of text objects making up the current line.
//   - Objects whose |m_Right - m_Left| is below 0.01 are ignored.
//   - An object reported by IsSameAsPreTextObject() is dropped (not checked
//     while the queue is empty).
//   - If the object's transformed Y position differs from the last queued
//     object's by more than twice the threshold, the queued objects are
//     processed, the queue is cleared and this object starts a new line.
//   - Otherwise the object is inserted by transformed X position when
//     m_ParseOptions.m_bNormalizeObjs is set, or simply appended.
// |ObjPos| is only forwarded to IsSameAsPreTextObject().
1241 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj,
1242 const CFX_AffineMatrix& formMatrix,
1243 FX_POSITION ObjPos) {
// NOTE(review): |re| is never read in this function.
1244 CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right,
1245 pTextObj->m_Top);
1246 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
1247 return;
1248 }
1249 int count = m_LineObj.GetSize();
1250 PDFTEXT_Obj Obj;
1251 Obj.m_pTextObj = pTextObj;
1252 Obj.m_formMatrix = formMatrix;
// First object of a line: nothing to compare against.
1253 if (count == 0) {
1254 m_LineObj.Add(Obj);
1255 return;
1256 }
1257 if (IsSameAsPreTextObject(pTextObj, ObjPos)) {
1258 return;
1259 }
// Width of the last character of the previously queued object, mapped
// through its text matrix and form matrix.
1260 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1);
1261 CPDF_TextObjectItem item;
1262 int nItem = prev_Obj.m_pTextObj->CountItems();
1263 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
1264 FX_FLOAT prev_width =
1265 GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->GetFont()) *
1266 prev_Obj.m_pTextObj->GetFontSize() / 1000;
1267 CFX_AffineMatrix prev_matrix;
1268 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1269 prev_width = FXSYS_fabs(prev_width);
1270 prev_matrix.Concat(prev_Obj.m_formMatrix);
1271 prev_width = prev_matrix.TransformDistance(prev_width);
// Width of the first character of the new object, mapped the same way.
1272 pTextObj->GetItemInfo(0, &item);
1273 FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) *
1274 pTextObj->GetFontSize() / 1000;
1275 this_width = FXSYS_fabs(this_width);
1276 CFX_AffineMatrix this_matrix;
1277 pTextObj->GetTextMatrix(&this_matrix);
1278 this_width = FXSYS_fabs(this_width);
1279 this_matrix.Concat(formMatrix);
1280 this_width = this_matrix.TransformDistance(this_width);
// Threshold is a quarter of the larger of the two widths.
1281 FX_FLOAT threshold =
1282 prev_width > this_width ? prev_width / 4 : this_width / 4;
// Positions of both objects after their form matrix and m_DisplayMatrix.
1283 FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(),
1284 prev_y = prev_Obj.m_pTextObj->GetPosY();
1285 prev_Obj.m_formMatrix.Transform(prev_x, prev_y);
1286 m_DisplayMatrix.Transform(prev_x, prev_y);
1287 FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY();
1288 formMatrix.Transform(this_x, this_y);
1289 m_DisplayMatrix.Transform(this_x, this_y);
// Vertical jump larger than 2 * threshold: flush the queued line and start a
// new one with this object.
1290 if (FXSYS_fabs(this_y - prev_y) > threshold * 2) {
1291 for (int i = 0; i < count; i++) {
1292 ProcessTextObject(m_LineObj.GetAt(i));
1293 }
1294 m_LineObj.RemoveAll();
1295 m_LineObj.Add(Obj);
1296 return;
1297 }
1298 int i = 0;
1299 if (m_ParseOptions.m_bNormalizeObjs) {
// Scan the queue from the back for the last object whose X position is not
// greater than this object's, and insert right after it.
1300 for (i = count - 1; i >= 0; i--) {
// These locals shadow the outer prev_Obj / prev_matrix.
1301 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i);
// NOTE(review): this prev_matrix is filled in but not used afterwards.
1302 CFX_AffineMatrix prev_matrix;
1303 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1304 FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(),
1305 Prev_y = prev_Obj.m_pTextObj->GetPosY();
1306 prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y);
1307 m_DisplayMatrix.Transform(Prev_x, Prev_y);
1308 if (this_x >= Prev_x) {
1309 if (i == count - 1) {
1310 m_LineObj.Add(Obj);
1311 } else {
1312 m_LineObj.InsertAt(i + 1, Obj);
1313 }
1314 break;
1315 }
1316 }
// Every queued object lies to the right: this one goes first.
1317 if (i < 0) {
1318 m_LineObj.InsertAt(0, Obj);
1319 }
1320 } else {
1321 m_LineObj.Add(Obj);
1322 }
1323 }
1324 int32_t CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj) {
1325 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1326 CPDF_ContentMarkData* pMarkData =
1327 (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1328 if (!pMarkData) {
1329 return FPDFTEXT_MC_PASS;
1330 }
1331 int nContentMark = pMarkData->CountItems();
1332 if (nContentMark < 1) {
1333 return FPDFTEXT_MC_PASS;
1334 }
1335 CFX_WideString actText;
1336 FX_BOOL bExist = FALSE;
1337 CPDF_Dictionary* pDict = NULL;
1338 int n = 0;
1339 for (n = 0; n < nContentMark; n++) {
1340 CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1341 CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1342 pDict = (CPDF_Dictionary*)item.GetParam();
1343 CPDF_String* temp =
1344 (CPDF_String*)(pDict ? pDict->GetElement(FX_BSTRC("ActualText"))
1345 : NULL);
1346 if (temp) {
1347 bExist = TRUE;
1348 actText = temp->GetUnicodeText();
1349 }
1350 }
1351 if (!bExist) {
1352 return FPDFTEXT_MC_PASS;
1353 }
1354 if (m_pPreTextObj) {
1355 if (CPDF_ContentMarkData* pPreMarkData =
1356 (CPDF_ContentMarkData*)m_pPreTextObj->m_ContentMark.GetObject()) {
1357 if (pPreMarkData->CountItems() == n) {
1358 CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1);
1359 if (pDict == item.GetParam()) {
1360 return FPDFTEXT_MC_DONE;
1361 }
1362 }
1363 }
1364 }
1365 CPDF_Font* pFont = pTextObj->GetFont();
1366 FX_STRSIZE nItems = actText.GetLength();
1367 if (nItems < 1) {
1368 return FPDFTEXT_MC_PASS;
1369 }
1370 bExist = FALSE;
1371 for (FX_STRSIZE i = 0; i < nItems; i++) {
1372 FX_WCHAR wChar = actText.GetAt(i);
1373 if (-1 == pFont->CharCodeFromUnicode(wChar)) {
1374 continue;
1375 } else {
1376 bExist = TRUE;
1377 break;
1378 }
1379 }
1380 if (!bExist) {
1381 return FPDFTEXT_MC_PASS;
1382 }
1383 bExist = FALSE;
1384 for (FX_STRSIZE i = 0; i < nItems; i++) {
1385 FX_WCHAR wChar = actText.GetAt(i);
1386 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar))) {
1387 bExist = TRUE;
1388 break;
1389 }
1390 }
1391 if (!bExist) {
1392 return FPDFTEXT_MC_DONE;
1393 }
1394 return FPDFTEXT_MC_DELAY;
1395 }
1396 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj) {
1397 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1398 CPDF_ContentMarkData* pMarkData =
1399 (CPDF_ContentMarkData*)pTextObj->m_ContentMark.GetObject();
1400 if (!pMarkData) {
1401 return;
1402 }
1403 int nContentMark = pMarkData->CountItems();
1404 if (nContentMark < 1) {
1405 return;
1406 }
1407 CFX_WideString actText;
1408 CPDF_Dictionary* pDict = NULL;
1409 int n = 0;
1410 for (n = 0; n < nContentMark; n++) {
1411 CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1412 CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1413 pDict = (CPDF_Dictionary*)item.GetParam();
1414 CPDF_String* temp =
1415 (CPDF_String*)(pDict ? pDict->GetElement(FX_BSTRC("ActualText"))
1416 : NULL);
1417 if (temp) {
1418 actText = temp->GetUnicodeText();
1419 }
1420 }
1421 FX_STRSIZE nItems = actText.GetLength();
1422 if (nItems < 1) {
1423 return;
1424 }
1425 CPDF_Font* pFont = pTextObj->GetFont();
1426 CFX_AffineMatrix formMatrix = Obj.m_formMatrix;
1427 CFX_AffineMatrix matrix;
1428 pTextObj->GetTextMatrix(&matrix);
1429 matrix.Concat(formMatrix);
1430 FX_FLOAT fPosX = pTextObj->GetPosX();
1431 FX_FLOAT fPosY = pTextObj->GetPosY();
1432 int nCharInfoIndex = m_TextBuf.GetLength();
1433 CFX_FloatRect charBox;
1434 charBox.top = pTextObj->m_Top;
1435 charBox.left = pTextObj->m_Left;
1436 charBox.right = pTextObj->m_Right;
1437 charBox.bottom = pTextObj->m_Bottom;
1438 for (FX_STRSIZE k = 0; k < nItems; k++) {
1439 FX_WCHAR wChar = actText.GetAt(k);
1440 if (wChar <= 0x80 && !isprint(wChar)) {
1441 wChar = 0x20;
1442 }
1443 if (wChar >= 0xFFFD) {
1444 continue;
798 } 1445 }
799 PAGECHAR_INFO charinfo; 1446 PAGECHAR_INFO charinfo;
800 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(index); 1447 charinfo.m_OriginX = fPosX;
801 if (charinfo.m_Index == -1 || charinfo.m_Flag == FPDFTEXT_CHAR_GENERATED) { 1448 charinfo.m_OriginY = fPosY;
802 return index; 1449 charinfo.m_Index = nCharInfoIndex;
803 } 1450 charinfo.m_Unicode = wChar;
804 if (!IsLetter(charinfo.m_Unicode)) { 1451 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
805 return index; 1452 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
806 } 1453 charinfo.m_pTextObj = pTextObj;
807 int breakPos = index; 1454 charinfo.m_CharBox.top = charBox.top;
808 if (direction == FPDFTEXT_LEFT) { 1455 charinfo.m_CharBox.left = charBox.left;
809 while (--breakPos > 0) { 1456 charinfo.m_CharBox.right = charBox.right;
810 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos); 1457 charinfo.m_CharBox.bottom = charBox.bottom;
811 if (!IsLetter(charinfo.m_Unicode)) { 1458 charinfo.m_Matrix.Copy(matrix);
812 return breakPos; 1459 m_TempTextBuf.AppendChar(wChar);
813 } 1460 m_TempCharList.Add(charinfo);
814 } 1461 }
815 } else if (direction == FPDFTEXT_RIGHT) { 1462 }
816 while (++breakPos < m_charList.GetSize()) { 1463 void CPDF_TextPage::FindPreviousTextObject(void) {
817 charinfo = *(PAGECHAR_INFO*)m_charList.GetAt(breakPos); 1464 if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) {
818 if (!IsLetter(charinfo.m_Unicode)) { 1465 return;
819 return breakPos; 1466 }
820 } 1467 PAGECHAR_INFO preChar;
821 } 1468 if (m_TempCharList.GetSize() >= 1) {
822 } 1469 preChar =
823 return breakPos; 1470 *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
824 } 1471 } else {
825 int32_t CPDF_TextPage::FindTextlineFlowDirection() 1472 preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1);
826 { 1473 }
827 if (!m_pPage) { 1474 if (preChar.m_pTextObj) {
828 return -1; 1475 m_pPreTextObj = preChar.m_pTextObj;
829 } 1476 }
830 const int32_t nPageWidth = (int32_t)((CPDF_Page*)m_pPage)->GetPageWidth(); 1477 }
831 const int32_t nPageHeight = (int32_t)((CPDF_Page*)m_pPage)->GetPageHeight(); 1478 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj) {
832 CFX_ByteArray nHorizontalMask; 1479 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
833 if (!nHorizontalMask.SetSize(nPageWidth)) { 1480 if (FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f) {
834 return -1; 1481 return;
835 } 1482 }
836 uint8_t* pDataH = nHorizontalMask.GetData(); 1483 CFX_AffineMatrix formMatrix = Obj.m_formMatrix;
837 CFX_ByteArray nVerticalMask; 1484 CPDF_Font* pFont = pTextObj->GetFont();
838 if (!nVerticalMask.SetSize(nPageHeight)) { 1485 CFX_AffineMatrix matrix;
839 return -1; 1486 pTextObj->GetTextMatrix(&matrix);
840 } 1487 matrix.Concat(formMatrix);
841 uint8_t* pDataV = nVerticalMask.GetData(); 1488 int32_t bPreMKC = PreMarkedContent(Obj);
842 int32_t index = 0; 1489 if (FPDFTEXT_MC_DONE == bPreMKC) {
843 FX_FLOAT fLineHeight = 0.0f;
844 CPDF_PageObject* pPageObj = NULL;
845 FX_POSITION pos = NULL;
846 pos = m_pPage->GetFirstObjectPosition();
847 if(!pos) {
848 return -1;
849 }
850 while(pos) {
851 pPageObj = m_pPage->GetNextObject(pos);
852 if(NULL == pPageObj) {
853 continue;
854 }
855 if(PDFPAGE_TEXT != pPageObj->m_Type) {
856 continue;
857 }
858 int32_t minH = (int32_t)pPageObj->m_Left < 0 ? 0 : (int32_t)pPageObj->m_ Left;
859 int32_t maxH = (int32_t)pPageObj->m_Right > nPageWidth ? nPageWidth : (i nt32_t)pPageObj->m_Right;
860 int32_t minV = (int32_t)pPageObj->m_Bottom < 0 ? 0 : (int32_t)pPageObj-> m_Bottom;
861 int32_t maxV = (int32_t)pPageObj->m_Top > nPageHeight ? nPageHeight : (i nt32_t)pPageObj->m_Top;
862 if (minH >= maxH || minV >= maxV) {
863 continue;
864 }
865 FXSYS_memset(pDataH + minH, 1, maxH - minH);
866 FXSYS_memset(pDataV + minV, 1, maxV - minV);
867 if (fLineHeight <= 0.0f) {
868 fLineHeight = pPageObj->m_Top - pPageObj->m_Bottom;
869 }
870 pPageObj = NULL;
871 }
872 int32_t nStartH = 0;
873 int32_t nEndH = 0;
874 FX_FLOAT nSumH = 0.0f;
875 for (index = 0; index < nPageWidth; index++)
876 if(1 == nHorizontalMask[index]) {
877 break;
878 }
879 nStartH = index;
880 for (index = nPageWidth; index > 0; index--)
881 if(1 == nHorizontalMask[index - 1]) {
882 break;
883 }
884 nEndH = index;
885 for (index = nStartH; index < nEndH; index++) {
886 nSumH += nHorizontalMask[index];
887 }
888 nSumH /= nEndH - nStartH;
889 int32_t nStartV = 0;
890 int32_t nEndV = 0;
891 FX_FLOAT nSumV = 0.0f;
892 for (index = 0; index < nPageHeight; index++)
893 if(1 == nVerticalMask[index]) {
894 break;
895 }
896 nStartV = index;
897 for (index = nPageHeight; index > 0; index--)
898 if(1 == nVerticalMask[index - 1]) {
899 break;
900 }
901 nEndV = index;
902 for (index = nStartV; index < nEndV; index++) {
903 nSumV += nVerticalMask[index];
904 }
905 nSumV /= nEndV - nStartV;
906 if ((nEndV - nStartV) < (int32_t)(2 * fLineHeight)) {
907 return 0;
908 }
909 if ((nEndH - nStartH) < (int32_t)(2 * fLineHeight)) {
910 return 1;
911 }
912 if (nSumH > 0.8f) {
913 return 0;
914 }
915 if (nSumH - nSumV > 0.0f) {
916 return 0;
917 }
918 if (nSumV - nSumH > 0.0f) {
919 return 1;
920 }
921 return -1;
922 }
923 void CPDF_TextPage::ProcessObject()
924 {
925 CPDF_PageObject* pPageObj = NULL;
926 if (!m_pPage) {
927 return;
928 }
929 FX_POSITION pos;
930 pos = m_pPage->GetFirstObjectPosition();
931 if (!pos) {
932 return;
933 }
934 m_TextlineDir = FindTextlineFlowDirection();
935 int nCount = 0;
936 while (pos) {
937 pPageObj = m_pPage->GetNextObject(pos);
938 if(pPageObj) {
939 if(pPageObj->m_Type == PDFPAGE_TEXT) {
940 CFX_AffineMatrix matrix;
941 ProcessTextObject((CPDF_TextObject*)pPageObj, matrix, pos);
942 nCount++;
943 } else if (pPageObj->m_Type == PDFPAGE_FORM) {
944 CFX_AffineMatrix formMatrix(1, 0, 0, 1, 0, 0);
945 ProcessFormObject((CPDF_FormObject*)pPageObj, formMatrix);
946 }
947 }
948 pPageObj = NULL;
949 }
950 int count = m_LineObj.GetSize();
951 for(int i = 0; i < count; i++) {
952 ProcessTextObject(m_LineObj.GetAt(i));
953 }
954 m_LineObj.RemoveAll();
955 CloseTempLine();
956 }
957 void CPDF_TextPage::ProcessFormObject(CPDF_FormObject* pFormObj, const CFX_Affin eMatrix& formMatrix)
958 {
959 CPDF_PageObject* pPageObj = NULL;
960 FX_POSITION pos;
961 if (!pFormObj) {
962 return;
963 }
964 pos = pFormObj->m_pForm->GetFirstObjectPosition();
965 if (!pos) {
966 return;
967 }
968 CFX_AffineMatrix curFormMatrix;
969 curFormMatrix.Copy(pFormObj->m_FormMatrix);
970 curFormMatrix.Concat(formMatrix);
971 while (pos) {
972 pPageObj = pFormObj->m_pForm->GetNextObject(pos);
973 if(pPageObj) {
974 if(pPageObj->m_Type == PDFPAGE_TEXT) {
975 ProcessTextObject((CPDF_TextObject*)pPageObj, curFormMatrix, pos );
976 } else if (pPageObj->m_Type == PDFPAGE_FORM) {
977 ProcessFormObject((CPDF_FormObject*)pPageObj, curFormMatrix);
978 }
979 }
980 pPageObj = NULL;
981 }
982 }
983 int CPDF_TextPage::GetCharWidth(FX_DWORD charCode, CPDF_Font* pFont) const
984 {
985 if(charCode == -1) {
986 return 0;
987 }
988 int w = pFont->GetCharWidthF(charCode);
989 if(w == 0) {
990 CFX_ByteString str;
991 pFont->AppendChar(str, charCode);
992 w = pFont->GetStringWidth(str, 1);
993 if(w == 0) {
994 FX_RECT BBox;
995 pFont->GetCharBBox(charCode, BBox);
996 w = BBox.right - BBox.left;
997 }
998 }
999 return w;
1000 }
1001 void CPDF_TextPage::OnPiece(IFX_BidiChar* pBidi, CFX_WideString& str)
1002 {
1003 int32_t start, count;
1004 int32_t ret = pBidi->GetBidiInfo(start, count);
1005 if(ret == 2) {
1006 for(int i = start + count - 1; i >= start; i--) {
1007 m_TextBuf.AppendChar(str.GetAt(i));
1008 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
1009 }
1010 } else {
1011 int end = start + count ;
1012 for(int i = start; i < end; i++) {
1013 m_TextBuf.AppendChar(str.GetAt(i));
1014 m_charList.Add(*(PAGECHAR_INFO*)m_TempCharList.GetAt(i));
1015 }
1016 }
1017 }
1018 void CPDF_TextPage::AddCharInfoByLRDirection(CFX_WideString& str, int i)
1019 {
1020 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
1021 FX_WCHAR wChar = str.GetAt(i);
1022 if(!IsControlChar(Info)) {
1023 Info.m_Index = m_TextBuf.GetLength();
1024 if (wChar >= 0xFB00 && wChar <= 0xFB06) {
1025 FX_WCHAR* pDst = NULL;
1026 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1027 if (nCount >= 1) {
1028 pDst = FX_Alloc(FX_WCHAR, nCount);
1029 FX_Unicode_GetNormalization(wChar, pDst);
1030 for (int nIndex = 0; nIndex < nCount; nIndex++) {
1031 PAGECHAR_INFO Info2 = Info;
1032 Info2.m_Unicode = pDst[nIndex];
1033 Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1034 m_TextBuf.AppendChar(Info2.m_Unicode);
1035 if( !m_ParseOptions.m_bGetCharCodeOnly) {
1036 m_charList.Add(Info2);
1037 }
1038 }
1039 FX_Free(pDst);
1040 return;
1041 }
1042 }
1043 m_TextBuf.AppendChar(wChar);
1044 } else {
1045 Info.m_Index = -1;
1046 }
1047 if( !m_ParseOptions.m_bGetCharCodeOnly) {
1048 m_charList.Add(Info);
1049 }
1050 }
1051 void CPDF_TextPage::AddCharInfoByRLDirection(CFX_WideString& str, int i)
1052 {
1053 PAGECHAR_INFO Info = *(PAGECHAR_INFO*)m_TempCharList.GetAt(i);
1054 if(!IsControlChar(Info)) {
1055 Info.m_Index = m_TextBuf.GetLength();
1056 FX_WCHAR wChar = FX_GetMirrorChar(str.GetAt(i), TRUE, FALSE);
1057 FX_WCHAR* pDst = NULL;
1058 FX_STRSIZE nCount = FX_Unicode_GetNormalization(wChar, pDst);
1059 if (nCount >= 1) {
1060 pDst = FX_Alloc(FX_WCHAR, nCount);
1061 FX_Unicode_GetNormalization(wChar, pDst);
1062 for (int nIndex = 0; nIndex < nCount; nIndex++) {
1063 PAGECHAR_INFO Info2 = Info;
1064 Info2.m_Unicode = pDst[nIndex];
1065 Info2.m_Flag = FPDFTEXT_CHAR_PIECE;
1066 m_TextBuf.AppendChar(Info2.m_Unicode);
1067 if( !m_ParseOptions.m_bGetCharCodeOnly) {
1068 m_charList.Add(Info2);
1069 }
1070 }
1071 FX_Free(pDst);
1072 return;
1073 }
1074 Info.m_Unicode = wChar;
1075 m_TextBuf.AppendChar(Info.m_Unicode);
1076 } else {
1077 Info.m_Index = -1;
1078 }
1079 if( !m_ParseOptions.m_bGetCharCodeOnly) {
1080 m_charList.Add(Info);
1081 }
1082 }
1083 void CPDF_TextPage::CloseTempLine()
1084 {
1085 int count1 = m_TempCharList.GetSize();
1086 if (count1 <= 0) {
1087 return;
1088 }
1089 nonstd::unique_ptr<IFX_BidiChar> pBidiChar(IFX_BidiChar::Create());
1090 CFX_WideString str = m_TempTextBuf.GetWideString();
1091 CFX_WordArray order;
1092 FX_BOOL bR2L = FALSE;
1093 int32_t start = 0, count = 0;
1094 int nR2L = 0, nL2R = 0;
1095 FX_BOOL bPrevSpace = FALSE;
1096 for (int i = 0; i < str.GetLength(); i++) {
1097 if(str.GetAt(i) == 32) {
1098 if(bPrevSpace) {
1099 m_TempTextBuf.Delete(i, 1);
1100 m_TempCharList.Delete(i);
1101 str.Delete(i);
1102 count1--;
1103 i--;
1104 continue;
1105 }
1106 bPrevSpace = TRUE;
1107 } else {
1108 bPrevSpace = FALSE;
1109 }
1110 if(pBidiChar->AppendChar(str.GetAt(i))) {
1111 int32_t ret = pBidiChar->GetBidiInfo(start, count);
1112 order.Add(start);
1113 order.Add(count);
1114 order.Add(ret);
1115 if(!bR2L) {
1116 if(ret == 2) {
1117 nR2L++;
1118 } else if (ret == 1) {
1119 nL2R++;
1120 }
1121 }
1122 }
1123 }
1124 if(pBidiChar->EndChar()) {
1125 int32_t ret = pBidiChar->GetBidiInfo(start, count);
1126 order.Add(start);
1127 order.Add(count);
1128 order.Add(ret);
1129 if(!bR2L) {
1130 if(ret == 2) {
1131 nR2L++;
1132 } else if(ret == 1) {
1133 nL2R++;
1134 }
1135 }
1136 }
1137 if(nR2L > 0 && nR2L >= nL2R) {
1138 bR2L = TRUE;
1139 }
1140 if (m_parserflag == FPDFTEXT_RLTB || bR2L) {
1141 int count = order.GetSize();
1142 for(int i = count - 1; i > 0; i -= 3) {
1143 int ret = order.GetAt(i);
1144 int start = order.GetAt(i - 2);
1145 int count1 = order.GetAt(i - 1);
1146 if(ret == 2 || ret == 0) {
1147 for(int j = start + count1 - 1; j >= start; j--) {
1148 AddCharInfoByRLDirection(str, j);
1149 }
1150 } else {
1151 int j = i;
1152 FX_BOOL bSymbol = FALSE;
1153 while(j > 0 && order.GetAt(j) != 2) {
1154 bSymbol = !order.GetAt(j);
1155 j -= 3;
1156 }
1157 int end = start + count1 ;
1158 int n = 0;
1159 if(bSymbol) {
1160 n = j + 6;
1161 } else {
1162 n = j + 3;
1163 }
1164 if(n >= i) {
1165 for(int m = start; m < end; m++) {
1166 AddCharInfoByLRDirection(str, m);
1167 }
1168 } else {
1169 j = i;
1170 i = n;
1171 for(; n <= j; n += 3) {
1172 int start = order.GetAt(n - 2);
1173 int count1 = order.GetAt(n - 1);
1174 int end = start + count1 ;
1175 for(int m = start; m < end; m++) {
1176 AddCharInfoByLRDirection(str, m);
1177 }
1178 }
1179 }
1180 }
1181 }
1182 } else {
1183 int count = order.GetSize();
1184 FX_BOOL bL2R = FALSE;
1185 for(int i = 0; i < count; i += 3) {
1186 int ret = order.GetAt(i + 2);
1187 int start = order.GetAt(i);
1188 int count1 = order.GetAt(i + 1);
1189 if(ret == 2 || (i == 0 && ret == 0 && !bL2R)) {
1190 int j = i + 3;
1191 while(bR2L && j < count) {
1192 if(order.GetAt(j + 2) == 1) {
1193 break;
1194 } else {
1195 j += 3;
1196 }
1197 }
1198 if(j == 3) {
1199 i = -3;
1200 bL2R = TRUE;
1201 continue;
1202 }
1203 int end = m_TempCharList.GetSize() - 1;
1204 if(j < count) {
1205 end = order.GetAt(j) - 1;
1206 }
1207 i = j - 3;
1208 for(int n = end; n >= start; n--) {
1209 AddCharInfoByRLDirection(str, n);
1210 }
1211 } else {
1212 int end = start + count1 ;
1213 for(int n = start; n < end; n++) {
1214 AddCharInfoByLRDirection(str, n);
1215 }
1216 }
1217 }
1218 }
1219 order.RemoveAll();
1220 m_TempCharList.RemoveAll();
1221 m_TempTextBuf.Delete(0, m_TempTextBuf.GetLength());
1222 }
1223 void CPDF_TextPage::ProcessTextObject(CPDF_TextObject* pTextObj, const CFX_Affi neMatrix& formMatrix, FX_POSITION ObjPos)
1224 {
1225 CFX_FloatRect re(pTextObj->m_Left, pTextObj->m_Bottom, pTextObj->m_Right, pT extObj->m_Top);
1226 if(FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f ) {
1227 return;
1228 }
1229 int count = m_LineObj.GetSize();
1230 PDFTEXT_Obj Obj;
1231 Obj.m_pTextObj = pTextObj;
1232 Obj.m_formMatrix = formMatrix;
1233 if(count == 0) {
1234 m_LineObj.Add(Obj);
1235 return;
1236 }
1237 if (IsSameAsPreTextObject(pTextObj, ObjPos)) {
1238 return;
1239 }
1240 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(count - 1);
1241 CPDF_TextObjectItem item;
1242 int nItem = prev_Obj.m_pTextObj->CountItems();
1243 prev_Obj.m_pTextObj->GetItemInfo(nItem - 1, &item);
1244 FX_FLOAT prev_width = GetCharWidth(item.m_CharCode, prev_Obj.m_pTextObj->Get Font()) * prev_Obj.m_pTextObj->GetFontSize() / 1000;
1245 CFX_AffineMatrix prev_matrix;
1246 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1247 prev_width = FXSYS_fabs(prev_width);
1248 prev_matrix.Concat(prev_Obj.m_formMatrix);
1249 prev_width = prev_matrix.TransformDistance(prev_width);
1250 pTextObj->GetItemInfo(0, &item);
1251 FX_FLOAT this_width = GetCharWidth(item.m_CharCode, pTextObj->GetFont()) * p TextObj->GetFontSize() / 1000;
1252 this_width = FXSYS_fabs(this_width);
1253 CFX_AffineMatrix this_matrix;
1254 pTextObj->GetTextMatrix(&this_matrix);
1255 this_width = FXSYS_fabs(this_width);
1256 this_matrix.Concat(formMatrix);
1257 this_width = this_matrix.TransformDistance(this_width);
1258 FX_FLOAT threshold = prev_width > this_width ? prev_width / 4 : this_width / 4;
1259 FX_FLOAT prev_x = prev_Obj.m_pTextObj->GetPosX(), prev_y = prev_Obj.m_pTextO bj->GetPosY();
1260 prev_Obj.m_formMatrix.Transform(prev_x, prev_y);
1261 m_DisplayMatrix.Transform(prev_x, prev_y);
1262 FX_FLOAT this_x = pTextObj->GetPosX(), this_y = pTextObj->GetPosY();
1263 formMatrix.Transform(this_x, this_y);
1264 m_DisplayMatrix.Transform(this_x, this_y);
1265 if (FXSYS_fabs(this_y - prev_y) > threshold * 2) {
1266 for(int i = 0; i < count; i++) {
1267 ProcessTextObject(m_LineObj.GetAt(i));
1268 }
1269 m_LineObj.RemoveAll();
1270 m_LineObj.Add(Obj);
1271 return;
1272 }
1273 int i = 0;
1274 if(m_ParseOptions.m_bNormalizeObjs) {
1275 for(i = count - 1; i >= 0; i--) {
1276 PDFTEXT_Obj prev_Obj = m_LineObj.GetAt(i);
1277 CFX_AffineMatrix prev_matrix;
1278 prev_Obj.m_pTextObj->GetTextMatrix(&prev_matrix);
1279 FX_FLOAT Prev_x = prev_Obj.m_pTextObj->GetPosX(), Prev_y = prev_Obj. m_pTextObj->GetPosY();
1280 prev_Obj.m_formMatrix.Transform(Prev_x, Prev_y);
1281 m_DisplayMatrix.Transform(Prev_x, Prev_y);
1282 if(this_x >= Prev_x) {
1283 if(i == count - 1) {
1284 m_LineObj.Add(Obj);
1285 } else {
1286 m_LineObj.InsertAt(i + 1, Obj);
1287 }
1288 break;
1289 }
1290 }
1291 if(i < 0) {
1292 m_LineObj.InsertAt(0, Obj);
1293 }
1294 } else {
1295 m_LineObj.Add(Obj);
1296 }
1297 }
1298 int32_t CPDF_TextPage::PreMarkedContent(PDFTEXT_Obj Obj)
1299 {
1300 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1301 CPDF_ContentMarkData* pMarkData = (CPDF_ContentMarkData*)pTextObj->m_Content Mark.GetObject();
1302 if(!pMarkData) {
1303 return FPDFTEXT_MC_PASS;
1304 }
1305 int nContentMark = pMarkData->CountItems();
1306 if (nContentMark < 1) {
1307 return FPDFTEXT_MC_PASS;
1308 }
1309 CFX_WideString actText;
1310 FX_BOOL bExist = FALSE;
1311 CPDF_Dictionary* pDict = NULL;
1312 int n = 0;
1313 for (n = 0; n < nContentMark; n++) {
1314 CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1315 CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1316 pDict = (CPDF_Dictionary*)item.GetParam();
1317 CPDF_String* temp = (CPDF_String*)(pDict ? pDict->GetElement(FX_BSTRC("A ctualText")) : NULL);
1318 if (temp) {
1319 bExist = TRUE;
1320 actText = temp->GetUnicodeText();
1321 }
1322 }
1323 if (!bExist) {
1324 return FPDFTEXT_MC_PASS;
1325 }
1326 if (m_pPreTextObj) {
1327 if (CPDF_ContentMarkData* pPreMarkData = (CPDF_ContentMarkData*)m_pPreTe xtObj->m_ContentMark.GetObject()) {
1328 if (pPreMarkData->CountItems() == n) {
1329 CPDF_ContentMarkItem& item = pPreMarkData->GetItem(n - 1);
1330 if (pDict == item.GetParam()) {
1331 return FPDFTEXT_MC_DONE;
1332 }
1333 }
1334 }
1335 }
1336 CPDF_Font* pFont = pTextObj->GetFont();
1337 FX_STRSIZE nItems = actText.GetLength();
1338 if (nItems < 1) {
1339 return FPDFTEXT_MC_PASS;
1340 }
1341 bExist = FALSE;
1342 for (FX_STRSIZE i = 0; i < nItems; i++) {
1343 FX_WCHAR wChar = actText.GetAt(i);
1344 if (-1 == pFont->CharCodeFromUnicode(wChar)) {
1345 continue;
1346 } else {
1347 bExist = TRUE;
1348 break;
1349 }
1350 }
1351 if (!bExist) {
1352 return FPDFTEXT_MC_PASS;
1353 }
1354 bExist = FALSE;
1355 for (FX_STRSIZE i = 0; i < nItems; i++) {
1356 FX_WCHAR wChar = actText.GetAt(i);
1357 if ((wChar > 0x80 && wChar < 0xFFFD) || (wChar <= 0x80 && isprint(wChar) )) {
1358 bExist = TRUE;
1359 break;
1360 }
1361 }
1362 if (!bExist) {
1363 return FPDFTEXT_MC_DONE;
1364 }
1365 return FPDFTEXT_MC_DELAY;
1366 }
1367 void CPDF_TextPage::ProcessMarkedContent(PDFTEXT_Obj Obj)
1368 {
1369 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1370 CPDF_ContentMarkData* pMarkData = (CPDF_ContentMarkData*)pTextObj->m_Content Mark.GetObject();
1371 if(!pMarkData) {
1372 return;
1373 }
1374 int nContentMark = pMarkData->CountItems();
1375 if (nContentMark < 1) {
1376 return;
1377 }
1378 CFX_WideString actText;
1379 CPDF_Dictionary* pDict = NULL;
1380 int n = 0;
1381 for (n = 0; n < nContentMark; n++) {
1382 CPDF_ContentMarkItem& item = pMarkData->GetItem(n);
1383 CFX_ByteString tagStr = (CFX_ByteString)item.GetName();
1384 pDict = (CPDF_Dictionary*)item.GetParam();
1385 CPDF_String* temp = (CPDF_String*)(pDict ? pDict->GetElement(FX_BSTRC("A ctualText")) : NULL);
1386 if (temp) {
1387 actText = temp->GetUnicodeText();
1388 }
1389 }
1390 FX_STRSIZE nItems = actText.GetLength();
1391 if (nItems < 1) {
1392 return;
1393 }
1394 CPDF_Font* pFont = pTextObj->GetFont();
1395 CFX_AffineMatrix formMatrix = Obj.m_formMatrix;
1396 CFX_AffineMatrix matrix;
1397 pTextObj->GetTextMatrix(&matrix);
1398 matrix.Concat(formMatrix);
1399 FX_FLOAT fPosX = pTextObj->GetPosX();
1400 FX_FLOAT fPosY = pTextObj->GetPosY();
1401 int nCharInfoIndex = m_TextBuf.GetLength();
1402 CFX_FloatRect charBox;
1403 charBox.top = pTextObj->m_Top;
1404 charBox.left = pTextObj->m_Left;
1405 charBox.right = pTextObj->m_Right;
1406 charBox.bottom = pTextObj->m_Bottom;
1407 for (FX_STRSIZE k = 0; k < nItems; k++) {
1408 FX_WCHAR wChar = actText.GetAt(k);
1409 if (wChar <= 0x80 && !isprint(wChar)) {
1410 wChar = 0x20;
1411 }
1412 if (wChar >= 0xFFFD) {
1413 continue;
1414 }
1415 PAGECHAR_INFO charinfo;
1416 charinfo.m_OriginX = fPosX;
1417 charinfo.m_OriginY = fPosY;
1418 charinfo.m_Index = nCharInfoIndex;
1419 charinfo.m_Unicode = wChar;
1420 charinfo.m_CharCode = pFont->CharCodeFromUnicode(wChar);
1421 charinfo.m_Flag = FPDFTEXT_CHAR_PIECE;
1422 charinfo.m_pTextObj = pTextObj;
1423 charinfo.m_CharBox.top = charBox.top;
1424 charinfo.m_CharBox.left = charBox.left;
1425 charinfo.m_CharBox.right = charBox.right;
1426 charinfo.m_CharBox.bottom = charBox.bottom;
1427 charinfo.m_Matrix.Copy(matrix);
1428 m_TempTextBuf.AppendChar(wChar);
1429 m_TempCharList.Add(charinfo);
1430 }
1431 }
1432 void CPDF_TextPage::FindPreviousTextObject(void)
1433 {
1434 if (m_TempCharList.GetSize() < 1 && m_charList.GetSize() < 1) {
1435 return;
1436 }
1437 PAGECHAR_INFO preChar;
1438 if (m_TempCharList.GetSize() >= 1) {
1439 preChar = *(PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1440 } else {
1441 preChar = *(PAGECHAR_INFO*)m_charList.GetAt(m_charList.GetSize() - 1);
1442 }
1443 if (preChar.m_pTextObj) {
1444 m_pPreTextObj = preChar.m_pTextObj;
1445 }
1446 }
1447 void CPDF_TextPage::ProcessTextObject(PDFTEXT_Obj Obj)
1448 {
1449 CPDF_TextObject* pTextObj = Obj.m_pTextObj;
1450 if(FXSYS_fabs(pTextObj->m_Right - pTextObj->m_Left) < 0.01f ) {
1451 return;
1452 }
1453 CFX_AffineMatrix formMatrix = Obj.m_formMatrix;
1454 CPDF_Font* pFont = pTextObj->GetFont();
1455 CFX_AffineMatrix matrix;
1456 pTextObj->GetTextMatrix(&matrix);
1457 matrix.Concat(formMatrix);
1458 int32_t bPreMKC = PreMarkedContent(Obj);
1459 if (FPDFTEXT_MC_DONE == bPreMKC) {
1460 m_pPreTextObj = pTextObj;
1461 m_perMatrix.Copy(formMatrix);
1462 return;
1463 }
1464 int result = 0;
1465 if (m_pPreTextObj) {
1466 result = ProcessInsertObject(pTextObj, formMatrix);
1467 if (2 == result) {
1468 m_CurlineRect = CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj ->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1469 } else {
1470 m_CurlineRect.Union(CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTex tObj->m_Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top));
1471 }
1472 PAGECHAR_INFO generateChar;
1473 if (result == 1) {
1474 if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) {
1475 if (!formMatrix.IsIdentity()) {
1476 generateChar.m_Matrix.Copy(formMatrix);
1477 }
1478 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1479 m_TempCharList.Add(generateChar);
1480 }
1481 } else if(result == 2) {
1482 CloseTempLine();
1483 if(m_TextBuf.GetSize()) {
1484 if(m_ParseOptions.m_bGetCharCodeOnly) {
1485 m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1486 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1487 } else {
1488 if(GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) {
1489 m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1490 if (!formMatrix.IsIdentity()) {
1491 generateChar.m_Matrix.Copy(formMatrix);
1492 }
1493 m_charList.Add(generateChar);
1494 }
1495 if(GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) {
1496 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1497 if (!formMatrix.IsIdentity()) {
1498 generateChar.m_Matrix.Copy(formMatrix);
1499 }
1500 m_charList.Add(generateChar);
1501 }
1502 }
1503 }
1504 } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) {
1505 int32_t nChars = pTextObj->CountChars();
1506 if (nChars == 1) {
1507 CPDF_TextObjectItem item;
1508 pTextObj->GetCharInfo(0, &item);
1509 CFX_WideString wstrItem = pTextObj->GetFont()->UnicodeFromCharCo de(item.m_CharCode);
1510 if(wstrItem.IsEmpty()) {
1511 wstrItem += (FX_WCHAR)item.m_CharCode;
1512 }
1513 FX_WCHAR curChar = wstrItem.GetAt(0);
1514 if (0x2D == curChar || 0xAD == curChar) {
1515 return;
1516 }
1517 }
1518 while (m_TempTextBuf.GetSize() > 0 && m_TempTextBuf.GetWideString(). GetAt(m_TempTextBuf.GetLength() - 1) == 0x20) {
1519 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1520 m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1521 }
1522 PAGECHAR_INFO* cha = (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempChar List.GetSize() - 1);
1523 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1524 cha->m_Unicode = 0x2;
1525 cha->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1526 m_TempTextBuf.AppendChar(0xfffe);
1527 }
1528 } else {
1529 m_CurlineRect = CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_ Bottom, Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1530 }
1531 if (FPDFTEXT_MC_DELAY == bPreMKC) {
1532 ProcessMarkedContent(Obj);
1533 m_pPreTextObj = pTextObj;
1534 m_perMatrix.Copy(formMatrix);
1535 return;
1536 }
1537 m_pPreTextObj = pTextObj; 1490 m_pPreTextObj = pTextObj;
1538 m_perMatrix.Copy(formMatrix); 1491 m_perMatrix.Copy(formMatrix);
1539 int nItems = pTextObj->CountItems(); 1492 return;
1540 FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix); 1493 }
1494 int result = 0;
1495 if (m_pPreTextObj) {
1496 result = ProcessInsertObject(pTextObj, formMatrix);
1497 if (2 == result) {
1498 m_CurlineRect =
1499 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1500 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1501 } else {
1502 m_CurlineRect.Union(
1503 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1504 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top));
1505 }
1506 PAGECHAR_INFO generateChar;
1507 if (result == 1) {
1508 if (GenerateCharInfo(TEXT_BLANK_CHAR, generateChar)) {
1509 if (!formMatrix.IsIdentity()) {
1510 generateChar.m_Matrix.Copy(formMatrix);
1511 }
1512 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1513 m_TempCharList.Add(generateChar);
1514 }
1515 } else if (result == 2) {
1516 CloseTempLine();
1517 if (m_TextBuf.GetSize()) {
1518 if (m_ParseOptions.m_bGetCharCodeOnly) {
1519 m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1520 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1521 } else {
1522 if (GenerateCharInfo(TEXT_RETURN_CHAR, generateChar)) {
1523 m_TextBuf.AppendChar(TEXT_RETURN_CHAR);
1524 if (!formMatrix.IsIdentity()) {
1525 generateChar.m_Matrix.Copy(formMatrix);
1526 }
1527 m_charList.Add(generateChar);
1528 }
1529 if (GenerateCharInfo(TEXT_LINEFEED_CHAR, generateChar)) {
1530 m_TextBuf.AppendChar(TEXT_LINEFEED_CHAR);
1531 if (!formMatrix.IsIdentity()) {
1532 generateChar.m_Matrix.Copy(formMatrix);
1533 }
1534 m_charList.Add(generateChar);
1535 }
1536 }
1537 }
1538 } else if (result == 3 && !m_ParseOptions.m_bOutputHyphen) {
1539 int32_t nChars = pTextObj->CountChars();
1540 if (nChars == 1) {
1541 CPDF_TextObjectItem item;
1542 pTextObj->GetCharInfo(0, &item);
1543 CFX_WideString wstrItem =
1544 pTextObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1545 if (wstrItem.IsEmpty()) {
1546 wstrItem += (FX_WCHAR)item.m_CharCode;
1547 }
1548 FX_WCHAR curChar = wstrItem.GetAt(0);
1549 if (0x2D == curChar || 0xAD == curChar) {
1550 return;
1551 }
1552 }
1553 while (m_TempTextBuf.GetSize() > 0 &&
1554 m_TempTextBuf.GetWideString().GetAt(m_TempTextBuf.GetLength() -
1555 1) == 0x20) {
1556 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1557 m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1558 }
1559 PAGECHAR_INFO* cha =
1560 (PAGECHAR_INFO*)m_TempCharList.GetAt(m_TempCharList.GetSize() - 1);
1561 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1562 cha->m_Unicode = 0x2;
1563 cha->m_Flag = FPDFTEXT_CHAR_HYPHEN;
1564 m_TempTextBuf.AppendChar(0xfffe);
1565 }
1566 } else {
1567 m_CurlineRect =
1568 CFX_FloatRect(Obj.m_pTextObj->m_Left, Obj.m_pTextObj->m_Bottom,
1569 Obj.m_pTextObj->m_Right, Obj.m_pTextObj->m_Top);
1570 }
1571 if (FPDFTEXT_MC_DELAY == bPreMKC) {
1572 ProcessMarkedContent(Obj);
1573 m_pPreTextObj = pTextObj;
1574 m_perMatrix.Copy(formMatrix);
1575 return;
1576 }
1577 m_pPreTextObj = pTextObj;
1578 m_perMatrix.Copy(formMatrix);
1579 int nItems = pTextObj->CountItems();
1580 FX_FLOAT baseSpace = _CalculateBaseSpace(pTextObj, matrix);
1541 1581
1542 const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems); 1582 const FX_BOOL bR2L = IsRightToLeft(pTextObj, pFont, nItems);
1543 const FX_BOOL bIsBidiAndMirrorInverse = 1583 const FX_BOOL bIsBidiAndMirrorInverse =
1544 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0; 1584 bR2L && (matrix.a * matrix.d - matrix.b * matrix.c) < 0;
1545 int32_t iBufStartAppend = m_TempTextBuf.GetLength(); 1585 int32_t iBufStartAppend = m_TempTextBuf.GetLength();
1546 int32_t iCharListStartAppend = m_TempCharList.GetSize(); 1586 int32_t iCharListStartAppend = m_TempCharList.GetSize();
1547 1587
1548 FX_FLOAT spacing = 0; 1588 FX_FLOAT spacing = 0;
1549 for (int i = 0; i < nItems; i++) { 1589 for (int i = 0; i < nItems; i++) {
1550 CPDF_TextObjectItem item; 1590 CPDF_TextObjectItem item;
1551 PAGECHAR_INFO charinfo; 1591 PAGECHAR_INFO charinfo;
1552 charinfo.m_OriginX = 0; 1592 charinfo.m_OriginX = 0;
1553 charinfo.m_OriginY = 0; 1593 charinfo.m_OriginY = 0;
1554 pTextObj->GetItemInfo(i, &item); 1594 pTextObj->GetItemInfo(i, &item);
1555 if (item.m_CharCode == (FX_DWORD) - 1) { 1595 if (item.m_CharCode == (FX_DWORD)-1) {
1556 CFX_WideString str = m_TempTextBuf.GetWideString(); 1596 CFX_WideString str = m_TempTextBuf.GetWideString();
1557 if(str.IsEmpty()) { 1597 if (str.IsEmpty()) {
1558 str = m_TextBuf.GetWideString(); 1598 str = m_TextBuf.GetWideString();
1559 } 1599 }
1560 if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CH AR) { 1600 if (str.IsEmpty() || str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1561 continue; 1601 continue;
1562 } 1602 }
1563 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1603 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1564 spacing = -fontsize_h * item.m_OriginX / 1000; 1604 spacing = -fontsize_h * item.m_OriginX / 1000;
1565 continue; 1605 continue;
1566 } 1606 }
1567 FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace; 1607 FX_FLOAT charSpace = pTextObj->m_TextState.GetObject()->m_CharSpace;
1568 if (charSpace > 0.001) { 1608 if (charSpace > 0.001) {
1569 spacing += matrix.TransformDistance(charSpace); 1609 spacing += matrix.TransformDistance(charSpace);
1570 } else if(charSpace < -0.001) { 1610 } else if (charSpace < -0.001) {
1571 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace)); 1611 spacing -= matrix.TransformDistance(FXSYS_fabs(charSpace));
1572 } 1612 }
1573 spacing -= baseSpace; 1613 spacing -= baseSpace;
1574 if (spacing && i > 0) { 1614 if (spacing && i > 0) {
1575 int last_width = 0; 1615 int last_width = 0;
1576 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH(); 1616 FX_FLOAT fontsize_h = pTextObj->m_TextState.GetFontSizeH();
1577 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' '); 1617 FX_DWORD space_charcode = pFont->CharCodeFromUnicode(' ');
1578 FX_FLOAT threshold = 0; 1618 FX_FLOAT threshold = 0;
1579 if (space_charcode != -1) { 1619 if (space_charcode != -1) {
1580 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000 ; 1620 threshold = fontsize_h * pFont->GetCharWidthF(space_charcode) / 1000;
1581 } 1621 }
1582 if (threshold > fontsize_h / 3) { 1622 if (threshold > fontsize_h / 3) {
1583 threshold = 0; 1623 threshold = 0;
1584 } else { 1624 } else {
1585 threshold /= 2; 1625 threshold /= 2;
1586 } 1626 }
1587 if (threshold == 0) { 1627 if (threshold == 0) {
1588 threshold = fontsize_h; 1628 threshold = fontsize_h;
1589 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont)) ; 1629 int this_width = FXSYS_abs(GetCharWidth(item.m_CharCode, pFont));
1590 threshold = this_width > last_width ? (FX_FLOAT)this_width : (FX _FLOAT)last_width; 1630 threshold = this_width > last_width ? (FX_FLOAT)this_width
1591 threshold = _NormalizeThreshold(threshold); 1631 : (FX_FLOAT)last_width;
1592 threshold = fontsize_h * threshold / 1000; 1632 threshold = _NormalizeThreshold(threshold);
1593 } 1633 threshold = fontsize_h * threshold / 1000;
1594 if (threshold && (spacing && spacing >= threshold) ) { 1634 }
1595 charinfo.m_Unicode = TEXT_BLANK_CHAR; 1635 if (threshold && (spacing && spacing >= threshold)) {
1596 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED; 1636 charinfo.m_Unicode = TEXT_BLANK_CHAR;
1597 charinfo.m_pTextObj = pTextObj; 1637 charinfo.m_Flag = FPDFTEXT_CHAR_GENERATED;
1598 charinfo.m_Index = m_TextBuf.GetLength();
1599 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1600 charinfo.m_CharCode = -1;
1601 charinfo.m_Matrix.Copy(formMatrix);
1602 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_Orig inX, charinfo.m_OriginY);
1603 charinfo.m_CharBox = CFX_FloatRect(charinfo.m_OriginX, charinfo. m_OriginY, charinfo.m_OriginX, charinfo.m_OriginY);
1604 m_TempCharList.Add(charinfo);
1605 }
1606 if (item.m_CharCode == (FX_DWORD) - 1) {
1607 continue;
1608 }
1609 }
1610 spacing = 0;
1611 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1612 FX_BOOL bNoUnicode = FALSE;
1613 FX_WCHAR wChar = wstrItem.GetAt(0);
1614 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1615 if(wstrItem.IsEmpty()) {
1616 wstrItem += (FX_WCHAR)item.m_CharCode;
1617 } else {
1618 wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode);
1619 }
1620 bNoUnicode = TRUE;
1621 }
1622 charinfo.m_Index = -1;
1623 charinfo.m_CharCode = item.m_CharCode;
1624 if(bNoUnicode) {
1625 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1626 } else {
1627 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1628 }
1629 charinfo.m_pTextObj = pTextObj; 1638 charinfo.m_pTextObj = pTextObj;
1630 charinfo.m_OriginX = 0, charinfo.m_OriginY = 0; 1639 charinfo.m_Index = m_TextBuf.GetLength();
1631 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX, cha rinfo.m_OriginY); 1640 m_TempTextBuf.AppendChar(TEXT_BLANK_CHAR);
1632 FX_RECT rect(0, 0, 0, 0); 1641 charinfo.m_CharCode = -1;
1633 rect.Intersect(0, 0, 0, 0); 1642 charinfo.m_Matrix.Copy(formMatrix);
1634 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect); 1643 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
1635 charinfo.m_CharBox.top = rect.top * pTextObj->GetFontSize() / 1000 + ite m.m_OriginY; 1644 charinfo.m_OriginY);
1636 charinfo.m_CharBox.left = rect.left * pTextObj->GetFontSize() / 1000 + i tem.m_OriginX; 1645 charinfo.m_CharBox =
1637 charinfo.m_CharBox.right = rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX; 1646 CFX_FloatRect(charinfo.m_OriginX, charinfo.m_OriginY,
1638 charinfo.m_CharBox.bottom = rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY; 1647 charinfo.m_OriginX, charinfo.m_OriginY);
1639 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) { 1648 m_TempCharList.Add(charinfo);
1640 charinfo.m_CharBox.top = charinfo.m_CharBox.bottom + pTextObj->GetFo ntSize(); 1649 }
1641 } 1650 if (item.m_CharCode == (FX_DWORD)-1) {
1642 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) { 1651 continue;
1643 charinfo.m_CharBox.right = charinfo.m_CharBox.left + pTextObj->GetCh arWidth(charinfo.m_CharCode); 1652 }
1644 } 1653 }
1645 matrix.TransformRect(charinfo.m_CharBox); 1654 spacing = 0;
1646 charinfo.m_Matrix.Copy(matrix); 1655 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1647 if (wstrItem.IsEmpty()) { 1656 FX_BOOL bNoUnicode = FALSE;
1648 charinfo.m_Unicode = 0; 1657 FX_WCHAR wChar = wstrItem.GetAt(0);
1649 m_TempCharList.Add(charinfo); 1658 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1659 if (wstrItem.IsEmpty()) {
1660 wstrItem += (FX_WCHAR)item.m_CharCode;
1661 } else {
1662 wstrItem.SetAt(0, (FX_WCHAR)item.m_CharCode);
1663 }
1664 bNoUnicode = TRUE;
1665 }
1666 charinfo.m_Index = -1;
1667 charinfo.m_CharCode = item.m_CharCode;
1668 if (bNoUnicode) {
1669 charinfo.m_Flag = FPDFTEXT_CHAR_UNUNICODE;
1670 } else {
1671 charinfo.m_Flag = FPDFTEXT_CHAR_NORMAL;
1672 }
1673 charinfo.m_pTextObj = pTextObj;
1674 charinfo.m_OriginX = 0, charinfo.m_OriginY = 0;
1675 matrix.Transform(item.m_OriginX, item.m_OriginY, charinfo.m_OriginX,
1676 charinfo.m_OriginY);
1677 FX_RECT rect(0, 0, 0, 0);
1678 rect.Intersect(0, 0, 0, 0);
1679 charinfo.m_pTextObj->GetFont()->GetCharBBox(charinfo.m_CharCode, rect);
1680 charinfo.m_CharBox.top =
1681 rect.top * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1682 charinfo.m_CharBox.left =
1683 rect.left * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1684 charinfo.m_CharBox.right =
1685 rect.right * pTextObj->GetFontSize() / 1000 + item.m_OriginX;
1686 charinfo.m_CharBox.bottom =
1687 rect.bottom * pTextObj->GetFontSize() / 1000 + item.m_OriginY;
1688 if (fabsf(charinfo.m_CharBox.top - charinfo.m_CharBox.bottom) < 0.01f) {
1689 charinfo.m_CharBox.top =
1690 charinfo.m_CharBox.bottom + pTextObj->GetFontSize();
1691 }
1692 if (fabsf(charinfo.m_CharBox.right - charinfo.m_CharBox.left) < 0.01f) {
1693 charinfo.m_CharBox.right =
1694 charinfo.m_CharBox.left + pTextObj->GetCharWidth(charinfo.m_CharCode);
1695 }
1696 matrix.TransformRect(charinfo.m_CharBox);
1697 charinfo.m_Matrix.Copy(matrix);
1698 if (wstrItem.IsEmpty()) {
1699 charinfo.m_Unicode = 0;
1700 m_TempCharList.Add(charinfo);
1701 m_TempTextBuf.AppendChar(0xfffe);
1702 continue;
1703 } else {
1704 int nTotal = wstrItem.GetLength();
1705 FX_BOOL bDel = FALSE;
1706 const int count = std::min(m_TempCharList.GetSize(), 7);
1707 FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance(
1708 (FX_FLOAT)TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize());
1709 for (int n = m_TempCharList.GetSize();
1710 n > m_TempCharList.GetSize() - count; n--) {
1711 PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt(n - 1);
1712 if (charinfo1->m_CharCode == charinfo.m_CharCode &&
1713 charinfo1->m_pTextObj->GetFont() ==
1714 charinfo.m_pTextObj->GetFont() &&
1715 FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < threshold &&
1716 FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < threshold) {
1717 bDel = TRUE;
1718 break;
1719 }
1720 }
1721 if (!bDel) {
1722 for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1723 charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1724 if (charinfo.m_Unicode) {
1725 charinfo.m_Index = m_TextBuf.GetLength();
1726 m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1727 } else {
1650 m_TempTextBuf.AppendChar(0xfffe); 1728 m_TempTextBuf.AppendChar(0xfffe);
1651 continue; 1729 }
1652 } else { 1730 m_TempCharList.Add(charinfo);
1653 int nTotal = wstrItem.GetLength(); 1731 }
1654 FX_BOOL bDel = FALSE; 1732 } else if (i == 0) {
1655 const int count = std::min(m_TempCharList.GetSize(), 7); 1733 CFX_WideString str = m_TempTextBuf.GetWideString();
1656 FX_FLOAT threshold = charinfo.m_Matrix.TransformXDistance((FX_FLOAT) TEXT_CHARRATIO_GAPDELTA * pTextObj->GetFontSize()); 1734 if (!str.IsEmpty() &&
1657 for (int n = m_TempCharList.GetSize(); 1735 str.GetAt(str.GetLength() - 1) == TEXT_BLANK_CHAR) {
1658 n > m_TempCharList.GetSize() - count; 1736 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1659 n--) { 1737 m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1660 PAGECHAR_INFO* charinfo1 = (PAGECHAR_INFO*)m_TempCharList.GetAt( n - 1); 1738 }
1661 if(charinfo1->m_CharCode == charinfo.m_CharCode && 1739 }
1662 charinfo1->m_pTextObj->GetFont() == charinfo.m_pTextObj- >GetFont() && 1740 }
1663 FXSYS_fabs(charinfo1->m_OriginX - charinfo.m_OriginX) < threshold && 1741 }
1664 FXSYS_fabs(charinfo1->m_OriginY - charinfo.m_OriginY) < threshold) { 1742 if (bIsBidiAndMirrorInverse) {
1665 bDel = TRUE; 1743 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1666 break; 1744 }
1667 }
1668 }
1669 if(!bDel) {
1670 for (int nIndex = 0; nIndex < nTotal; nIndex++) {
1671 charinfo.m_Unicode = wstrItem.GetAt(nIndex);
1672 if (charinfo.m_Unicode) {
1673 charinfo.m_Index = m_TextBuf.GetLength();
1674 m_TempTextBuf.AppendChar(charinfo.m_Unicode);
1675 } else {
1676 m_TempTextBuf.AppendChar(0xfffe);
1677 }
1678 m_TempCharList.Add(charinfo);
1679 }
1680 } else if(i == 0) {
1681 CFX_WideString str = m_TempTextBuf.GetWideString();
1682 if (!str.IsEmpty() && str.GetAt(str.GetLength() - 1) == TEXT_BLA NK_CHAR) {
1683 m_TempTextBuf.Delete(m_TempTextBuf.GetLength() - 1, 1);
1684 m_TempCharList.Delete(m_TempCharList.GetSize() - 1);
1685 }
1686 }
1687 }
1688 }
1689 if (bIsBidiAndMirrorInverse) {
1690 SwapTempTextBuf(iCharListStartAppend, iBufStartAppend);
1691 }
1692 } 1745 }
1693 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend, 1746 void CPDF_TextPage::SwapTempTextBuf(int32_t iCharListStartAppend,
1694 int32_t iBufStartAppend) 1747 int32_t iBufStartAppend) {
1695 { 1748 int32_t i, j;
1696 int32_t i, j; 1749 i = iCharListStartAppend;
1697 i = iCharListStartAppend; 1750 j = m_TempCharList.GetSize() - 1;
1698 j = m_TempCharList.GetSize() - 1; 1751 for (; i < j; i++, j--) {
1699 for (; i < j; i++, j--) { 1752 std::swap(m_TempCharList[i], m_TempCharList[j]);
1700 std::swap(m_TempCharList[i], m_TempCharList[j]); 1753 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index);
1701 std::swap(m_TempCharList[i].m_Index, m_TempCharList[j].m_Index); 1754 }
1702 } 1755 FX_WCHAR* pTempBuffer = m_TempTextBuf.GetBuffer();
1703 FX_WCHAR * pTempBuffer = m_TempTextBuf.GetBuffer(); 1756 i = iBufStartAppend;
1704 i = iBufStartAppend; 1757 j = m_TempTextBuf.GetLength() - 1;
1705 j = m_TempTextBuf.GetLength() - 1; 1758 for (; i < j; i++, j--) {
1706 for (; i < j; i++, j--) { 1759 std::swap(pTempBuffer[i], pTempBuffer[j]);
1707 std::swap(pTempBuffer[i], pTempBuffer[j]); 1760 }
1708 }
1709 } 1761 }
1710 FX_BOOL CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj, 1762 FX_BOOL CPDF_TextPage::IsRightToLeft(const CPDF_TextObject* pTextObj,
1711 const CPDF_Font* pFont, 1763 const CPDF_Font* pFont,
1712 int nItems) const 1764 int nItems) const {
1713 { 1765 nonstd::unique_ptr<IFX_BidiChar> pBidiChar(IFX_BidiChar::Create());
1714 nonstd::unique_ptr<IFX_BidiChar> pBidiChar(IFX_BidiChar::Create()); 1766 int32_t nR2L = 0;
1715 int32_t nR2L = 0; 1767 int32_t nL2R = 0;
1716 int32_t nL2R = 0; 1768 int32_t start = 0, count = 0;
1717 int32_t start = 0, count = 0; 1769 CPDF_TextObjectItem item;
1718 CPDF_TextObjectItem item; 1770 for (int32_t i = 0; i < nItems; i++) {
1719 for (int32_t i = 0; i < nItems; i++) { 1771 pTextObj->GetItemInfo(i, &item);
1720 pTextObj->GetItemInfo(i, &item); 1772 if (item.m_CharCode == (FX_DWORD)-1) {
1721 if (item.m_CharCode == (FX_DWORD)-1) { 1773 continue;
1722 continue; 1774 }
1723 } 1775 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode);
1724 CFX_WideString wstrItem = pFont->UnicodeFromCharCode(item.m_CharCode); 1776 FX_WCHAR wChar = wstrItem.GetAt(0);
1725 FX_WCHAR wChar = wstrItem.GetAt(0); 1777 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) {
1726 if ((wstrItem.IsEmpty() || wChar == 0) && item.m_CharCode) { 1778 wChar = (FX_WCHAR)item.m_CharCode;
1727 wChar = (FX_WCHAR)item.m_CharCode; 1779 }
1728 } 1780 if (!wChar) {
1729 if (!wChar) { 1781 continue;
1730 continue; 1782 }
1731 } 1783 if (pBidiChar->AppendChar(wChar)) {
1732 if (pBidiChar->AppendChar(wChar)) { 1784 int32_t ret = pBidiChar->GetBidiInfo(start, count);
1733 int32_t ret = pBidiChar->GetBidiInfo(start, count); 1785 if (ret == 2) {
1734 if (ret == 2) { 1786 nR2L++;
1735 nR2L++; 1787 } else if (ret == 1) {
1736 } 1788 nL2R++;
1737 else if (ret == 1) { 1789 }
1738 nL2R++; 1790 }
1739 } 1791 }
1740 } 1792 if (pBidiChar->EndChar()) {
1741 } 1793 int32_t ret = pBidiChar->GetBidiInfo(start, count);
1742 if (pBidiChar->EndChar()) { 1794 if (ret == 2) {
1743 int32_t ret = pBidiChar->GetBidiInfo(start, count); 1795 nR2L++;
1744 if (ret == 2) { 1796 } else if (ret == 1) {
1745 nR2L++; 1797 nL2R++;
1746 } 1798 }
1747 else if (ret == 1) { 1799 }
1748 nL2R++; 1800 return (nR2L > 0 && nR2L >= nL2R);
1749 } 1801 }
1750 } 1802 int32_t CPDF_TextPage::GetTextObjectWritingMode(
1751 return (nR2L > 0 && nR2L >= nL2R); 1803 const CPDF_TextObject* pTextObj) {
1752 } 1804 int32_t nChars = pTextObj->CountChars();
1753 int32_t CPDF_TextPage::GetTextObjectWritingMode(const CPDF_TextObject* pTextObj) 1805 if (nChars == 1) {
1754 {
1755 int32_t nChars = pTextObj->CountChars();
1756 if (nChars == 1) {
1757 return m_TextlineDir;
1758 }
1759 CPDF_TextObjectItem first, last;
1760 pTextObj->GetCharInfo(0, &first);
1761 pTextObj->GetCharInfo(nChars - 1, &last);
1762 CFX_Matrix textMatrix;
1763 pTextObj->GetTextMatrix(&textMatrix);
1764 textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY);
1765 textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY);
1766 FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX);
1767 FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY);
1768 if (dX <= 0.0001f && dY <= 0.0001f) {
1769 return -1;
1770 }
1771 CFX_VectorF v;
1772 v.Set(dX, dY);
1773 v.Normalize();
1774 if (v.y <= 0.0872f) {
1775 return v.x <= 0.0872f ? m_TextlineDir : 0;
1776 }
1777 if (v.x <= 0.0872f) {
1778 return 1;
1779 }
1780 return m_TextlineDir; 1806 return m_TextlineDir;
1781 } 1807 }
1782 FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) 1808 CPDF_TextObjectItem first, last;
1783 { 1809 pTextObj->GetCharInfo(0, &first);
1784 CFX_WideString strCurText = m_TempTextBuf.GetWideString(); 1810 pTextObj->GetCharInfo(nChars - 1, &last);
1785 if(strCurText.GetLength() == 0) { 1811 CFX_Matrix textMatrix;
1786 strCurText = m_TextBuf.GetWideString(); 1812 pTextObj->GetTextMatrix(&textMatrix);
1787 } 1813 textMatrix.TransformPoint(first.m_OriginX, first.m_OriginY);
1788 FX_STRSIZE nCount = strCurText.GetLength(); 1814 textMatrix.TransformPoint(last.m_OriginX, last.m_OriginY);
1789 int nIndex = nCount - 1; 1815 FX_FLOAT dX = FXSYS_fabs(last.m_OriginX - first.m_OriginX);
1790 FX_WCHAR wcTmp = strCurText.GetAt(nIndex); 1816 FX_FLOAT dY = FXSYS_fabs(last.m_OriginY - first.m_OriginY);
1791 while(wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) { 1817 if (dX <= 0.0001f && dY <= 0.0001f) {
1792 wcTmp = strCurText.GetAt(--nIndex); 1818 return -1;
1793 } 1819 }
1794 if (0x2D == wcTmp || 0xAD == wcTmp) { 1820 CFX_VectorF v;
1795 if (--nIndex > 0) { 1821 v.Set(dX, dY);
1796 FX_WCHAR preChar = strCurText.GetAt((nIndex)); 1822 v.Normalize();
1797 if (((preChar >= L'A' && preChar <= L'Z') || (preChar >= L'a' && pre Char <= L'z')) 1823 if (v.y <= 0.0872f) {
1798 && ((curChar >= L'A' && curChar <= L'Z') || (curChar >= L'a' && curChar <= L'z'))) { 1824 return v.x <= 0.0872f ? m_TextlineDir : 0;
1799 return TRUE; 1825 }
1800 } 1826 if (v.x <= 0.0872f) {
1801 } 1827 return 1;
1802 int size = m_TempCharList.GetSize(); 1828 }
1803 PAGECHAR_INFO preChar; 1829 return m_TextlineDir;
1804 if (size) { 1830 }
1805 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1]; 1831 FX_BOOL CPDF_TextPage::IsHyphen(FX_WCHAR curChar) {
1806 } else { 1832 CFX_WideString strCurText = m_TempTextBuf.GetWideString();
1807 size = m_charList.GetSize(); 1833 if (strCurText.GetLength() == 0) {
1808 if(size == 0) { 1834 strCurText = m_TextBuf.GetWideString();
1809 return FALSE; 1835 }
1810 } 1836 FX_STRSIZE nCount = strCurText.GetLength();
1811 preChar = (PAGECHAR_INFO)m_charList[size - 1]; 1837 int nIndex = nCount - 1;
1812 } 1838 FX_WCHAR wcTmp = strCurText.GetAt(nIndex);
1813 if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag) 1839 while (wcTmp == 0x20 && nIndex <= nCount - 1 && nIndex >= 0) {
1814 if (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode) { 1840 wcTmp = strCurText.GetAt(--nIndex);
1815 return TRUE; 1841 }
1816 } 1842 if (0x2D == wcTmp || 0xAD == wcTmp) {
1817 } 1843 if (--nIndex > 0) {
1818 return FALSE; 1844 FX_WCHAR preChar = strCurText.GetAt((nIndex));
1819 } 1845 if (((preChar >= L'A' && preChar <= L'Z') ||
1820 int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj, const CFX_Af fineMatrix& formMatrix) 1846 (preChar >= L'a' && preChar <= L'z')) &&
1821 { 1847 ((curChar >= L'A' && curChar <= L'Z') ||
1822 FindPreviousTextObject(); 1848 (curChar >= L'a' && curChar <= L'z'))) {
1823 FX_BOOL bNewline = FALSE; 1849 return TRUE;
1824 int WritingMode = GetTextObjectWritingMode(pObj); 1850 }
1825 if(WritingMode == -1) { 1851 }
1826 WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
1827 }
1828 CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right, pObj->m _Top);
1829 CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, m_pP reTextObj->m_Right, m_pPreTextObj->m_Top);
1830 CPDF_TextObjectItem PrevItem, item;
1831 int nItem = m_pPreTextObj->CountItems();
1832 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1833 pObj->GetItemInfo(0, &item);
1834 CFX_WideString wstrItem = pObj->GetFont()->UnicodeFromCharCode(item.m_CharCo de);
1835 if(wstrItem.IsEmpty()) {
1836 wstrItem += (FX_WCHAR)item.m_CharCode;
1837 }
1838 FX_WCHAR curChar = wstrItem.GetAt(0);
1839 if(WritingMode == 0) {
1840 if(this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1841 FX_FLOAT top = this_rect.top < prev_rect.top ? this_rect.top : prev_ rect.top;
1842 FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bo ttom : prev_rect.bottom;
1843 if(bottom >= top) {
1844 if(IsHyphen(curChar)) {
1845 return 3;
1846 }
1847 return 2;
1848 }
1849 }
1850 } else if (WritingMode == 1) {
1851 if(this_rect.Width() > pObj->GetFontSize() * 0.1f && prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1852 FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left : m_CurlineRect.left;
1853 FX_FLOAT right = this_rect.right < m_CurlineRect.right ? this_rect.r ight : m_CurlineRect.right;
1854 if(right <= left) {
1855 if(IsHyphen(curChar)) {
1856 return 3;
1857 }
1858 return 2;
1859 }
1860 }
1861 }
1862 FX_FLOAT last_pos = PrevItem.m_OriginX;
1863 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont()) ;
1864 FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1865 last_width = FXSYS_fabs(last_width);
1866 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1867 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
1868 this_width = FXSYS_fabs(this_width);
1869 FX_FLOAT threshold = last_width > this_width ? last_width / 4 : this_width / 4;
1870 CFX_AffineMatrix prev_matrix, prev_reverse;
1871 m_pPreTextObj->GetTextMatrix(&prev_matrix);
1872 prev_matrix.Concat(m_perMatrix);
1873 prev_reverse.SetReverse(prev_matrix);
1874 FX_FLOAT x = pObj->GetPosX();
1875 FX_FLOAT y = pObj->GetPosY();
1876 formMatrix.Transform(x, y);
1877 prev_reverse.Transform(x, y);
1878 if(last_width < this_width) {
1879 threshold = prev_reverse.TransformDistance(threshold);
1880 }
1881 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom, m_pPreTextObj->m_ Right, pObj->m_Top);
1882 CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom, m_pPreTe xtObj->m_Right, m_pPreTextObj->m_Top);
1883 CFX_FloatRect rect3 = rect1;
1884 rect1.Intersect(rect2);
1885 if (WritingMode == 0) {
1886 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5)
1887 || ((y > threshold * 2 || y < threshold * -3) && (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) {
1888 bNewline = TRUE;
1889 if(nItem > 1 ) {
1890 CPDF_TextObjectItem tempItem;
1891 m_pPreTextObj->GetItemInfo(0, &tempItem);
1892 CFX_AffineMatrix m;
1893 m_pPreTextObj->GetTextMatrix(&m);
1894 if(PrevItem.m_OriginX > tempItem.m_OriginX &&
1895 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1896 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9
1897 && m.b < 0.1 && m.c < 0.1 ) {
1898 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000, m_pPreTex tObj->m_Top);
1899 if(re.Contains(pObj->GetPosX(), pObj->GetPosY())) {
1900 bNewline = FALSE;
1901 } else {
1902 CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top);
1903 if(re.Contains(m_pPreTextObj->GetPosX(), m_pPreTextObj-> GetPosY())) {
1904 bNewline = FALSE;
1905 }
1906 }
1907 }
1908 }
1909 }
1910 }
1911 if(bNewline) {
1912 if(IsHyphen(curChar)) {
1913 return 3;
1914 }
1915 return 2;
1916 }
1917 int32_t nChars = pObj->CountChars();
1918 if (nChars == 1 && ( 0x2D == curChar || 0xAD == curChar))
1919 if (IsHyphen(curChar)) {
1920 return 3;
1921 }
1922 CFX_WideString PrevStr = m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevI tem.m_CharCode);
1923 FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
1924 CFX_AffineMatrix matrix;
1925 pObj->GetTextMatrix(&matrix);
1926 matrix.Concat(formMatrix);
1927 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
1928 threshold = threshold > 400 ? (threshold < 700 ? threshold / 4 : (threshold > 800 ? threshold / 6 : threshold / 5)) : (threshold / 2);
1929 if(nLastWidth >= nThisWidth) {
1930 threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
1931 } else {
1932 threshold *= FXSYS_fabs(pObj->GetFontSize());
1933 threshold = matrix.TransformDistance(threshold);
1934 threshold = prev_reverse.TransformDistance(threshold);
1935 }
1936 threshold /= 1000;
1937 if((threshold < 1.4881 && threshold > 1.4879)
1938 || (threshold < 1.39001 && threshold > 1.38999)) {
1939 threshold *= 1.5;
1940 }
1941 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' && preChar != L' ')
1942 if (curChar != L' ' && preChar != L' ') {
1943 if((x - last_pos - last_width) > threshold || (last_pos - x - last_w idth) > threshold) {
1944 return 1;
1945 }
1946 if(x < 0 && (last_pos - x - last_width) > threshold) {
1947 return 1;
1948 }
1949 if((x - last_pos - last_width) > this_width || (x - last_pos - this_ width) > last_width ) {
1950 return 1;
1951 }
1952 }
1953 return 0;
1954 }
1955 FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1, CPDF_TextObj ect* pTextObj2)
1956 {
1957 if (!pTextObj1 || !pTextObj2) {
1958 return FALSE;
1959 }
1960 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom, pTextObj2->m_ Right, pTextObj2->m_Top);
1961 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom, pTextObj1->m_ Right, pTextObj1->m_Top);
1962 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() && !m_ParseOptions.m_bGetCharCo deOnly) {
1963 FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
1964 int nCount = m_charList.GetSize();
1965 if (nCount >= 2) {
1966 PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2];
1967 FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
1968 if (dbXdif > dbSpace) {
1969 return FALSE;
1970 }
1971 }
1972 }
1973 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
1974 rcPreObj.Intersect(rcCurObj);
1975 if (rcPreObj.IsEmpty()) {
1976 return FALSE;
1977 }
1978 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) > rcCurObj.Width() / 2) {
1979 return FALSE;
1980 }
1981 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
1982 return FALSE;
1983 }
1984 }
1985 int nPreCount = pTextObj2->CountItems();
1986 int nCurCount = pTextObj1->CountItems();
1987 if (nPreCount != nCurCount) {
1988 return FALSE;
1989 }
1990 CPDF_TextObjectItem itemPer, itemCur;
1991 for (int i = 0; i < nPreCount; i++) {
1992 pTextObj2->GetItemInfo(i, &itemPer);
1993 pTextObj1->GetItemInfo(i, &itemCur);
1994 if (itemCur.m_CharCode != itemPer.m_CharCode) {
1995 return FALSE;
1996 }
1997 }
1998 if(FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) > GetCharWidth(it emPer.m_CharCode, pTextObj2->GetFont())*pTextObj2->GetFontSize() / 1000 * 0.9 ||
1999 FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) >
2000 FX_MAX(FX_MAX(rcPreObj.Height() , rcPreObj.Width()), pTextObj2->GetF ontSize()) / 8) {
2001 return FALSE;
2002 }
2003 return TRUE;
2004 }
2005 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj, FX_POSIT ION ObjPos)
2006 {
2007 if (!pTextObj) {
2008 return FALSE;
2009 }
2010 int i = 0;
2011 if (!ObjPos) {
2012 ObjPos = m_pPage->GetLastObjectPosition();
2013 }
2014 CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos);
2015 while (i < 5 && ObjPos) {
2016 pObj = m_pPage->GetPrevObject(ObjPos);
2017 if(pObj == pTextObj) {
2018 continue;
2019 }
2020 if(pObj->m_Type != PDFPAGE_TEXT) {
2021 continue;
2022 }
2023 if(IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) {
2024 return TRUE;
2025 }
2026 i++;
2027 }
2028 return FALSE;
2029 }
2030 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info)
2031 {
2032 int size = m_TempCharList.GetSize(); 1852 int size = m_TempCharList.GetSize();
2033 PAGECHAR_INFO preChar; 1853 PAGECHAR_INFO preChar;
2034 if (size) { 1854 if (size) {
2035 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1]; 1855 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
2036 } else { 1856 } else {
2037 size = m_charList.GetSize(); 1857 size = m_charList.GetSize();
2038 if(size == 0) { 1858 if (size == 0) {
2039 return FALSE;
2040 }
2041 preChar = (PAGECHAR_INFO)m_charList[size - 1];
2042 }
2043 info.m_Index = m_TextBuf.GetLength();
2044 info.m_Unicode = unicode;
2045 info.m_pTextObj = NULL;
2046 info.m_CharCode = -1;
2047 info.m_Flag = FPDFTEXT_CHAR_GENERATED;
2048 int preWidth = 0;
2049 if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD) - 1) {
2050 preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont( ));
2051 }
2052 FX_FLOAT fs = 0;
2053 if(preChar.m_pTextObj) {
2054 fs = preChar.m_pTextObj->GetFontSize();
2055 } else {
2056 fs = preChar.m_CharBox.Height();
2057 }
2058 if(!fs) {
2059 fs = 1;
2060 }
2061 info.m_OriginX = preChar.m_OriginX + preWidth * (fs) / 1000;
2062 info.m_OriginY = preChar.m_OriginY;
2063 info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_Origin X, info.m_OriginY);
2064 return TRUE;
2065 }
2066 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1, const CFX_Flo atRect& rect2)
2067 {
2068 CFX_FloatRect rect = rect1;
2069 rect.Intersect(rect2);
2070 return !rect.IsEmpty();
2071 }
2072 FX_BOOL»CPDF_TextPage::IsLetter(FX_WCHAR unicode)
2073 {
2074 if (unicode < L'A') {
2075 return FALSE; 1859 return FALSE;
2076 } 1860 }
2077 if (unicode > L'Z' && unicode < L'a') { 1861 preChar = (PAGECHAR_INFO)m_charList[size - 1];
1862 }
1863 if (FPDFTEXT_CHAR_PIECE == preChar.m_Flag)
1864 if (0xAD == preChar.m_Unicode || 0x2D == preChar.m_Unicode) {
1865 return TRUE;
1866 }
1867 }
1868 return FALSE;
1869 }
1870 int CPDF_TextPage::ProcessInsertObject(const CPDF_TextObject* pObj,
1871 const CFX_AffineMatrix& formMatrix) {
1872 FindPreviousTextObject();
1873 FX_BOOL bNewline = FALSE;
1874 int WritingMode = GetTextObjectWritingMode(pObj);
1875 if (WritingMode == -1) {
1876 WritingMode = GetTextObjectWritingMode(m_pPreTextObj);
1877 }
1878 CFX_FloatRect this_rect(pObj->m_Left, pObj->m_Bottom, pObj->m_Right,
1879 pObj->m_Top);
1880 CFX_FloatRect prev_rect(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
1881 m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
1882 CPDF_TextObjectItem PrevItem, item;
1883 int nItem = m_pPreTextObj->CountItems();
1884 m_pPreTextObj->GetItemInfo(nItem - 1, &PrevItem);
1885 pObj->GetItemInfo(0, &item);
1886 CFX_WideString wstrItem =
1887 pObj->GetFont()->UnicodeFromCharCode(item.m_CharCode);
1888 if (wstrItem.IsEmpty()) {
1889 wstrItem += (FX_WCHAR)item.m_CharCode;
1890 }
1891 FX_WCHAR curChar = wstrItem.GetAt(0);
1892 if (WritingMode == 0) {
1893 if (this_rect.Height() > 4.5 && prev_rect.Height() > 4.5) {
1894 FX_FLOAT top =
1895 this_rect.top < prev_rect.top ? this_rect.top : prev_rect.top;
1896 FX_FLOAT bottom = this_rect.bottom > prev_rect.bottom ? this_rect.bottom
1897 : prev_rect.bottom;
1898 if (bottom >= top) {
1899 if (IsHyphen(curChar)) {
1900 return 3;
1901 }
1902 return 2;
1903 }
1904 }
1905 } else if (WritingMode == 1) {
1906 if (this_rect.Width() > pObj->GetFontSize() * 0.1f &&
1907 prev_rect.Width() > m_pPreTextObj->GetFontSize() * 0.1f) {
1908 FX_FLOAT left = this_rect.left > m_CurlineRect.left ? this_rect.left
1909 : m_CurlineRect.left;
1910 FX_FLOAT right = this_rect.right < m_CurlineRect.right
1911 ? this_rect.right
1912 : m_CurlineRect.right;
1913 if (right <= left) {
1914 if (IsHyphen(curChar)) {
1915 return 3;
1916 }
1917 return 2;
1918 }
1919 }
1920 }
1921 FX_FLOAT last_pos = PrevItem.m_OriginX;
1922 int nLastWidth = GetCharWidth(PrevItem.m_CharCode, m_pPreTextObj->GetFont());
1923 FX_FLOAT last_width = nLastWidth * m_pPreTextObj->GetFontSize() / 1000;
1924 last_width = FXSYS_fabs(last_width);
1925 int nThisWidth = GetCharWidth(item.m_CharCode, pObj->GetFont());
1926 FX_FLOAT this_width = nThisWidth * pObj->GetFontSize() / 1000;
1927 this_width = FXSYS_fabs(this_width);
1928 FX_FLOAT threshold =
1929 last_width > this_width ? last_width / 4 : this_width / 4;
1930 CFX_AffineMatrix prev_matrix, prev_reverse;
1931 m_pPreTextObj->GetTextMatrix(&prev_matrix);
1932 prev_matrix.Concat(m_perMatrix);
1933 prev_reverse.SetReverse(prev_matrix);
1934 FX_FLOAT x = pObj->GetPosX();
1935 FX_FLOAT y = pObj->GetPosY();
1936 formMatrix.Transform(x, y);
1937 prev_reverse.Transform(x, y);
1938 if (last_width < this_width) {
1939 threshold = prev_reverse.TransformDistance(threshold);
1940 }
1941 CFX_FloatRect rect1(m_pPreTextObj->m_Left, pObj->m_Bottom,
1942 m_pPreTextObj->m_Right, pObj->m_Top);
1943 CFX_FloatRect rect2(m_pPreTextObj->m_Left, m_pPreTextObj->m_Bottom,
1944 m_pPreTextObj->m_Right, m_pPreTextObj->m_Top);
1945 CFX_FloatRect rect3 = rect1;
1946 rect1.Intersect(rect2);
1947 if (WritingMode == 0) {
1948 if ((rect1.IsEmpty() && rect2.Height() > 5 && rect3.Height() > 5) ||
1949 ((y > threshold * 2 || y < threshold * -3) &&
1950 (FXSYS_fabs(y) < 1 ? FXSYS_fabs(x) < FXSYS_fabs(y) : TRUE))) {
1951 bNewline = TRUE;
1952 if (nItem > 1) {
1953 CPDF_TextObjectItem tempItem;
1954 m_pPreTextObj->GetItemInfo(0, &tempItem);
1955 CFX_AffineMatrix m;
1956 m_pPreTextObj->GetTextMatrix(&m);
1957 if (PrevItem.m_OriginX > tempItem.m_OriginX &&
1958 m_DisplayMatrix.a > 0.9 && m_DisplayMatrix.b < 0.1 &&
1959 m_DisplayMatrix.c < 0.1 && m_DisplayMatrix.d < -0.9 && m.b < 0.1 &&
1960 m.c < 0.1) {
1961 CFX_FloatRect re(0, m_pPreTextObj->m_Bottom, 1000,
1962 m_pPreTextObj->m_Top);
1963 if (re.Contains(pObj->GetPosX(), pObj->GetPosY())) {
1964 bNewline = FALSE;
1965 } else {
1966 CFX_FloatRect re(0, pObj->m_Bottom, 1000, pObj->m_Top);
1967 if (re.Contains(m_pPreTextObj->GetPosX(),
1968 m_pPreTextObj->GetPosY())) {
1969 bNewline = FALSE;
1970 }
1971 }
1972 }
1973 }
1974 }
1975 }
1976 if (bNewline) {
1977 if (IsHyphen(curChar)) {
1978 return 3;
1979 }
1980 return 2;
1981 }
1982 int32_t nChars = pObj->CountChars();
1983 if (nChars == 1 && (0x2D == curChar || 0xAD == curChar))
1984 if (IsHyphen(curChar)) {
1985 return 3;
1986 }
1987 CFX_WideString PrevStr =
1988 m_pPreTextObj->GetFont()->UnicodeFromCharCode(PrevItem.m_CharCode);
1989 FX_WCHAR preChar = PrevStr.GetAt(PrevStr.GetLength() - 1);
1990 CFX_AffineMatrix matrix;
1991 pObj->GetTextMatrix(&matrix);
1992 matrix.Concat(formMatrix);
1993 threshold = (FX_FLOAT)(nLastWidth > nThisWidth ? nLastWidth : nThisWidth);
1994 threshold = threshold > 400
1995 ? (threshold < 700
1996 ? threshold / 4
1997 : (threshold > 800 ? threshold / 6 : threshold / 5))
1998 : (threshold / 2);
1999 if (nLastWidth >= nThisWidth) {
2000 threshold *= FXSYS_fabs(m_pPreTextObj->GetFontSize());
2001 } else {
2002 threshold *= FXSYS_fabs(pObj->GetFontSize());
2003 threshold = matrix.TransformDistance(threshold);
2004 threshold = prev_reverse.TransformDistance(threshold);
2005 }
2006 threshold /= 1000;
2007 if ((threshold < 1.4881 && threshold > 1.4879) ||
2008 (threshold < 1.39001 && threshold > 1.38999)) {
2009 threshold *= 1.5;
2010 }
2011 if (FXSYS_fabs(last_pos + last_width - x) > threshold && curChar != L' ' &&
2012 preChar != L' ')
2013 if (curChar != L' ' && preChar != L' ') {
2014 if ((x - last_pos - last_width) > threshold ||
2015 (last_pos - x - last_width) > threshold) {
2016 return 1;
2017 }
2018 if (x < 0 && (last_pos - x - last_width) > threshold) {
2019 return 1;
2020 }
2021 if ((x - last_pos - last_width) > this_width ||
2022 (x - last_pos - this_width) > last_width) {
2023 return 1;
2024 }
2025 }
2026 return 0;
2027 }
2028 FX_BOOL CPDF_TextPage::IsSameTextObject(CPDF_TextObject* pTextObj1,
2029 CPDF_TextObject* pTextObj2) {
2030 if (!pTextObj1 || !pTextObj2) {
2031 return FALSE;
2032 }
2033 CFX_FloatRect rcPreObj(pTextObj2->m_Left, pTextObj2->m_Bottom,
2034 pTextObj2->m_Right, pTextObj2->m_Top);
2035 CFX_FloatRect rcCurObj(pTextObj1->m_Left, pTextObj1->m_Bottom,
2036 pTextObj1->m_Right, pTextObj1->m_Top);
2037 if (rcPreObj.IsEmpty() && rcCurObj.IsEmpty() &&
2038 !m_ParseOptions.m_bGetCharCodeOnly) {
2039 FX_FLOAT dbXdif = FXSYS_fabs(rcPreObj.left - rcCurObj.left);
2040 int nCount = m_charList.GetSize();
2041 if (nCount >= 2) {
2042 PAGECHAR_INFO perCharTemp = (PAGECHAR_INFO)m_charList[nCount - 2];
2043 FX_FLOAT dbSpace = perCharTemp.m_CharBox.Width();
2044 if (dbXdif > dbSpace) {
2078 return FALSE; 2045 return FALSE;
2079 } 2046 }
2080 if (unicode > L'z') { 2047 }
2081 return FALSE; 2048 }
2082 } 2049 if (!rcPreObj.IsEmpty() || !rcCurObj.IsEmpty()) {
2083 return TRUE; 2050 rcPreObj.Intersect(rcCurObj);
2051 if (rcPreObj.IsEmpty()) {
2052 return FALSE;
2053 }
2054 if (FXSYS_fabs(rcPreObj.Width() - rcCurObj.Width()) >
2055 rcCurObj.Width() / 2) {
2056 return FALSE;
2057 }
2058 if (pTextObj2->GetFontSize() != pTextObj1->GetFontSize()) {
2059 return FALSE;
2060 }
2061 }
2062 int nPreCount = pTextObj2->CountItems();
2063 int nCurCount = pTextObj1->CountItems();
2064 if (nPreCount != nCurCount) {
2065 return FALSE;
2066 }
2067 CPDF_TextObjectItem itemPer, itemCur;
2068 for (int i = 0; i < nPreCount; i++) {
2069 pTextObj2->GetItemInfo(i, &itemPer);
2070 pTextObj1->GetItemInfo(i, &itemCur);
2071 if (itemCur.m_CharCode != itemPer.m_CharCode) {
2072 return FALSE;
2073 }
2074 }
2075 if (FXSYS_fabs(pTextObj1->GetPosX() - pTextObj2->GetPosX()) >
2076 GetCharWidth(itemPer.m_CharCode, pTextObj2->GetFont()) *
2077 pTextObj2->GetFontSize() / 1000 * 0.9 ||
2078 FXSYS_fabs(pTextObj1->GetPosY() - pTextObj2->GetPosY()) >
2079 FX_MAX(FX_MAX(rcPreObj.Height(), rcPreObj.Width()),
2080 pTextObj2->GetFontSize()) /
2081 8) {
2082 return FALSE;
2083 }
2084 return TRUE;
2085 }
2086 FX_BOOL CPDF_TextPage::IsSameAsPreTextObject(CPDF_TextObject* pTextObj,
2087 FX_POSITION ObjPos) {
2088 if (!pTextObj) {
2089 return FALSE;
2090 }
2091 int i = 0;
2092 if (!ObjPos) {
2093 ObjPos = m_pPage->GetLastObjectPosition();
2094 }
2095 CPDF_PageObject* pObj = m_pPage->GetPrevObject(ObjPos);
2096 while (i < 5 && ObjPos) {
2097 pObj = m_pPage->GetPrevObject(ObjPos);
2098 if (pObj == pTextObj) {
2099 continue;
2100 }
2101 if (pObj->m_Type != PDFPAGE_TEXT) {
2102 continue;
2103 }
2104 if (IsSameTextObject((CPDF_TextObject*)pObj, pTextObj)) {
2105 return TRUE;
2106 }
2107 i++;
2108 }
2109 return FALSE;
2110 }
2111 FX_BOOL CPDF_TextPage::GenerateCharInfo(FX_WCHAR unicode, PAGECHAR_INFO& info) {
2112 int size = m_TempCharList.GetSize();
2113 PAGECHAR_INFO preChar;
2114 if (size) {
2115 preChar = (PAGECHAR_INFO)m_TempCharList[size - 1];
2116 } else {
2117 size = m_charList.GetSize();
2118 if (size == 0) {
2119 return FALSE;
2120 }
2121 preChar = (PAGECHAR_INFO)m_charList[size - 1];
2122 }
2123 info.m_Index = m_TextBuf.GetLength();
2124 info.m_Unicode = unicode;
2125 info.m_pTextObj = NULL;
2126 info.m_CharCode = -1;
2127 info.m_Flag = FPDFTEXT_CHAR_GENERATED;
2128 int preWidth = 0;
2129 if (preChar.m_pTextObj && preChar.m_CharCode != (FX_DWORD)-1) {
2130 preWidth = GetCharWidth(preChar.m_CharCode, preChar.m_pTextObj->GetFont());
2131 }
2132 FX_FLOAT fs = 0;
2133 if (preChar.m_pTextObj) {
2134 fs = preChar.m_pTextObj->GetFontSize();
2135 } else {
2136 fs = preChar.m_CharBox.Height();
2137 }
2138 if (!fs) {
2139 fs = 1;
2140 }
2141 info.m_OriginX = preChar.m_OriginX + preWidth * (fs) / 1000;
2142 info.m_OriginY = preChar.m_OriginY;
2143 info.m_CharBox = CFX_FloatRect(info.m_OriginX, info.m_OriginY, info.m_OriginX,
2144 info.m_OriginY);
2145 return TRUE;
2146 }
2147 FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
2148 const CFX_FloatRect& rect2) {
2149 CFX_FloatRect rect = rect1;
2150 rect.Intersect(rect2);
2151 return !rect.IsEmpty();
2152 }
2153 FX_BOOL CPDF_TextPage::IsLetter(FX_WCHAR unicode) {
2154 if (unicode < L'A') {
2155 return FALSE;
2156 }
2157 if (unicode > L'Z' && unicode < L'a') {
2158 return FALSE;
2159 }
2160 if (unicode > L'z') {
2161 return FALSE;
2162 }
2163 return TRUE;
2084 } 2164 }
2085 CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage) 2165 CPDF_TextPageFind::CPDF_TextPageFind(const IPDF_TextPage* pTextPage)
2086 : m_pTextPage(pTextPage), 2166 : m_pTextPage(pTextPage),
2087 m_flags(0), 2167 m_flags(0),
2088 m_findNextStart(-1), 2168 m_findNextStart(-1),
2089 m_findPreStart(-1), 2169 m_findPreStart(-1),
2090 m_bMatchCase(FALSE), 2170 m_bMatchCase(FALSE),
2091 m_bMatchWholeWord(FALSE), 2171 m_bMatchWholeWord(FALSE),
2092 m_resStart(0), 2172 m_resStart(0),
2093 m_resEnd(-1), 2173 m_resEnd(-1),
2094 m_IsFind(FALSE) 2174 m_IsFind(FALSE) {
2095 { 2175 m_strText = m_pTextPage->GetPageText();
2176 int nCount = pTextPage->CountChars();
2177 if (nCount) {
2178 m_CharIndex.Add(0);
2179 }
2180 for (int i = 0; i < nCount; i++) {
2181 FPDF_CHAR_INFO info;
2182 pTextPage->GetCharInfo(i, info);
2183 int indexSize = m_CharIndex.GetSize();
2184 if (info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) {
2185 if (indexSize % 2) {
2186 m_CharIndex.Add(1);
2187 } else {
2188 if (indexSize <= 0) {
2189 continue;
2190 }
2191 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1) + 1);
2192 }
2193 } else {
2194 if (indexSize % 2) {
2195 if (indexSize <= 0) {
2196 continue;
2197 }
2198 m_CharIndex.SetAt(indexSize - 1, i + 1);
2199 } else {
2200 m_CharIndex.Add(i + 1);
2201 }
2202 }
2203 }
2204 int indexSize = m_CharIndex.GetSize();
2205 if (indexSize % 2) {
2206 m_CharIndex.RemoveAt(indexSize - 1);
2207 }
2208 }
2209 int CPDF_TextPageFind::GetCharIndex(int index) const {
2210 return m_pTextPage->CharIndexFromTextIndex(index);
2211 int indexSize = m_CharIndex.GetSize();
2212 int count = 0;
2213 for (int i = 0; i < indexSize; i += 2) {
2214 count += m_CharIndex.GetAt(i + 1);
2215 if (count > index) {
2216 return index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.GetAt(i);
2217 }
2218 }
2219 return -1;
2220 }
2221 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
2222 int flags,
2223 int startPos) {
2224 if (!m_pTextPage) {
2225 return FALSE;
2226 }
2227 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
2096 m_strText = m_pTextPage->GetPageText(); 2228 m_strText = m_pTextPage->GetPageText();
2097 int nCount = pTextPage->CountChars(); 2229 }
2098 if(nCount) { 2230 CFX_WideString findwhatStr = findwhat;
2099 m_CharIndex.Add(0); 2231 m_findWhat = findwhatStr;
2100 } 2232 m_flags = flags;
2101 for(int i = 0; i < nCount; i++) { 2233 m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
2102 FPDF_CHAR_INFO info; 2234 if (m_strText.IsEmpty()) {
2103 pTextPage->GetCharInfo(i, info); 2235 m_IsFind = FALSE;
2104 int indexSize = m_CharIndex.GetSize(); 2236 return TRUE;
2105 if(info.m_Flag == CHAR_NORMAL || info.m_Flag == CHAR_GENERATED) { 2237 }
2106 if(indexSize % 2) { 2238 FX_STRSIZE len = findwhatStr.GetLength();
2107 m_CharIndex.Add(1); 2239 if (!m_bMatchCase) {
2108 } else { 2240 findwhatStr.MakeLower();
2109 if(indexSize <= 0) { 2241 m_strText.MakeLower();
2110 continue; 2242 }
2111 } 2243 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
2112 m_CharIndex.SetAt(indexSize - 1, m_CharIndex.GetAt(indexSize - 1 ) + 1); 2244 m_findNextStart = startPos;
2113 } 2245 if (startPos == -1) {
2246 m_findPreStart = m_strText.GetLength() - 1;
2247 } else {
2248 m_findPreStart = startPos;
2249 }
2250 m_csFindWhatArray.RemoveAll();
2251 int i = 0;
2252 while (i < len) {
2253 if (findwhatStr.GetAt(i) != ' ') {
2254 break;
2255 }
2256 i++;
2257 }
2258 if (i < len) {
2259 ExtractFindWhat(findwhatStr);
2260 } else {
2261 m_csFindWhatArray.Add(findwhatStr);
2262 }
2263 if (m_csFindWhatArray.GetSize() <= 0) {
2264 return FALSE;
2265 }
2266 m_IsFind = TRUE;
2267 m_resStart = 0;
2268 m_resEnd = -1;
2269 return TRUE;
2270 }
2271 FX_BOOL CPDF_TextPageFind::FindNext() {
2272 if (!m_pTextPage) {
2273 return FALSE;
2274 }
2275 m_resArray.RemoveAll();
2276 if (m_findNextStart == -1) {
2277 return FALSE;
2278 }
2279 if (m_strText.IsEmpty()) {
2280 m_IsFind = FALSE;
2281 return m_IsFind;
2282 }
2283 int strLen = m_strText.GetLength();
2284 if (m_findNextStart > strLen - 1) {
2285 m_IsFind = FALSE;
2286 return m_IsFind;
2287 }
2288 int nCount = m_csFindWhatArray.GetSize();
2289 int nResultPos = 0;
2290 int nStartPos = 0;
2291 nStartPos = m_findNextStart;
2292 FX_BOOL bSpaceStart = FALSE;
2293 for (int iWord = 0; iWord < nCount; iWord++) {
2294 CFX_WideString csWord = m_csFindWhatArray[iWord];
2295 if (csWord.IsEmpty()) {
2296 if (iWord == nCount - 1) {
2297 FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
2298 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CHAR ||
2299 strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
2300 nResultPos = nStartPos + 1;
2301 break;
2302 }
2303 iWord = -1;
2304 } else if (iWord == 0) {
2305 bSpaceStart = TRUE;
2306 }
2307 continue;
2308 }
2309 int endIndex;
2310 nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
2311 if (nResultPos == -1) {
2312 m_IsFind = FALSE;
2313 return m_IsFind;
2314 }
2315 endIndex = nResultPos + csWord.GetLength() - 1;
2316 if (iWord == 0) {
2317 m_resStart = nResultPos;
2318 }
2319 FX_BOOL bMatch = TRUE;
2320 if (iWord != 0 && !bSpaceStart) {
2321 int PreResEndPos = nStartPos;
2322 int curChar = csWord.GetAt(0);
2323 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
2324 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
2325 if (nStartPos == nResultPos &&
2326 !(_IsIgnoreSpaceCharacter(lastChar) ||
2327 _IsIgnoreSpaceCharacter(curChar))) {
2328 bMatch = FALSE;
2329 }
2330 for (int d = PreResEndPos; d < nResultPos; d++) {
2331 FX_WCHAR strInsert = m_strText.GetAt(d);
2332 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
2333 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2334 bMatch = FALSE;
2335 break;
2336 }
2337 }
2338 } else if (bSpaceStart) {
2339 if (nResultPos > 0) {
2340 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
2341 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CHAR &&
2342 strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2343 bMatch = FALSE;
2344 m_resStart = nResultPos;
2114 } else { 2345 } else {
2115 if(indexSize % 2) { 2346 m_resStart = nResultPos - 1;
2116 if(indexSize <= 0) { 2347 }
2117 continue; 2348 }
2118 } 2349 }
2119 m_CharIndex.SetAt(indexSize - 1, i + 1); 2350 if (m_bMatchWholeWord && bMatch) {
2120 } else { 2351 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
2121 m_CharIndex.Add(i + 1); 2352 }
2122 } 2353 nStartPos = endIndex + 1;
2123 } 2354 if (!bMatch) {
2124 } 2355 iWord = -1;
2125 int indexSize = m_CharIndex.GetSize(); 2356 if (bSpaceStart) {
2126 if(indexSize % 2) { 2357 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
2127 m_CharIndex.RemoveAt(indexSize - 1); 2358 } else {
2128 } 2359 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
2129 } 2360 }
2130 int CPDF_TextPageFind::GetCharIndex(int index) const 2361 }
2131 { 2362 }
2132 return m_pTextPage->CharIndexFromTextIndex(index); 2363 m_resEnd = nResultPos +
2133 int indexSize = m_CharIndex.GetSize(); 2364 m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].GetLength() - 1;
2134 int count = 0; 2365 m_IsFind = TRUE;
2135 for(int i = 0; i < indexSize; i += 2) { 2366 int resStart = GetCharIndex(m_resStart);
2136 count += m_CharIndex.GetAt(i + 1); 2367 int resEnd = GetCharIndex(m_resEnd);
2137 if(count > index) { 2368 m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray);
2138 return » index - count + m_CharIndex.GetAt(i + 1) + m_CharIndex.G etAt(i); 2369 if (m_flags & FPDFTEXT_CONSECUTIVE) {
2139 } 2370 m_findNextStart = m_resStart + 1;
2140 } 2371 m_findPreStart = m_resEnd - 1;
2372 } else {
2373 m_findNextStart = m_resEnd + 1;
2374 m_findPreStart = m_resStart - 1;
2375 }
2376 return m_IsFind;
2377 }
2378 FX_BOOL CPDF_TextPageFind::FindPrev() {
2379 if (!m_pTextPage) {
2380 return FALSE;
2381 }
2382 m_resArray.RemoveAll();
2383 if (m_strText.IsEmpty() || m_findPreStart < 0) {
2384 m_IsFind = FALSE;
2385 return m_IsFind;
2386 }
2387 CPDF_TextPageFind findEngine(m_pTextPage);
2388 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
2389 if (!ret) {
2390 m_IsFind = FALSE;
2391 return m_IsFind;
2392 }
2393 int order = -1, MatchedCount = 0;
2394 while (ret) {
2395 ret = findEngine.FindNext();
2396 if (ret) {
2397 int order1 = findEngine.GetCurOrder();
2398 int MatchedCount1 = findEngine.GetMatchedCount();
2399 if (((order1 + MatchedCount1) - 1) > m_findPreStart) {
2400 break;
2401 }
2402 order = order1;
2403 MatchedCount = MatchedCount1;
2404 }
2405 }
2406 if (order == -1) {
2407 m_IsFind = FALSE;
2408 return m_IsFind;
2409 }
2410 m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
2411 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
2412 m_IsFind = TRUE;
2413 m_pTextPage->GetRectArray(order, MatchedCount, m_resArray);
2414 if (m_flags & FPDFTEXT_CONSECUTIVE) {
2415 m_findNextStart = m_resStart + 1;
2416 m_findPreStart = m_resEnd - 1;
2417 } else {
2418 m_findNextStart = m_resEnd + 1;
2419 m_findPreStart = m_resStart - 1;
2420 }
2421 return m_IsFind;
2422 }
2423 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
2424 if (findwhat.IsEmpty()) {
2425 return;
2426 }
2427 int index = 0;
2428 while (1) {
2429 CFX_WideString csWord = TEXT_EMPTY;
2430 int ret =
2431 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_BLANK_CHAR);
2432 if (csWord.IsEmpty()) {
2433 if (ret) {
2434 m_csFindWhatArray.Add(CFX_WideString(L""));
2435 index++;
2436 continue;
2437 } else {
2438 break;
2439 }
2440 }
2441 int pos = 0;
2442 while (pos < csWord.GetLength()) {
2443 CFX_WideString curStr = csWord.Mid(pos, 1);
2444 FX_WCHAR curChar = csWord.GetAt(pos);
2445 if (_IsIgnoreSpaceCharacter(curChar)) {
2446 if (pos > 0 && curChar == 0x2019) {
2447 pos++;
2448 continue;
2449 }
2450 if (pos > 0) {
2451 CFX_WideString preStr = csWord.Mid(0, pos);
2452 m_csFindWhatArray.Add(preStr);
2453 }
2454 m_csFindWhatArray.Add(curStr);
2455 if (pos == csWord.GetLength() - 1) {
2456 csWord.Empty();
2457 break;
2458 }
2459 csWord = csWord.Right(csWord.GetLength() - pos - 1);
2460 pos = 0;
2461 continue;
2462 }
2463 pos++;
2464 }
2465 if (!csWord.IsEmpty()) {
2466 m_csFindWhatArray.Add(csWord);
2467 }
2468 index++;
2469 }
2470 }
2471 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
2472 int startPos,
2473 int endPos) {
2474 int char_left = 0;
2475 int char_right = 0;
2476 int char_count = endPos - startPos + 1;
2477 if (char_count < 1) {
2478 return FALSE;
2479 }
2480 if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
2481 return TRUE;
2482 }
2483 if (startPos - 1 >= 0) {
2484 char_left = csPageText.GetAt(startPos - 1);
2485 }
2486 if (startPos + char_count < csPageText.GetLength()) {
2487 char_right = csPageText.GetAt(startPos + char_count);
2488 }
2489 if ((char_left > 'A' && char_left < 'a') ||
2490 (char_left > 'a' && char_left < 'z') ||
2491 (char_left > 0xfb00 && char_left < 0xfb06) ||
2492 (char_left >= '0' && char_left <= '9') ||
2493 (char_right > 'A' && char_right < 'a') ||
2494 (char_right > 'a' && char_right < 'z') ||
2495 (char_right > 0xfb00 && char_right < 0xfb06) ||
2496 (char_right >= '0' && char_right <= '9')) {
2497 return FALSE;
2498 }
2499 if (!(('A' > char_left || char_left > 'Z') &&
2500 ('a' > char_left || char_left > 'z') &&
2501 ('A' > char_right || char_right > 'Z') &&
2502 ('a' > char_right || char_right > 'z'))) {
2503 return FALSE;
2504 }
2505 if (char_count > 0) {
2506 if (csPageText.GetAt(startPos) >= L'0' &&
2507 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
2508 char_left <= L'9') {
2509 return FALSE;
2510 }
2511 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
2512 char_right >= L'0' && char_right <= L'9') {
2513 return FALSE;
2514 }
2515 }
2516 return TRUE;
2517 }
2518 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
2519 const FX_WCHAR* lpszFullString,
2520 int iSubString,
2521 FX_WCHAR chSep) {
2522 if (lpszFullString == NULL) {
2523 return FALSE;
2524 }
2525 while (iSubString--) {
2526 lpszFullString = FXSYS_wcschr(lpszFullString, chSep);
2527 if (lpszFullString == NULL) {
2528 rString.Empty();
2529 return FALSE;
2530 }
2531 lpszFullString++;
2532 while (*lpszFullString == chSep) {
2533 lpszFullString++;
2534 }
2535 }
2536 const FX_WCHAR* lpchEnd = FXSYS_wcschr(lpszFullString, chSep);
2537 int nLen = (lpchEnd == NULL) ? (int)FXSYS_wcslen(lpszFullString)
2538 : (int)(lpchEnd - lpszFullString);
2539 ASSERT(nLen >= 0);
2540 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
2541 nLen * sizeof(FX_WCHAR));
2542 rString.ReleaseBuffer();
2543 return TRUE;
2544 }
2545 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
2546 CFX_WideString str2;
2547 str2.Empty();
2548 int nlen = str.GetLength();
2549 for (int i = nlen - 1; i >= 0; i--) {
2550 str2 += str.GetAt(i);
2551 }
2552 return str2;
2553 }
2554 void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const {
2555 rects.Copy(m_resArray);
2556 }
2557 int CPDF_TextPageFind::GetCurOrder() const {
2558 return GetCharIndex(m_resStart);
2559 }
2560 int CPDF_TextPageFind::GetMatchedCount() const {
2561 int resStart = GetCharIndex(m_resStart);
2562 int resEnd = GetCharIndex(m_resEnd);
2563 return resEnd - resStart + 1;
2564 }
2565 CPDF_LinkExtract::CPDF_LinkExtract() : m_pTextPage(NULL), m_IsParserd(FALSE) {}
2566 CPDF_LinkExtract::~CPDF_LinkExtract() {
2567 DeleteLinkList();
2568 }
2569 FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage) {
2570 if (!pTextPage || !pTextPage->IsParsered()) {
2571 return FALSE;
2572 }
2573 m_pTextPage = (const CPDF_TextPage*)pTextPage;
2574 m_strPageText = m_pTextPage->GetPageText(0, -1);
2575 DeleteLinkList();
2576 if (m_strPageText.IsEmpty()) {
2577 return FALSE;
2578 }
2579 parserLink();
2580 m_IsParserd = TRUE;
2581 return TRUE;
2582 }
2583 void CPDF_LinkExtract::DeleteLinkList() {
2584 while (m_LinkList.GetSize()) {
2585 CPDF_LinkExt* linkinfo = NULL;
2586 linkinfo = m_LinkList.GetAt(0);
2587 m_LinkList.RemoveAt(0);
2588 delete linkinfo;
2589 }
2590 m_LinkList.RemoveAll();
2591 }
2592 int CPDF_LinkExtract::CountLinks() const {
2593 if (!m_IsParserd) {
2141 return -1; 2594 return -1;
2142 } 2595 }
2143 FX_BOOL»CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, int flags, int startPos) 2596 return m_LinkList.GetSize();
2144 { 2597 }
2145 if (!m_pTextPage) { 2598 void CPDF_LinkExtract::parserLink() {
2599 int start = 0, pos = 0;
2600 int TotalChar = m_pTextPage->CountChars();
2601 while (pos < TotalChar) {
2602 FPDF_CHAR_INFO pageChar;
2603 m_pTextPage->GetCharInfo(pos, pageChar);
2604 if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 ||
2605 pos == TotalChar - 1) {
2606 int nCount = pos - start;
2607 if (pos == TotalChar - 1) {
2608 nCount++;
2609 }
2610 CFX_WideString strBeCheck;
2611 strBeCheck = m_pTextPage->GetPageText(start, nCount);
2612 if (strBeCheck.GetLength() > 5) {
2613 while (strBeCheck.GetLength() > 0) {
2614 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
2615 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
2616 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
2617 nCount--;
2618 } else {
2619 break;
2620 }
2621 }
2622 if (nCount > 5 &&
2623 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
2624 if (!AppendToLinkList(start, nCount, strBeCheck)) {
2625 break;
2626 }
2627 }
2628 }
2629 start = ++pos;
2630 } else {
2631 pos++;
2632 }
2633 }
2634 }
2635 FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
2636 CFX_WideString str = strBeCheck;
2637 str.MakeLower();
2638 if (str.Find(L"http://www.") != -1) {
2639 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
2640 return TRUE;
2641 }
2642 if (str.Find(L"http://") != -1) {
2643 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
2644 return TRUE;
2645 }
2646 if (str.Find(L"https://www.") != -1) {
2647 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
2648 return TRUE;
2649 }
2650 if (str.Find(L"https://") != -1) {
2651 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
2652 return TRUE;
2653 }
2654 if (str.Find(L"www.") != -1) {
2655 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2656 strBeCheck = L"http://" + strBeCheck;
2657 return TRUE;
2658 }
2659 return FALSE;
2660 }
2661 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2662 str.MakeLower();
2663 int aPos = str.Find(L'@');
2664 if (aPos < 1) {
2665 return FALSE;
2666 }
2667 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') {
2668 return FALSE;
2669 }
2670 int i;
2671 for (i = aPos - 1; i >= 0; i--) {
2672 FX_WCHAR ch = str.GetAt(i);
2673 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') ||
2674 (ch >= L'0' && ch <= L'9')) {
2675 continue;
2676 } else {
2677 if (i == aPos - 1) {
2146 return FALSE; 2678 return FALSE;
2147 } 2679 }
2148 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) { 2680 str = str.Right(str.GetLength() - i - 1);
2149 m_strText = m_pTextPage->GetPageText(); 2681 break;
2150 } 2682 }
2151 CFX_WideString findwhatStr = findwhat; 2683 }
2152 m_findWhat = findwhatStr; 2684 aPos = str.Find(L'@');
2153 m_flags = flags; 2685 if (aPos < 1) {
2154 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; 2686 return FALSE;
2155 if (m_strText.IsEmpty()) { 2687 }
2156 m_IsFind = FALSE; 2688 CFX_WideString strtemp = L"";
2157 return TRUE; 2689 for (i = 0; i < aPos; i++) {
2158 } 2690 FX_WCHAR wch = str.GetAt(i);
2159 FX_STRSIZE len = findwhatStr.GetLength(); 2691 if (wch >= L'a' && wch <= L'z') {
2160 if (!m_bMatchCase) { 2692 break;
2161 findwhatStr.MakeLower();
2162 m_strText.MakeLower();
2163 }
2164 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
2165 m_findNextStart = startPos;
2166 if (startPos == -1) {
2167 m_findPreStart = m_strText.GetLength() - 1;
2168 } else { 2693 } else {
2169 m_findPreStart = startPos; 2694 strtemp = str.Right(str.GetLength() - i + 1);
2170 } 2695 }
2171 m_csFindWhatArray.RemoveAll(); 2696 }
2172 int i = 0; 2697 if (strtemp != L"") {
2173 while(i < len) { 2698 str = strtemp;
2174 if(findwhatStr.GetAt(i) != ' ') { 2699 }
2175 break; 2700 aPos = str.Find(L'@');
2176 } 2701 if (aPos < 1) {
2177 i++; 2702 return FALSE;
2178 } 2703 }
2179 if(i < len) { 2704 str.TrimRight(L'.');
2180 ExtractFindWhat(findwhatStr); 2705 strtemp = str;
2706 int ePos = str.Find(L'.');
2707 if (ePos == -1) {
2708 return FALSE;
2709 }
2710 while (ePos != -1) {
2711 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1);
2712 ePos = strtemp.Find('.');
2713 }
2714 ePos = strtemp.GetLength();
2715 for (i = 0; i < ePos; i++) {
2716 FX_WCHAR wch = str.GetAt(i);
2717 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {
2718 continue;
2181 } else { 2719 } else {
2182 m_csFindWhatArray.Add(findwhatStr); 2720 str = str.Left(str.GetLength() - ePos + i + 1);
2183 } 2721 ePos = ePos - i - 1;
2184 if(m_csFindWhatArray.GetSize() <= 0) { 2722 break;
2185 return FALSE; 2723 }
2186 } 2724 }
2187 m_IsFind = TRUE; 2725 int nLen = str.GetLength();
2188 m_resStart = 0; 2726 for (i = aPos + 1; i < nLen - ePos; i++) {
2189 m_resEnd = -1; 2727 FX_WCHAR wch = str.GetAt(i);
2190 return TRUE; 2728 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') ||
2191 } 2729 (wch >= L'0' && wch <= L'9')) {
2192 FX_BOOL CPDF_TextPageFind::FindNext() 2730 continue;
2193 {
2194 if (!m_pTextPage) {
2195 return FALSE;
2196 }
2197 m_resArray.RemoveAll();
2198 if(m_findNextStart == -1) {
2199 return FALSE;
2200 }
2201 if(m_strText.IsEmpty()) {
2202 m_IsFind = FALSE;
2203 return m_IsFind;
2204 }
2205 int strLen = m_strText.GetLength();
2206 if (m_findNextStart > strLen - 1) {
2207 m_IsFind = FALSE;
2208 return m_IsFind;
2209 }
2210 int nCount = m_csFindWhatArray.GetSize();
2211 int nResultPos = 0;
2212 int»nStartPos = 0;
2213 nStartPos = m_findNextStart;
2214 FX_BOOL bSpaceStart = FALSE;
2215 for(int iWord = 0; iWord < nCount; iWord++) {
2216 CFX_WideString csWord = m_csFindWhatArray[iWord];
2217 if(csWord.IsEmpty()) {
2218 if(iWord == nCount - 1) {
2219 FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
2220 if(strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_BLANK_CH AR || strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
2221 nResultPos = nStartPos + 1;
2222 break;
2223 }
2224 iWord = -1;
2225 } else if(iWord == 0) {
2226 bSpaceStart = TRUE;
2227 }
2228 continue;
2229 }
2230 int endIndex;
2231 nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
2232 if (nResultPos == -1) {
2233 m_IsFind = FALSE;
2234 return m_IsFind;
2235 }
2236 endIndex = nResultPos + csWord.GetLength() - 1;
2237 if(iWord == 0) {
2238 m_resStart = nResultPos;
2239 }
2240 FX_BOOL bMatch = TRUE;
2241 if(iWord != 0 && !bSpaceStart) {
2242 int PreResEndPos = nStartPos;
2243 int curChar = csWord.GetAt(0);
2244 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
2245 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
2246 if(nStartPos == nResultPos && !(_IsIgnoreSpaceCharacter(lastChar) || _IsIgnoreSpaceCharacter(curChar))) {
2247 bMatch = FALSE;
2248 }
2249 for(int d = PreResEndPos; d < nResultPos; d++) {
2250 FX_WCHAR strInsert = m_strText.GetAt(d);
2251 if(strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CH AR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2252 bMatch = FALSE;
2253 break;
2254 }
2255 }
2256 } else if(bSpaceStart) {
2257 if(nResultPos > 0) {
2258 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
2259 if(strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_BLANK_CH AR && strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
2260 bMatch = FALSE;
2261 m_resStart = nResultPos;
2262 } else {
2263 m_resStart = nResultPos - 1;
2264 }
2265 }
2266 }
2267 if(m_bMatchWholeWord && bMatch) {
2268 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
2269 }
2270 nStartPos = endIndex + 1;
2271 if(!bMatch) {
2272 iWord = -1;
2273 if(bSpaceStart) {
2274 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
2275 } else {
2276 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
2277 }
2278 }
2279 }
2280 m_resEnd = nResultPos + m_csFindWhatArray[m_csFindWhatArray.GetSize() - 1].G etLength() - 1;
2281 m_IsFind = TRUE;
2282 int resStart = GetCharIndex(m_resStart);
2283 int resEnd = GetCharIndex(m_resEnd);
2284 m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1, m_resArray);
2285 if(m_flags & FPDFTEXT_CONSECUTIVE) {
2286 m_findNextStart = m_resStart + 1;
2287 m_findPreStart = m_resEnd - 1;
2288 } else { 2731 } else {
2289 m_findNextStart = m_resEnd + 1; 2732 return FALSE;
2290 m_findPreStart = m_resStart - 1; 2733 }
2291 } 2734 }
2292 return m_IsFind; 2735 if (str.Find(L"mailto:") == -1) {
2293 } 2736 str = L"mailto:" + str;
2294 FX_BOOL CPDF_TextPageFind::FindPrev() 2737 }
2295 { 2738 return TRUE;
2296 if (!m_pTextPage) { 2739 }
2297 return FALSE; 2740 FX_BOOL CPDF_LinkExtract::AppendToLinkList(int start,
2298 } 2741 int count,
2299 m_resArray.RemoveAll(); 2742 const CFX_WideString& strUrl) {
2300 if(m_strText.IsEmpty() || m_findPreStart < 0) { 2743 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
2301 m_IsFind = FALSE; 2744 linkInfo->m_strUrl = strUrl;
2302 return m_IsFind; 2745 linkInfo->m_Start = start;
2303 } 2746 linkInfo->m_Count = count;
2304 CPDF_TextPageFind findEngine(m_pTextPage); 2747 m_LinkList.Add(linkInfo);
2305 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags); 2748 return TRUE;
2306 if(!ret) { 2749 }
2307 m_IsFind = FALSE; 2750 CFX_WideString CPDF_LinkExtract::GetURL(int index) const {
2308 return m_IsFind; 2751 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2309 } 2752 return L"";
2310 int order = -1, MatchedCount = 0; 2753 }
2311 while(ret) { 2754 CPDF_LinkExt* link = NULL;
2312 ret = findEngine.FindNext(); 2755 link = m_LinkList.GetAt(index);
2313 if(ret) { 2756 if (!link) {
2314 int order1 = findEngine.GetCurOrder() ; 2757 return L"";
2315 int MatchedCount1 = findEngine.GetMatchedCount(); 2758 }
2316 if(((order1 + MatchedCount1) - 1) > m_findPreStart) { 2759 return link->m_strUrl;
2317 break; 2760 }
2318 } 2761 void CPDF_LinkExtract::GetBoundedSegment(int index,
2319 order = order1; 2762 int& start,
2320 MatchedCount = MatchedCount1; 2763 int& count) const {
2321 } 2764 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2322 } 2765 return;
2323 if(order == -1) { 2766 }
2324 m_IsFind = FALSE; 2767 CPDF_LinkExt* link = NULL;
2325 return m_IsFind; 2768 link = m_LinkList.GetAt(index);
2326 } 2769 if (!link) {
2327 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); 2770 return;
2328 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); 2771 }
2329 m_IsFind = TRUE; 2772 start = link->m_Start;
2330 m_pTextPage->GetRectArray(order, MatchedCount, m_resArray); 2773 count = link->m_Count;
2331 if(m_flags & FPDFTEXT_CONSECUTIVE) { 2774 }
2332 m_findNextStart = m_resStart + 1; 2775 void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const {
2333 m_findPreStart = m_resEnd - 1; 2776 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2334 } else { 2777 return;
2335 m_findNextStart = m_resEnd + 1; 2778 }
2336 m_findPreStart = m_resStart - 1; 2779 CPDF_LinkExt* link = NULL;
2337 } 2780 link = m_LinkList.GetAt(index);
2338 return m_IsFind; 2781 if (!link) {
2339 } 2782 return;
// Splits the search string |findwhat| into the pieces stored in
// m_csFindWhatArray. Pieces are first obtained from ExtractSubString() using
// TEXT_BLANK_CHAR as the separator; each piece is then cut again around every
// character for which _IsIgnoreSpaceCharacter() returns true.
// NOTE(review): the text following "2783", "2784" and "2785" on the next three
// lines appears to belong to the other column of the side-by-side listing, not
// to this function - confirm before editing.
2340 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) 2783 }
2341 { 2784 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2342 if(findwhat.IsEmpty()) { 2785 }
2343 return ;
2344 }
2345 int index = 0;
2346 while(1) {
2347 CFX_WideString csWord = TEXT_EMPTY;
2348 int ret = ExtractSubString(csWord, findwhat.c_str(), index, TEXT_BLANK_C HAR);
2349 if(csWord.IsEmpty()) {
// An empty piece that ExtractSubString() still reported as present is
// recorded as an empty entry; a failed extraction ends the scan.
2350 if(ret) {
2351 m_csFindWhatArray.Add(CFX_WideString(L""));
2352 index++;
2353 continue;
2354 } else {
2355 break;
2356 }
2357 }
2358 int pos = 0;
2359 while(pos < csWord.GetLength()) {
2360 CFX_WideString curStr = csWord.Mid(pos, 1);
2361 FX_WCHAR curChar = csWord.GetAt(pos);
2362 if (_IsIgnoreSpaceCharacter(curChar)) {
// U+2019 is only treated as a split point when it is the first character
// of the remaining word; anywhere else it is stepped over.
2363 if (pos > 0 && curChar == 0x2019) {
2364 pos++;
2365 continue;
2366 }
// Store the text before the split character (if any), then the split
// character itself as its own entry.
2367 if (pos > 0 ) {
2368 CFX_WideString preStr = csWord.Mid(0, pos);
2369 m_csFindWhatArray.Add(preStr);
2370 }
2371 m_csFindWhatArray.Add(curStr);
2372 if (pos == csWord.GetLength() - 1) {
2373 csWord.Empty();
2374 break;
2375 }
// Continue scanning with whatever follows the split character.
2376 csWord = csWord.Right(csWord.GetLength() - pos - 1);
2377 pos = 0;
2378 continue;
2379 }
2380 pos++;
2381 }
// Whatever is left after the last split character is stored as one entry.
2382 if (!csWord.IsEmpty()) {
2383 m_csFindWhatArray.Add(csWord);
2384 }
2385 index++;
2386 }
2387 }
2388 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, in t startPos, int endPos)
2389 {
2390 int char_left = 0;
2391 int char_right = 0;
2392 int char_count = endPos - startPos + 1;
2393 if(char_count < 1) {
2394 return FALSE;
2395 }
2396 if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
2397 return TRUE;
2398 }
2399 if(startPos - 1 >= 0 ) {
2400 char_left = csPageText.GetAt(startPos - 1);
2401 }
2402 if(startPos + char_count < csPageText.GetLength()) {
2403 char_right = csPageText.GetAt(startPos + char_count);
2404 }
2405 if ((char_left > 'A' && char_left < 'a') || (char_left > 'a' && char_left < 'z') || (char_left > 0xfb00 && char_left < 0xfb06) || (char_left >= '0' && char_ left <= '9') ||
2406 (char_right > 'A' && char_right < 'a') || (char_right > 'a' && char_ right < 'z') || (char_right > 0xfb00 && char_right < 0xfb06) || (char_right >= ' 0' && char_right <= '9')) {
2407 return FALSE;
2408 }
2409 if(!(('A' > char_left || char_left > 'Z') && ('a' > char_left || char_left > 'z')
2410 && ('A' > char_right || char_right > 'Z') && ('a' > char_right || c har_right > 'z'))) {
2411 return FALSE;
2412 }
2413 if (char_count > 0) {
2414 if (csPageText.GetAt(startPos) >= L'0' && csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && char_left <= L'9') {
2415 return FALSE;
2416 }
2417 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && char_right >= L'0' && char_right <= L'9') {
2418 return FALSE;
2419 }
2420 }
2421 return TRUE;
2422 }
2423 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, const FX_WC HAR* lpszFullString,
2424 int iSubString, FX_WCHAR chSep)
2425 {
2426 if (lpszFullString == NULL) {
2427 return FALSE;
2428 }
2429 while (iSubString--) {
2430 lpszFullString = FXSYS_wcschr(lpszFullString, chSep);
2431 if (lpszFullString == NULL) {
2432 rString.Empty();
2433 return FALSE;
2434 }
2435 lpszFullString++;
2436 while(*lpszFullString == chSep) {
2437 lpszFullString++;
2438 }
2439 }
2440 const FX_WCHAR* lpchEnd = FXSYS_wcschr(lpszFullString, chSep);
2441 int nLen = (lpchEnd == NULL) ?
2442 (int)FXSYS_wcslen(lpszFullString) : (int)(lpchEnd - lpszFullStrin g);
2443 ASSERT(nLen >= 0);
2444 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString, nLen * sizeof(FX_WCHAR ));
2445 rString.ReleaseBuffer();
2446 return TRUE;
2447 }
2448 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str)
2449 {
2450 CFX_WideString str2;
2451 str2.Empty();
2452 int nlen = str.GetLength();
2453 for(int i = nlen - 1; i >= 0; i--) {
2454 str2 += str.GetAt(i);
2455 }
2456 return str2;
2457 }
2458 void CPDF_TextPageFind::GetRectArray(CFX_RectArray& rects) const
2459 {
2460 rects.Copy(m_resArray);
2461 }
2462 int CPDF_TextPageFind::GetCurOrder() const
2463 {
2464 return GetCharIndex(m_resStart);
2465 }
2466 int CPDF_TextPageFind::GetMatchedCount()const
2467 {
2468 int resStart = GetCharIndex(m_resStart);
2469 int resEnd = GetCharIndex(m_resEnd);
2470 return resEnd - resStart + 1;
2471 }
2472 CPDF_LinkExtract::CPDF_LinkExtract()
2473 : m_pTextPage(NULL),
2474 m_IsParserd(FALSE)
2475 {
2476 }
2477 CPDF_LinkExtract::~CPDF_LinkExtract()
2478 {
2479 DeleteLinkList();
2480 }
2481 FX_BOOL CPDF_LinkExtract::ExtractLinks(const IPDF_TextPage* pTextPage)
2482 {
2483 if (!pTextPage || !pTextPage->IsParsered()) {
2484 return FALSE;
2485 }
2486 m_pTextPage = (const CPDF_TextPage*)pTextPage;
2487 m_strPageText = m_pTextPage->GetPageText(0, -1);
2488 DeleteLinkList();
2489 if (m_strPageText.IsEmpty()) {
2490 return FALSE;
2491 }
2492 parserLink();
2493 m_IsParserd = TRUE;
2494 return TRUE;
2495 }
2496 void CPDF_LinkExtract::DeleteLinkList()
2497 {
2498 while (m_LinkList.GetSize()) {
2499 CPDF_LinkExt* linkinfo = NULL;
2500 linkinfo = m_LinkList.GetAt(0);
2501 m_LinkList.RemoveAt(0);
2502 delete linkinfo;
2503 }
2504 m_LinkList.RemoveAll();
2505 }
2506 int CPDF_LinkExtract::CountLinks() const
2507 {
2508 if (!m_IsParserd) {
2509 return -1;
2510 }
2511 return m_LinkList.GetSize();
2512 }
2513 void CPDF_LinkExtract::parserLink()
2514 {
2515 int start = 0, pos = 0;
2516 int TotalChar = m_pTextPage->CountChars();
2517 while (pos < TotalChar) {
2518 FPDF_CHAR_INFO pageChar;
2519 m_pTextPage->GetCharInfo(pos, pageChar);
2520 if (pageChar.m_Flag == CHAR_GENERATED || pageChar.m_Unicode == 0x20 || p os == TotalChar - 1) {
2521 int nCount = pos - start;
2522 if(pos == TotalChar - 1) {
2523 nCount++;
2524 }
2525 CFX_WideString strBeCheck;
2526 strBeCheck = m_pTextPage->GetPageText(start, nCount);
2527 if (strBeCheck.GetLength() > 5) {
2528 while(strBeCheck.GetLength() > 0) {
2529 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
2530 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
2531 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
2532 nCount--;
2533 } else {
2534 break;
2535 }
2536 }
2537 if (nCount > 5 && (CheckWebLink(strBeCheck) || CheckMailLink(str BeCheck))) {
2538 if (!AppendToLinkList(start, nCount, strBeCheck)) {
2539 break;
2540 }
2541 }
2542 }
2543 start = ++pos;
2544 } else {
2545 pos++;
2546 }
2547 }
2548 }
2549 FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck)
2550 {
2551 CFX_WideString str = strBeCheck;
2552 str.MakeLower();
2553 if (str.Find(L"http://www.") != -1) {
2554 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.") );
2555 return TRUE;
2556 }
2557 if (str.Find(L"http://") != -1) {
2558 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
2559 return TRUE;
2560 }
2561 if (str.Find(L"https://www.") != -1) {
2562 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www." ));
2563 return TRUE;
2564 }
2565 if (str.Find(L"https://") != -1) {
2566 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
2567 return TRUE;
2568 }
2569 if (str.Find(L"www.") != -1) {
2570 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2571 strBeCheck = L"http://" + strBeCheck;
2572 return TRUE;
2573 }
2574 return FALSE;
2575 }
2576 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str)
2577 {
2578 str.MakeLower();
2579 int aPos = str.Find(L'@');
2580 if (aPos < 1) {
2581 return FALSE;
2582 }
2583 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') {
2584 return FALSE;
2585 }
2586 int i;
2587 for (i = aPos - 1; i >= 0; i--) {
2588 FX_WCHAR ch = str.GetAt(i);
2589 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || (ch >= L'0 ' && ch <= L'9')) {
2590 continue;
2591 } else {
2592 if (i == aPos - 1) {
2593 return FALSE;
2594 }
2595 str = str.Right(str.GetLength() - i - 1);
2596 break;
2597 }
2598 }
2599 aPos = str.Find(L'@');
2600 if (aPos < 1) {
2601 return FALSE;
2602 }
2603 CFX_WideString strtemp = L"";
2604 for (i = 0; i < aPos; i++) {
2605 FX_WCHAR wch = str.GetAt(i);
2606 if (wch >= L'a' && wch <= L'z') {
2607 break;
2608 } else {
2609 strtemp = str.Right(str.GetLength() - i + 1);
2610 }
2611 }
2612 if (strtemp != L"") {
2613 str = strtemp;
2614 }
2615 aPos = str.Find(L'@');
2616 if (aPos < 1) {
2617 return FALSE;
2618 }
2619 str.TrimRight(L'.');
2620 strtemp = str;
2621 int ePos = str.Find(L'.');
2622 if (ePos == -1) {
2623 return FALSE;
2624 }
2625 while (ePos != -1) {
2626 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1);
2627 ePos = strtemp.Find('.');
2628 }
2629 ePos = strtemp.GetLength();
2630 for (i = 0; i < ePos; i++) {
2631 FX_WCHAR wch = str.GetAt(i);
2632 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {
2633 continue;
2634 } else {
2635 str = str.Left(str.GetLength() - ePos + i + 1);
2636 ePos = ePos - i - 1;
2637 break;
2638 }
2639 }
2640 int nLen = str.GetLength();
2641 for (i = aPos + 1; i < nLen - ePos; i++) {
2642 FX_WCHAR wch = str.GetAt(i);
2643 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') || (wch > = L'0' && wch <= L'9')) {
2644 continue;
2645 } else {
2646 return FALSE;
2647 }
2648 }
2649 if (str.Find(L"mailto:") == -1) {
2650 str = L"mailto:" + str;
2651 }
2652 return TRUE;
2653 }
2654 FX_BOOL CPDF_LinkExtract::AppendToLinkList(int start, int count, const CFX_WideS tring& strUrl)
2655 {
2656 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
2657 linkInfo->m_strUrl = strUrl;
2658 linkInfo->m_Start = start;
2659 linkInfo->m_Count = count;
2660 m_LinkList.Add(linkInfo);
2661 return TRUE;
2662 }
2663 CFX_WideString CPDF_LinkExtract::GetURL(int index) const
2664 {
2665 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2666 return L"";
2667 }
2668 CPDF_LinkExt* link = NULL;
2669 link = m_LinkList.GetAt(index);
2670 if (!link) {
2671 return L"";
2672 }
2673 return link->m_strUrl;
2674 }
2675 void CPDF_LinkExtract::GetBoundedSegment(int index, int& start, int& count) cons t
2676 {
2677 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2678 return ;
2679 }
2680 CPDF_LinkExt* link = NULL;
2681 link = m_LinkList.GetAt(index);
2682 if (!link) {
2683 return ;
2684 }
2685 start = link->m_Start;
2686 count = link->m_Count;
2687 }
2688 void CPDF_LinkExtract::GetRects(int index, CFX_RectArray& rects) const
2689 {
2690 if (!m_IsParserd || index < 0 || index >= m_LinkList.GetSize()) {
2691 return;
2692 }
2693 CPDF_LinkExt* link = NULL;
2694 link = m_LinkList.GetAt(index);
2695 if (!link) {
2696 return ;
2697 }
2698 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2699 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698