OLD | NEW |
(Empty) | |
| 1 // Copyright 2016 PDFium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| 6 |
| 7 #include "core/fpdftext/include/cpdf_textpagefind.h" |
| 8 |
| 9 #include <cwchar> |
| 10 #include <cwctype> |
| 11 #include <vector> |
| 12 |
| 13 #include "core/fpdftext/include/cpdf_textpage.h" |
| 14 #include "core/fxcrt/include/fx_string.h" |
| 15 #include "core/fxcrt/include/fx_system.h" |
| 16 #include "third_party/base/stl_util.h" |
| 17 |
| 18 namespace { |
| 19 |
| 20 FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) { |
| 21 if (curChar < 255 || (curChar >= 0x0600 && curChar <= 0x06FF) || |
| 22 (curChar >= 0xFE70 && curChar <= 0xFEFF) || |
| 23 (curChar >= 0xFB50 && curChar <= 0xFDFF) || |
| 24 (curChar >= 0x0400 && curChar <= 0x04FF) || |
| 25 (curChar >= 0x0500 && curChar <= 0x052F) || |
| 26 (curChar >= 0xA640 && curChar <= 0xA69F) || |
| 27 (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 || |
| 28 (curChar >= 0x2000 && curChar <= 0x206F)) { |
| 29 return FALSE; |
| 30 } |
| 31 return TRUE; |
| 32 } |
| 33 |
| 34 } // namespace |
| 35 |
| 36 CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) |
| 37 : m_pTextPage(pTextPage), |
| 38 m_flags(0), |
| 39 m_findNextStart(-1), |
| 40 m_findPreStart(-1), |
| 41 m_bMatchCase(FALSE), |
| 42 m_bMatchWholeWord(FALSE), |
| 43 m_resStart(0), |
| 44 m_resEnd(-1), |
| 45 m_IsFind(FALSE) { |
| 46 m_strText = m_pTextPage->GetPageText(); |
| 47 int nCount = pTextPage->CountChars(); |
| 48 if (nCount) |
| 49 m_CharIndex.push_back(0); |
| 50 for (int i = 0; i < nCount; i++) { |
| 51 FPDF_CHAR_INFO info; |
| 52 pTextPage->GetCharInfo(i, &info); |
| 53 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); |
| 54 if (info.m_Flag == FPDFTEXT_CHAR_NORMAL || |
| 55 info.m_Flag == FPDFTEXT_CHAR_GENERATED) { |
| 56 if (indexSize % 2) { |
| 57 m_CharIndex.push_back(1); |
| 58 } else { |
| 59 if (indexSize <= 0) |
| 60 continue; |
| 61 m_CharIndex[indexSize - 1] += 1; |
| 62 } |
| 63 } else { |
| 64 if (indexSize % 2) { |
| 65 if (indexSize <= 0) |
| 66 continue; |
| 67 m_CharIndex[indexSize - 1] = i + 1; |
| 68 } else { |
| 69 m_CharIndex.push_back(i + 1); |
| 70 } |
| 71 } |
| 72 } |
| 73 int indexSize = pdfium::CollectionSize<int>(m_CharIndex); |
| 74 if (indexSize % 2) |
| 75 m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1); |
| 76 } |
| 77 |
| 78 CPDF_TextPageFind::~CPDF_TextPageFind() {} |
| 79 |
| 80 int CPDF_TextPageFind::GetCharIndex(int index) const { |
| 81 return m_pTextPage->CharIndexFromTextIndex(index); |
| 82 } |
| 83 |
| 84 FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, |
| 85 int flags, |
| 86 int startPos) { |
| 87 if (!m_pTextPage) |
| 88 return FALSE; |
| 89 if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) |
| 90 m_strText = m_pTextPage->GetPageText(); |
| 91 CFX_WideString findwhatStr = findwhat; |
| 92 m_findWhat = findwhatStr; |
| 93 m_flags = flags; |
| 94 m_bMatchCase = flags & FPDFTEXT_MATCHCASE; |
| 95 if (m_strText.IsEmpty()) { |
| 96 m_IsFind = FALSE; |
| 97 return TRUE; |
| 98 } |
| 99 FX_STRSIZE len = findwhatStr.GetLength(); |
| 100 if (!m_bMatchCase) { |
| 101 findwhatStr.MakeLower(); |
| 102 m_strText.MakeLower(); |
| 103 } |
| 104 m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD; |
| 105 m_findNextStart = startPos; |
| 106 if (startPos == -1) |
| 107 m_findPreStart = m_strText.GetLength() - 1; |
| 108 else |
| 109 m_findPreStart = startPos; |
| 110 m_csFindWhatArray.clear(); |
| 111 int i = 0; |
| 112 while (i < len) { |
| 113 if (findwhatStr.GetAt(i) != ' ') |
| 114 break; |
| 115 i++; |
| 116 } |
| 117 if (i < len) |
| 118 ExtractFindWhat(findwhatStr); |
| 119 else |
| 120 m_csFindWhatArray.push_back(findwhatStr); |
| 121 if (m_csFindWhatArray.empty()) |
| 122 return FALSE; |
| 123 m_IsFind = TRUE; |
| 124 m_resStart = 0; |
| 125 m_resEnd = -1; |
| 126 return TRUE; |
| 127 } |
| 128 |
| 129 FX_BOOL CPDF_TextPageFind::FindNext() { |
| 130 if (!m_pTextPage) |
| 131 return FALSE; |
| 132 m_resArray.clear(); |
| 133 if (m_findNextStart == -1) |
| 134 return FALSE; |
| 135 if (m_strText.IsEmpty()) { |
| 136 m_IsFind = FALSE; |
| 137 return m_IsFind; |
| 138 } |
| 139 int strLen = m_strText.GetLength(); |
| 140 if (m_findNextStart > strLen - 1) { |
| 141 m_IsFind = FALSE; |
| 142 return m_IsFind; |
| 143 } |
| 144 int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray); |
| 145 int nResultPos = 0; |
| 146 int nStartPos = 0; |
| 147 nStartPos = m_findNextStart; |
| 148 bool bSpaceStart = false; |
| 149 for (int iWord = 0; iWord < nCount; iWord++) { |
| 150 CFX_WideString csWord = m_csFindWhatArray[iWord]; |
| 151 if (csWord.IsEmpty()) { |
| 152 if (iWord == nCount - 1) { |
| 153 FX_WCHAR strInsert = m_strText.GetAt(nStartPos); |
| 154 if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR || |
| 155 strInsert == TEXT_RETURN_CHAR || strInsert == 160) { |
| 156 nResultPos = nStartPos + 1; |
| 157 break; |
| 158 } |
| 159 iWord = -1; |
| 160 } else if (iWord == 0) { |
| 161 bSpaceStart = true; |
| 162 } |
| 163 continue; |
| 164 } |
| 165 int endIndex; |
| 166 nResultPos = m_strText.Find(csWord.c_str(), nStartPos); |
| 167 if (nResultPos == -1) { |
| 168 m_IsFind = FALSE; |
| 169 return m_IsFind; |
| 170 } |
| 171 endIndex = nResultPos + csWord.GetLength() - 1; |
| 172 if (iWord == 0) |
| 173 m_resStart = nResultPos; |
| 174 FX_BOOL bMatch = TRUE; |
| 175 if (iWord != 0 && !bSpaceStart) { |
| 176 int PreResEndPos = nStartPos; |
| 177 int curChar = csWord.GetAt(0); |
| 178 CFX_WideString lastWord = m_csFindWhatArray[iWord - 1]; |
| 179 int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); |
| 180 if (nStartPos == nResultPos && |
| 181 !(IsIgnoreSpaceCharacter(lastChar) || |
| 182 IsIgnoreSpaceCharacter(curChar))) { |
| 183 bMatch = FALSE; |
| 184 } |
| 185 for (int d = PreResEndPos; d < nResultPos; d++) { |
| 186 FX_WCHAR strInsert = m_strText.GetAt(d); |
| 187 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && |
| 188 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { |
| 189 bMatch = FALSE; |
| 190 break; |
| 191 } |
| 192 } |
| 193 } else if (bSpaceStart) { |
| 194 if (nResultPos > 0) { |
| 195 FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); |
| 196 if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && |
| 197 strInsert != TEXT_RETURN_CHAR && strInsert != 160) { |
| 198 bMatch = FALSE; |
| 199 m_resStart = nResultPos; |
| 200 } else { |
| 201 m_resStart = nResultPos - 1; |
| 202 } |
| 203 } |
| 204 } |
| 205 if (m_bMatchWholeWord && bMatch) { |
| 206 bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); |
| 207 } |
| 208 nStartPos = endIndex + 1; |
| 209 if (!bMatch) { |
| 210 iWord = -1; |
| 211 if (bSpaceStart) |
| 212 nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); |
| 213 else |
| 214 nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); |
| 215 } |
| 216 } |
| 217 m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1; |
| 218 m_IsFind = TRUE; |
| 219 int resStart = GetCharIndex(m_resStart); |
| 220 int resEnd = GetCharIndex(m_resEnd); |
| 221 m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1); |
| 222 if (m_flags & FPDFTEXT_CONSECUTIVE) { |
| 223 m_findNextStart = m_resStart + 1; |
| 224 m_findPreStart = m_resEnd - 1; |
| 225 } else { |
| 226 m_findNextStart = m_resEnd + 1; |
| 227 m_findPreStart = m_resStart - 1; |
| 228 } |
| 229 return m_IsFind; |
| 230 } |
| 231 |
| 232 FX_BOOL CPDF_TextPageFind::FindPrev() { |
| 233 if (!m_pTextPage) |
| 234 return FALSE; |
| 235 m_resArray.clear(); |
| 236 if (m_strText.IsEmpty() || m_findPreStart < 0) { |
| 237 m_IsFind = FALSE; |
| 238 return m_IsFind; |
| 239 } |
| 240 CPDF_TextPageFind findEngine(m_pTextPage); |
| 241 FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags); |
| 242 if (!ret) { |
| 243 m_IsFind = FALSE; |
| 244 return m_IsFind; |
| 245 } |
| 246 int order = -1, MatchedCount = 0; |
| 247 while (ret) { |
| 248 ret = findEngine.FindNext(); |
| 249 if (ret) { |
| 250 int order1 = findEngine.GetCurOrder(); |
| 251 int MatchedCount1 = findEngine.GetMatchedCount(); |
| 252 if (((order1 + MatchedCount1) - 1) > m_findPreStart) |
| 253 break; |
| 254 order = order1; |
| 255 MatchedCount = MatchedCount1; |
| 256 } |
| 257 } |
| 258 if (order == -1) { |
| 259 m_IsFind = FALSE; |
| 260 return m_IsFind; |
| 261 } |
| 262 m_resStart = m_pTextPage->TextIndexFromCharIndex(order); |
| 263 m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1); |
| 264 m_IsFind = TRUE; |
| 265 m_resArray = m_pTextPage->GetRectArray(order, MatchedCount); |
| 266 if (m_flags & FPDFTEXT_CONSECUTIVE) { |
| 267 m_findNextStart = m_resStart + 1; |
| 268 m_findPreStart = m_resEnd - 1; |
| 269 } else { |
| 270 m_findNextStart = m_resEnd + 1; |
| 271 m_findPreStart = m_resStart - 1; |
| 272 } |
| 273 return m_IsFind; |
| 274 } |
| 275 |
| 276 void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { |
| 277 if (findwhat.IsEmpty()) |
| 278 return; |
| 279 int index = 0; |
| 280 while (1) { |
| 281 CFX_WideString csWord = TEXT_EMPTY; |
| 282 int ret = |
| 283 ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR); |
| 284 if (csWord.IsEmpty()) { |
| 285 if (ret) { |
| 286 m_csFindWhatArray.push_back(L""); |
| 287 index++; |
| 288 continue; |
| 289 } else { |
| 290 break; |
| 291 } |
| 292 } |
| 293 int pos = 0; |
| 294 while (pos < csWord.GetLength()) { |
| 295 CFX_WideString curStr = csWord.Mid(pos, 1); |
| 296 FX_WCHAR curChar = csWord.GetAt(pos); |
| 297 if (IsIgnoreSpaceCharacter(curChar)) { |
| 298 if (pos > 0 && curChar == 0x2019) { |
| 299 pos++; |
| 300 continue; |
| 301 } |
| 302 if (pos > 0) |
| 303 m_csFindWhatArray.push_back(csWord.Mid(0, pos)); |
| 304 m_csFindWhatArray.push_back(curStr); |
| 305 if (pos == csWord.GetLength() - 1) { |
| 306 csWord.clear(); |
| 307 break; |
| 308 } |
| 309 csWord = csWord.Right(csWord.GetLength() - pos - 1); |
| 310 pos = 0; |
| 311 continue; |
| 312 } |
| 313 pos++; |
| 314 } |
| 315 if (!csWord.IsEmpty()) |
| 316 m_csFindWhatArray.push_back(csWord); |
| 317 index++; |
| 318 } |
| 319 } |
| 320 |
| 321 FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, |
| 322 int startPos, |
| 323 int endPos) { |
| 324 FX_WCHAR char_left = 0; |
| 325 FX_WCHAR char_right = 0; |
| 326 int char_count = endPos - startPos + 1; |
| 327 if (char_count < 1) |
| 328 return FALSE; |
| 329 if (char_count == 1 && csPageText.GetAt(startPos) > 255) |
| 330 return TRUE; |
| 331 if (startPos - 1 >= 0) |
| 332 char_left = csPageText.GetAt(startPos - 1); |
| 333 if (startPos + char_count < csPageText.GetLength()) |
| 334 char_right = csPageText.GetAt(startPos + char_count); |
| 335 if ((char_left > 'A' && char_left < 'a') || |
| 336 (char_left > 'a' && char_left < 'z') || |
| 337 (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) || |
| 338 (char_right > 'A' && char_right < 'a') || |
| 339 (char_right > 'a' && char_right < 'z') || |
| 340 (char_right > 0xfb00 && char_right < 0xfb06) || |
| 341 std::iswdigit(char_right)) { |
| 342 return FALSE; |
| 343 } |
| 344 if (!(('A' > char_left || char_left > 'Z') && |
| 345 ('a' > char_left || char_left > 'z') && |
| 346 ('A' > char_right || char_right > 'Z') && |
| 347 ('a' > char_right || char_right > 'z'))) { |
| 348 return FALSE; |
| 349 } |
| 350 if (char_count > 0) { |
| 351 if (csPageText.GetAt(startPos) >= L'0' && |
| 352 csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' && |
| 353 char_left <= L'9') { |
| 354 return FALSE; |
| 355 } |
| 356 if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' && |
| 357 char_right >= L'0' && char_right <= L'9') { |
| 358 return FALSE; |
| 359 } |
| 360 } |
| 361 return TRUE; |
| 362 } |
| 363 |
| 364 FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, |
| 365 const FX_WCHAR* lpszFullString, |
| 366 int iSubString, |
| 367 FX_WCHAR chSep) { |
| 368 if (!lpszFullString) |
| 369 return FALSE; |
| 370 while (iSubString--) { |
| 371 lpszFullString = std::wcschr(lpszFullString, chSep); |
| 372 if (!lpszFullString) { |
| 373 rString.clear(); |
| 374 return FALSE; |
| 375 } |
| 376 lpszFullString++; |
| 377 while (*lpszFullString == chSep) |
| 378 lpszFullString++; |
| 379 } |
| 380 const FX_WCHAR* lpchEnd = std::wcschr(lpszFullString, chSep); |
| 381 int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString) |
| 382 : (int)FXSYS_wcslen(lpszFullString); |
| 383 ASSERT(nLen >= 0); |
| 384 FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString, |
| 385 nLen * sizeof(FX_WCHAR)); |
| 386 rString.ReleaseBuffer(); |
| 387 return TRUE; |
| 388 } |
| 389 |
| 390 CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) { |
| 391 CFX_WideString str2; |
| 392 str2.clear(); |
| 393 int nlen = str.GetLength(); |
| 394 for (int i = nlen - 1; i >= 0; i--) |
| 395 str2 += str.GetAt(i); |
| 396 return str2; |
| 397 } |
| 398 |
| 399 int CPDF_TextPageFind::GetCurOrder() const { |
| 400 return GetCharIndex(m_resStart); |
| 401 } |
| 402 |
| 403 int CPDF_TextPageFind::GetMatchedCount() const { |
| 404 int resStart = GetCharIndex(m_resStart); |
| 405 int resEnd = GetCharIndex(m_resEnd); |
| 406 return resEnd - resStart + 1; |
| 407 } |
OLD | NEW |