Index: core/fpdftext/cpdf_textpagefind.cpp |
diff --git a/core/fpdftext/cpdf_textpagefind.cpp b/core/fpdftext/cpdf_textpagefind.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..a67bdf15d5eae16585e5dcf613d1369c9444ba3c |
--- /dev/null |
+++ b/core/fpdftext/cpdf_textpagefind.cpp |
@@ -0,0 +1,407 @@ |
+// Copyright 2016 PDFium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
+ |
+#include "core/fpdftext/include/cpdf_textpagefind.h" |
+ |
+#include <cwchar> |
+#include <cwctype> |
+#include <vector> |
+ |
+#include "core/fpdftext/include/cpdf_textpage.h" |
+#include "core/fxcrt/include/fx_string.h" |
+#include "core/fxcrt/include/fx_system.h" |
+#include "third_party/base/stl_util.h" |
+ |
+namespace { |
+ |
+// Returns FALSE when `curChar` is below 255, equals 8467 (U+2113), or falls |
+// inside one of the inclusive code-point ranges in the table below; returns |
+// TRUE for every other character. |
+FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) { |
+  static const struct { |
+    int lo; |
+    int hi; |
+  } kExcludedRanges[] = { |
+      {0x0600, 0x06FF},  // Arabic |
+      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B |
+      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A |
+      {0x0400, 0x04FF},  // Cyrillic |
+      {0x0500, 0x052F},  // Cyrillic Supplement |
+      {0xA640, 0xA69F},  // Cyrillic Extended-B |
+      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A |
+      {0x2000, 0x206F},  // General Punctuation |
+  }; |
+  if (curChar < 255 || curChar == 8467) |
+    return FALSE; |
+  for (const auto& range : kExcludedRanges) { |
+    if (curChar >= range.lo && curChar <= range.hi) |
+      return FALSE; |
+  } |
+  return TRUE; |
+} |
+ |
+} // namespace |
+ |
+// Creates a finder bound to `pTextPage`, caches the page text and fills |
+// m_CharIndex with alternating entries: a start index followed by the number |
+// of consecutive characters flagged FPDFTEXT_CHAR_NORMAL or |
+// FPDFTEXT_CHAR_GENERATED that begin there. `pTextPage` is dereferenced |
+// unconditionally here. |
+CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage) |
+    : m_pTextPage(pTextPage), |
+      m_flags(0), |
+      m_findNextStart(-1), |
+      m_findPreStart(-1), |
+      m_bMatchCase(FALSE), |
+      m_bMatchWholeWord(FALSE), |
+      m_resStart(0), |
+      m_resEnd(-1), |
+      m_IsFind(FALSE) { |
+  m_strText = m_pTextPage->GetPageText(); |
+  const int nChars = pTextPage->CountChars(); |
+  // Seed the list with a pending start so it is never empty inside the loop. |
+  if (nChars) |
+    m_CharIndex.push_back(0); |
+  for (int iChar = 0; iChar < nChars; ++iChar) { |
+    FPDF_CHAR_INFO charInfo; |
+    pTextPage->GetCharInfo(iChar, &charInfo); |
+    // An odd number of entries means the last one is a start index that has |
+    // no count yet. |
+    const bool bPendingStart = |
+        pdfium::CollectionSize<int>(m_CharIndex) % 2 != 0; |
+    const bool bCounted = charInfo.m_Flag == FPDFTEXT_CHAR_NORMAL || |
+                          charInfo.m_Flag == FPDFTEXT_CHAR_GENERATED; |
+    if (bCounted) { |
+      if (bPendingStart) |
+        m_CharIndex.push_back(1); |
+      else if (!m_CharIndex.empty()) |
+        m_CharIndex.back() += 1; |
+    } else if (bPendingStart) { |
+      // Nothing counted yet for this start: move it past the current char. |
+      m_CharIndex.back() = iChar + 1; |
+    } else { |
+      // Close the current pair and open a new pending start. |
+      m_CharIndex.push_back(iChar + 1); |
+    } |
+  } |
+  // Drop a trailing start index that never received a count. |
+  if (pdfium::CollectionSize<int>(m_CharIndex) % 2) |
+    m_CharIndex.pop_back(); |
+} |
+ |
+// Empty body: the destructor performs no explicit cleanup of its own. |
+CPDF_TextPageFind::~CPDF_TextPageFind() {} |
+ |
+// Converts `index`, a position in the page text string, into a character |
+// index by forwarding to CPDF_TextPage::CharIndexFromTextIndex(). |
+// m_pTextPage is dereferenced without a null check. |
+int CPDF_TextPageFind::GetCharIndex(int index) const { |
+  return m_pTextPage->CharIndexFromTextIndex(index); |
+} |
+ |
+// Prepares a search for `findwhat` over the cached page text. |
+// |
+// `flags` is a bit mask: FPDFTEXT_MATCHCASE and FPDFTEXT_MATCHWHOLEWORD are |
+// read here and the whole value is stored for FindNext()/FindPrev(). |
+// `startPos` is the text index FindNext() starts from; -1 leaves FindNext() |
+// disabled and makes FindPrev() start from the last text position. |
+// |
+// Returns FALSE when there is no text page or when `findwhat` yields nothing |
+// to search for (for example an empty string); in that case FindNext() and |
+// FindPrev() are disabled as well. Returns TRUE otherwise, including when the |
+// page text itself is empty (m_IsFind is then cleared). |
+FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat, |
+                                     int flags, |
+                                     int startPos) { |
+  if (!m_pTextPage) |
+    return FALSE; |
+  // m_strText is lower-cased in place below for case-insensitive searches, so |
+  // it is re-read whenever the requested case mode differs from the last one. |
+  if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) |
+    m_strText = m_pTextPage->GetPageText(); |
+  CFX_WideString findwhatStr = findwhat; |
+  m_findWhat = findwhatStr; |
+  m_flags = flags; |
+  m_bMatchCase = flags & FPDFTEXT_MATCHCASE; |
+  if (m_strText.IsEmpty()) { |
+    m_IsFind = FALSE; |
+    return TRUE; |
+  } |
+  FX_STRSIZE len = findwhatStr.GetLength(); |
+  if (!m_bMatchCase) { |
+    findwhatStr.MakeLower(); |
+    m_strText.MakeLower(); |
+  } |
+  m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD; |
+  m_csFindWhatArray.clear(); |
+  if (len > 0) { |
+    // A pattern made only of spaces is searched verbatim; anything else is |
+    // split into words by ExtractFindWhat(). |
+    int i = 0; |
+    while (i < len && findwhatStr.GetAt(i) == ' ') |
+      i++; |
+    if (i < len) |
+      ExtractFindWhat(findwhatStr); |
+    else |
+      m_csFindWhatArray.push_back(findwhatStr); |
+  } |
+  if (m_csFindWhatArray.empty()) { |
+    // Nothing to search for. An empty pattern used to be stored as a single |
+    // empty word, which FindNext() cannot advance past. Disable both search |
+    // directions so that callers ignoring this return value stay safe. |
+    m_findNextStart = -1; |
+    m_findPreStart = -1; |
+    m_IsFind = FALSE; |
+    return FALSE; |
+  } |
+  m_findNextStart = startPos; |
+  if (startPos == -1) |
+    m_findPreStart = m_strText.GetLength() - 1; |
+  else |
+    m_findPreStart = startPos; |
+  m_IsFind = TRUE; |
+  m_resStart = 0; |
+  m_resEnd = -1; |
+  return TRUE; |
+} |
+ |
+// Finds the next occurrence of the words prepared by FindFirst(), starting at |
+// text index m_findNextStart. |
+// |
+// The words in m_csFindWhatArray must appear in order. An empty first word |
+// stands for a leading space in the pattern and an empty last word for a |
+// trailing one. Between two consecutive words only line feed, space, carriage |
+// return or character 160 (no-break space) may appear; two words may touch |
+// directly only if IsIgnoreSpaceCharacter() accepts one of the characters at |
+// the joint. When an attempt is rejected the search restarts from the first |
+// word at a later position. |
+// |
+// On success stores the match in m_resStart/m_resEnd (text indices), fills |
+// m_resArray with the matching rectangles, updates the resume positions for |
+// both directions and returns TRUE. Returns FALSE when nothing is found or |
+// the finder is not in a searchable state. |
+FX_BOOL CPDF_TextPageFind::FindNext() { |
+  if (!m_pTextPage) |
+    return FALSE; |
+  m_resArray.clear(); |
+  if (m_findNextStart == -1) |
+    return FALSE; |
+  if (m_strText.IsEmpty()) { |
+    m_IsFind = FALSE; |
+    return m_IsFind; |
+  } |
+  int strLen = m_strText.GetLength(); |
+  if (m_findNextStart > strLen - 1) { |
+    m_IsFind = FALSE; |
+    return m_IsFind; |
+  } |
+  int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray); |
+  // An empty word list would make back() below undefined, and a list holding |
+  // one empty word can never advance nStartPos, so the loop would not end. |
+  if (nCount == 0 || (nCount == 1 && m_csFindWhatArray[0].IsEmpty())) { |
+    m_IsFind = FALSE; |
+    return m_IsFind; |
+  } |
+  int nResultPos = 0; |
+  int nStartPos = m_findNextStart; |
+  bool bSpaceStart = false; |
+  // Position of the first real word in the current attempt of a pattern with |
+  // a leading space; used to guarantee that a restart moves forward. |
+  int nFirstWordPos = -1; |
+  for (int iWord = 0; iWord < nCount; iWord++) { |
+    const CFX_WideString& csWord = m_csFindWhatArray[iWord]; |
+    if (csWord.IsEmpty()) { |
+      if (iWord == nCount - 1) { |
+        // Trailing space in the pattern: a whitespace character has to |
+        // follow the previous word, otherwise start over. |
+        FX_WCHAR strInsert = m_strText.GetAt(nStartPos); |
+        if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR || |
+            strInsert == TEXT_RETURN_CHAR || strInsert == 160) { |
+          nResultPos = nStartPos + 1; |
+          break; |
+        } |
+        iWord = -1; |
+      } else if (iWord == 0) { |
+        bSpaceStart = true; |
+      } |
+      continue; |
+    } |
+    nResultPos = m_strText.Find(csWord.c_str(), nStartPos); |
+    if (nResultPos == -1) { |
+      m_IsFind = FALSE; |
+      return m_IsFind; |
+    } |
+    int endIndex = nResultPos + csWord.GetLength() - 1; |
+    if (iWord == 0) |
+      m_resStart = nResultPos; |
+    if (bSpaceStart && iWord == 1) |
+      nFirstWordPos = nResultPos; |
+    FX_BOOL bMatch = TRUE; |
+    if (iWord != 0 && !bSpaceStart) { |
+      int PreResEndPos = nStartPos; |
+      int curChar = csWord.GetAt(0); |
+      const CFX_WideString& lastWord = m_csFindWhatArray[iWord - 1]; |
+      int lastChar = lastWord.GetAt(lastWord.GetLength() - 1); |
+      // Words that touch directly are accepted only when one of the two |
+      // characters at the joint passes IsIgnoreSpaceCharacter(). |
+      if (nStartPos == nResultPos && |
+          !(IsIgnoreSpaceCharacter(lastChar) || |
+            IsIgnoreSpaceCharacter(curChar))) { |
+        bMatch = FALSE; |
+      } |
+      // Everything between the previous word and this one must be whitespace. |
+      for (int d = PreResEndPos; d < nResultPos; d++) { |
+        FX_WCHAR strInsert = m_strText.GetAt(d); |
+        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && |
+            strInsert != TEXT_RETURN_CHAR && strInsert != 160) { |
+          bMatch = FALSE; |
+          break; |
+        } |
+      } |
+    } else if (bSpaceStart) { |
+      // Leading space in the pattern: the character in front of the word has |
+      // to be whitespace and becomes part of the reported match. |
+      if (nResultPos > 0) { |
+        FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1); |
+        if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR && |
+            strInsert != TEXT_RETURN_CHAR && strInsert != 160) { |
+          bMatch = FALSE; |
+          m_resStart = nResultPos; |
+        } else { |
+          m_resStart = nResultPos - 1; |
+        } |
+      } |
+    } |
+    if (m_bMatchWholeWord && bMatch) { |
+      bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex); |
+    } |
+    nStartPos = endIndex + 1; |
+    if (!bMatch) { |
+      // Start over from the first word, just past where it was last found. |
+      iWord = -1; |
+      if (bSpaceStart) { |
+        nStartPos = m_resStart + m_csFindWhatArray[1].GetLength(); |
+        // m_resStart may sit one position before the word (on the leading |
+        // whitespace). With a one-character word the restart would then land |
+        // on the occurrence that was just rejected and repeat forever. |
+        if (nStartPos <= nFirstWordPos) |
+          nStartPos = nFirstWordPos + 1; |
+      } else { |
+        nStartPos = m_resStart + m_csFindWhatArray[0].GetLength(); |
+      } |
+    } |
+  } |
+  m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1; |
+  m_IsFind = TRUE; |
+  int resStart = GetCharIndex(m_resStart); |
+  int resEnd = GetCharIndex(m_resEnd); |
+  m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1); |
+  // With FPDFTEXT_CONSECUTIVE the next search may overlap this match; |
+  // otherwise it resumes just outside of it. |
+  if (m_flags & FPDFTEXT_CONSECUTIVE) { |
+    m_findNextStart = m_resStart + 1; |
+    m_findPreStart = m_resEnd - 1; |
+  } else { |
+    m_findNextStart = m_resEnd + 1; |
+    m_findPreStart = m_resStart - 1; |
+  } |
+  return m_IsFind; |
+} |
+ |
+// Finds the closest match whose last position does not exceed |
+// m_findPreStart. A temporary finder replays the same search through |
+// FindFirst()/FindNext() and the last match seen before passing |
+// m_findPreStart is kept. Returns TRUE and updates the result state when such |
+// a match exists, FALSE otherwise. |
+FX_BOOL CPDF_TextPageFind::FindPrev() { |
+  if (!m_pTextPage) |
+    return FALSE; |
+  m_resArray.clear(); |
+  if (m_strText.IsEmpty() || m_findPreStart < 0) { |
+    m_IsFind = FALSE; |
+    return FALSE; |
+  } |
+  CPDF_TextPageFind forwardFinder(m_pTextPage); |
+  if (!forwardFinder.FindFirst(m_findWhat, m_flags)) { |
+    m_IsFind = FALSE; |
+    return FALSE; |
+  } |
+  // Start index and length of the best candidate so far; -1 means none. |
+  int bestOrder = -1; |
+  int bestCount = 0; |
+  while (forwardFinder.FindNext()) { |
+    const int curOrder = forwardFinder.GetCurOrder(); |
+    const int curCount = forwardFinder.GetMatchedCount(); |
+    if (curOrder + curCount - 1 > m_findPreStart) |
+      break; |
+    bestOrder = curOrder; |
+    bestCount = curCount; |
+  } |
+  if (bestOrder == -1) { |
+    m_IsFind = FALSE; |
+    return FALSE; |
+  } |
+  m_resStart = m_pTextPage->TextIndexFromCharIndex(bestOrder); |
+  m_resEnd = m_pTextPage->TextIndexFromCharIndex(bestOrder + bestCount - 1); |
+  m_IsFind = TRUE; |
+  m_resArray = m_pTextPage->GetRectArray(bestOrder, bestCount); |
+  // Same resume rule as FindNext(): overlap the match only when |
+  // FPDFTEXT_CONSECUTIVE is set. |
+  const bool bConsecutive = (m_flags & FPDFTEXT_CONSECUTIVE) != 0; |
+  m_findNextStart = bConsecutive ? m_resStart + 1 : m_resEnd + 1; |
+  m_findPreStart = bConsecutive ? m_resEnd - 1 : m_resStart - 1; |
+  return m_IsFind; |
+} |
+ |
+// Splits `findwhat` into search words and appends them to m_csFindWhatArray. |
+// |
+// The string is cut at spaces via ExtractSubString(); an empty token that was |
+// still extracted successfully is recorded as an empty word. Inside each |
+// token every character accepted by IsIgnoreSpaceCharacter() becomes a word |
+// of its own, with the text before and after it forming separate words. |
+void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) { |
+  if (findwhat.IsEmpty()) |
+    return; |
+  for (int iToken = 0;; ++iToken) { |
+    CFX_WideString token = TEXT_EMPTY; |
+    const int bExtracted = |
+        ExtractSubString(token, findwhat.c_str(), iToken, TEXT_SPACE_CHAR); |
+    if (token.IsEmpty()) { |
+      if (!bExtracted) |
+        break; |
+      m_csFindWhatArray.push_back(L""); |
+      continue; |
+    } |
+    int pos = 0; |
+    while (pos < token.GetLength()) { |
+      const FX_WCHAR ch = token.GetAt(pos); |
+      // The 0x2019 exception can only matter if IsIgnoreSpaceCharacter() |
+      // accepts 0x2019; its General Punctuation range currently rejects it. |
+      const bool bKeepInWord = |
+          !IsIgnoreSpaceCharacter(ch) || (pos > 0 && ch == 0x2019); |
+      if (bKeepInWord) { |
+        ++pos; |
+        continue; |
+      } |
+      // Emit the text before the character, then the character itself. |
+      if (pos > 0) |
+        m_csFindWhatArray.push_back(token.Mid(0, pos)); |
+      m_csFindWhatArray.push_back(token.Mid(pos, 1)); |
+      if (pos == token.GetLength() - 1) { |
+        token.clear(); |
+        break; |
+      } |
+      // Continue scanning with whatever follows the character. |
+      token = token.Right(token.GetLength() - pos - 1); |
+      pos = 0; |
+    } |
+    if (!token.IsEmpty()) |
+      m_csFindWhatArray.push_back(token); |
+  } |
+} |
+ |
+// Decides whether the range [startPos, endPos] of `csPageText` counts as a |
+// whole word by inspecting the characters right before and right after it |
+// (treated as 0 when the range touches either end of the text). |
+// |
+// Returns FALSE for an empty range. A single character above 255 is always |
+// accepted. Otherwise the range is rejected when a neighbour is an ASCII |
+// letter, a digit, or lies in one of the exclusive ranges tested below. |
+FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText, |
+                                            int startPos, |
+                                            int endPos) { |
+  const int char_count = endPos - startPos + 1; |
+  if (char_count < 1) |
+    return FALSE; |
+  if (char_count == 1 && csPageText.GetAt(startPos) > 255) |
+    return TRUE; |
+ |
+  FX_WCHAR char_left = 0; |
+  FX_WCHAR char_right = 0; |
+  if (startPos - 1 >= 0) |
+    char_left = csPageText.GetAt(startPos - 1); |
+  if (startPos + char_count < csPageText.GetLength()) |
+    char_right = csPageText.GetAt(startPos + char_count); |
+ |
+  // Bounds are exclusive on purpose to mirror the established behaviour: the |
+  // first range also covers the punctuation between 'Z' and 'a'. |
+  const auto in_blocking_range = [](FX_WCHAR ch) { |
+    return (ch > 'A' && ch < 'a') || (ch > 'a' && ch < 'z') || |
+           (ch > 0xfb00 && ch < 0xfb06) || std::iswdigit(ch); |
+  }; |
+  if (in_blocking_range(char_left) || in_blocking_range(char_right)) |
+    return FALSE; |
+ |
+  const auto is_ascii_letter = [](FX_WCHAR ch) { |
+    return (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z'); |
+  }; |
+  if (is_ascii_letter(char_left) || is_ascii_letter(char_right)) |
+    return FALSE; |
+ |
+  const auto is_ascii_digit = [](FX_WCHAR ch) { |
+    return ch >= L'0' && ch <= L'9'; |
+  }; |
+  if (char_count > 0) { |
+    // A digit at either edge must not continue into a neighbouring digit. |
+    if (is_ascii_digit(csPageText.GetAt(startPos)) && |
+        is_ascii_digit(char_left)) { |
+      return FALSE; |
+    } |
+    if (is_ascii_digit(csPageText.GetAt(endPos)) && |
+        is_ascii_digit(char_right)) { |
+      return FALSE; |
+    } |
+  } |
+  return TRUE; |
+} |
+ |
+// Copies the `iSubString`-th (zero-based) field of `lpszFullString` into |
+// `rString`, with fields delimited by `chSep`. When skipping to the requested |
+// field, a run of consecutive separators is treated as a single one. |
+// |
+// Returns FALSE when `lpszFullString` is null or has too few separators (in |
+// the latter case `rString` is cleared); returns TRUE otherwise, even if the |
+// extracted field is empty. |
+FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString, |
+                                            const FX_WCHAR* lpszFullString, |
+                                            int iSubString, |
+                                            FX_WCHAR chSep) { |
+  if (!lpszFullString) |
+    return FALSE; |
+  // Step over one separator run per field that has to be skipped. |
+  for (; iSubString != 0; --iSubString) { |
+    lpszFullString = std::wcschr(lpszFullString, chSep); |
+    if (!lpszFullString) { |
+      rString.clear(); |
+      return FALSE; |
+    } |
+    do { |
+      ++lpszFullString; |
+    } while (*lpszFullString == chSep); |
+  } |
+  // The field ends at the next separator or at the end of the string. |
+  const FX_WCHAR* pFieldEnd = std::wcschr(lpszFullString, chSep); |
+  int nLen; |
+  if (pFieldEnd) |
+    nLen = static_cast<int>(pFieldEnd - lpszFullString); |
+  else |
+    nLen = static_cast<int>(FXSYS_wcslen(lpszFullString)); |
+  ASSERT(nLen >= 0); |
+  // NOTE(review): ReleaseBuffer() is called without a length; presumably |
+  // this relies on rString being empty on entry - confirm against |
+  // CFX_WideString. |
+  FX_WCHAR* pDest = rString.GetBuffer(nLen); |
+  FXSYS_memcpy(pDest, lpszFullString, nLen * sizeof(FX_WCHAR)); |
+  rString.ReleaseBuffer(); |
+  return TRUE; |
+} |
+ |
+// Returns a copy of `str` with its characters in reverse order. |
+CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) { |
+  CFX_WideString reversed; |
+  reversed.clear(); |
+  // Walk from the last character down to the first. |
+  for (int remaining = str.GetLength(); remaining > 0; --remaining) |
+    reversed += str.GetAt(remaining - 1); |
+  return reversed; |
+} |
+ |
+// Returns the character index that corresponds to the start of the current |
+// result (m_resStart converted through GetCharIndex()). |
+int CPDF_TextPageFind::GetCurOrder() const { |
+  return GetCharIndex(m_resStart); |
+} |
+ |
+// Returns the length of the current result measured in character indices: |
+// both ends of the result are converted through GetCharIndex() and the |
+// inclusive distance between them is reported. |
+int CPDF_TextPageFind::GetMatchedCount() const { |
+  const int firstChar = GetCharIndex(m_resStart); |
+  const int lastChar = GetCharIndex(m_resEnd); |
+  return lastChar - firstChar + 1; |
+} |