| Index: core/fpdftext/cpdf_textpagefind.cpp
|
| diff --git a/core/fpdftext/cpdf_textpagefind.cpp b/core/fpdftext/cpdf_textpagefind.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..a67bdf15d5eae16585e5dcf613d1369c9444ba3c
|
| --- /dev/null
|
| +++ b/core/fpdftext/cpdf_textpagefind.cpp
|
| @@ -0,0 +1,407 @@
|
| +// Copyright 2016 PDFium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
|
| +
|
| +#include "core/fpdftext/include/cpdf_textpagefind.h"
|
| +
|
| +#include <cwchar>
|
| +#include <cwctype>
|
| +#include <vector>
|
| +
|
| +#include "core/fpdftext/include/cpdf_textpage.h"
|
| +#include "core/fxcrt/include/fx_string.h"
|
| +#include "core/fxcrt/include/fx_system.h"
|
| +#include "third_party/base/stl_util.h"
|
| +
|
| +namespace {
|
| +
|
| +// Classifies |curChar| for the text search. Returns FALSE for code points
|
| +// below 255, for the Arabic and Cyrillic blocks listed in the table, for
|
| +// General Punctuation and for U+2113; returns TRUE for every other value.
|
| +// FindNext() uses the result to decide whether two adjacent query pieces may
|
| +// match with no whitespace between them.
|
| +FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
|
| +  // Inclusive [first, last] ranges that are never "ignore space" characters.
|
| +  static const int kKeepSpaceRanges[][2] = {
|
| +      {0x0600, 0x06FF},  // Arabic
|
| +      {0xFE70, 0xFEFF},  // Arabic Presentation Forms-B
|
| +      {0xFB50, 0xFDFF},  // Arabic Presentation Forms-A
|
| +      {0x0400, 0x04FF},  // Cyrillic
|
| +      {0x0500, 0x052F},  // Cyrillic Supplement
|
| +      {0xA640, 0xA69F},  // Cyrillic Extended-B
|
| +      {0x2DE0, 0x2DFF},  // Cyrillic Extended-A
|
| +      {0x2000, 0x206F},  // General Punctuation
|
| +  };
|
| +  static const int kRangeCount = static_cast<int>(
|
| +      sizeof(kKeepSpaceRanges) / sizeof(kKeepSpaceRanges[0]));
|
| +
|
| +  // Low code points and U+2113 (decimal 8467) are handled by value.
|
| +  if (curChar < 255 || curChar == 8467)
|
| +    return FALSE;
|
| +
|
| +  for (int i = 0; i < kRangeCount; ++i) {
|
| +    if (curChar >= kKeepSpaceRanges[i][0] && curChar <= kKeepSpaceRanges[i][1])
|
| +      return FALSE;
|
| +  }
|
| +  return TRUE;
|
| +}
|
| +
|
| +} // namespace
|
| +
|
| +// Builds a finder bound to |pTextPage|. The page text is cached in m_strText
|
| +// and m_CharIndex is filled with a flat list of (first char index, run
|
| +// length) pairs, one pair per run of consecutive characters whose flag is
|
| +// FPDFTEXT_CHAR_NORMAL or FPDFTEXT_CHAR_GENERATED.
|
| +// |pTextPage| is dereferenced without a null check here.
|
| +CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
|
| + : m_pTextPage(pTextPage),
|
| + m_flags(0),
|
| + m_findNextStart(-1),
|
| + m_findPreStart(-1),
|
| + m_bMatchCase(FALSE),
|
| + m_bMatchWholeWord(FALSE),
|
| + m_resStart(0),
|
| + m_resEnd(-1),
|
| + m_IsFind(FALSE) {
|
| + m_strText = m_pTextPage->GetPageText();
|
| + int nCount = pTextPage->CountChars();
|
| + // Seed the list with the tentative start (0) of the first run.
|
| + if (nCount)
|
| + m_CharIndex.push_back(0);
|
| + // Invariant: an odd size means the last entry is a run start that has no
|
| + // length yet; an even size means the last entry is the length of a run.
|
| + for (int i = 0; i < nCount; i++) {
|
| + FPDF_CHAR_INFO info;
|
| + pTextPage->GetCharInfo(i, &info);
|
| + int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
|
| + if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
|
| + info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
|
| + if (indexSize % 2) {
|
| + // First character of a run: open its length at 1.
|
| + m_CharIndex.push_back(1);
|
| + } else {
|
| + // Defensive guard against an empty list before indexing the last entry.
|
| + if (indexSize <= 0)
|
| + continue;
|
| + // Run continues: extend its length.
|
| + m_CharIndex[indexSize - 1] += 1;
|
| + }
|
| + } else {
|
| + if (indexSize % 2) {
|
| + // Defensive guard; an odd size is never <= 0.
|
| + if (indexSize <= 0)
|
| + continue;
|
| + // No run has started yet: move the pending start past this character.
|
| + m_CharIndex[indexSize - 1] = i + 1;
|
| + } else {
|
| + // The current run just ended: record the tentative start of the next.
|
| + m_CharIndex.push_back(i + 1);
|
| + }
|
| + }
|
| + }
|
| + // Drop a trailing start that never received a length.
|
| + int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
|
| + if (indexSize % 2)
|
| + m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
|
| +}
|
| +
|
| +// The destructor performs no explicit work; m_pTextPage is not deleted here.
|
| +CPDF_TextPageFind::~CPDF_TextPageFind() = default;
|
| +
|
| +// Forwards |index| to CPDF_TextPage::CharIndexFromTextIndex() on the bound
|
| +// text page and returns whatever that call yields.
|
| +int CPDF_TextPageFind::GetCharIndex(int index) const {
|
| +  const int charIndex = m_pTextPage->CharIndexFromTextIndex(index);
|
| +  return charIndex;
|
| +}
|
| +
|
| +// Prepares a new search for |findwhat|; no matching happens here.
|
| +//   findwhat - the text to look for.
|
| +//   flags    - FPDFTEXT_* bit flags. MATCHCASE and MATCHWHOLEWORD are decoded
|
| +//              here; the complete value is stored in m_flags.
|
| +//   startPos - index into the page text where FindNext() starts. With -1,
|
| +//              FindPrev() starts from the last text position and FindNext()
|
| +//              returns FALSE.
|
| +// Returns FALSE when there is no text page or the query yields no pieces.
|
| +// Returns TRUE otherwise, including when the page text is empty (m_IsFind is
|
| +// then FALSE).
|
| +FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
|
| +                                     int flags,
|
| +                                     int startPos) {
|
| +  if (!m_pTextPage)
|
| +    return FALSE;
|
| +
|
| +  // Reload the page text when nothing is cached or the case mode changes,
|
| +  // because a case-insensitive search lower-cases the cached copy below.
|
| +  const int matchCaseBits = flags & FPDFTEXT_MATCHCASE;
|
| +  if (m_strText.IsEmpty() || m_bMatchCase != matchCaseBits)
|
| +    m_strText = m_pTextPage->GetPageText();
|
| +
|
| +  // m_findWhat keeps the query exactly as the caller passed it.
|
| +  m_findWhat = findwhat;
|
| +  m_flags = flags;
|
| +  m_bMatchCase = matchCaseBits;
|
| +  if (m_strText.IsEmpty()) {
|
| +    m_IsFind = FALSE;
|
| +    return TRUE;
|
| +  }
|
| +
|
| +  CFX_WideString query = findwhat;
|
| +  const FX_STRSIZE queryLen = query.GetLength();
|
| +  if (!m_bMatchCase) {
|
| +    query.MakeLower();
|
| +    m_strText.MakeLower();
|
| +  }
|
| +  m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
|
| +
|
| +  m_findNextStart = startPos;
|
| +  m_findPreStart = (startPos == -1) ? m_strText.GetLength() - 1 : startPos;
|
| +
|
| +  // A query that is empty or made only of spaces is stored as one piece;
|
| +  // anything else is split by ExtractFindWhat().
|
| +  m_csFindWhatArray.clear();
|
| +  int firstNonSpace = 0;
|
| +  while (firstNonSpace < queryLen && query.GetAt(firstNonSpace) == ' ')
|
| +    ++firstNonSpace;
|
| +  if (firstNonSpace < queryLen)
|
| +    ExtractFindWhat(query);
|
| +  else
|
| +    m_csFindWhatArray.push_back(query);
|
| +  if (m_csFindWhatArray.empty())
|
| +    return FALSE;
|
| +
|
| +  m_IsFind = TRUE;
|
| +  m_resStart = 0;
|
| +  m_resEnd = -1;
|
| +  return TRUE;
|
| +}
|
| +
|
| +// Searches forward from m_findNextStart for the query pieces stored in
|
| +// m_csFindWhatArray. On success it sets m_resStart/m_resEnd (indexes into
|
| +// m_strText), fills m_resArray with the rectangles of the match, advances
|
| +// m_findNextStart/m_findPreStart and returns TRUE. Returns FALSE when there is
|
| +// no text page, when m_findNextStart is -1, when the text is empty, when the
|
| +// start lies past the end of the text, or when a piece cannot be found.
|
| +FX_BOOL CPDF_TextPageFind::FindNext() {
|
| + if (!m_pTextPage)
|
| + return FALSE;
|
| + m_resArray.clear();
|
| + // Note: this early exit leaves m_IsFind unchanged.
|
| + if (m_findNextStart == -1)
|
| + return FALSE;
|
| + if (m_strText.IsEmpty()) {
|
| + m_IsFind = FALSE;
|
| + return m_IsFind;
|
| + }
|
| + int strLen = m_strText.GetLength();
|
| + if (m_findNextStart > strLen - 1) {
|
| + m_IsFind = FALSE;
|
| + return m_IsFind;
|
| + }
|
| + int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
|
| + int nResultPos = 0;
|
| + int nStartPos = 0;
|
| + nStartPos = m_findNextStart;
|
| + // Set when the first query piece is empty and is not also the last piece.
|
| + bool bSpaceStart = false;
|
| + // Match the pieces one after another. Setting iWord to -1 restarts the
|
| + // matching at the first piece (the loop increment brings it back to 0).
|
| + for (int iWord = 0; iWord < nCount; iWord++) {
|
| + CFX_WideString csWord = m_csFindWhatArray[iWord];
|
| + if (csWord.IsEmpty()) {
|
| + if (iWord == nCount - 1) {
|
| + // Empty last piece: the character at nStartPos must be a line feed,
|
| + // space, carriage return or 160 (U+00A0, no-break space).
|
| + FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
|
| + if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
|
| + strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
|
| + nResultPos = nStartPos + 1;
|
| + break;
|
| + }
|
| + // Otherwise start over from the first piece at the current position.
|
| + iWord = -1;
|
| + } else if (iWord == 0) {
|
| + bSpaceStart = true;
|
| + }
|
| + continue;
|
| + }
|
| + int endIndex;
|
| + nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
|
| + if (nResultPos == -1) {
|
| + m_IsFind = FALSE;
|
| + return m_IsFind;
|
| + }
|
| + // Index of the last character of this piece's occurrence.
|
| + endIndex = nResultPos + csWord.GetLength() - 1;
|
| + if (iWord == 0)
|
| + m_resStart = nResultPos;
|
| + FX_BOOL bMatch = TRUE;
|
| + if (iWord != 0 && !bSpaceStart) {
|
| + int PreResEndPos = nStartPos;
|
| + int curChar = csWord.GetAt(0);
|
| + CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
|
| + int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
|
| + // Two pieces that touch directly only match when at least one of the
|
| + // boundary characters is an "ignore space" character.
|
| + if (nStartPos == nResultPos &&
|
| + !(IsIgnoreSpaceCharacter(lastChar) ||
|
| + IsIgnoreSpaceCharacter(curChar))) {
|
| + bMatch = FALSE;
|
| + }
|
| + // Everything between the previous piece and this one must be whitespace.
|
| + for (int d = PreResEndPos; d < nResultPos; d++) {
|
| + FX_WCHAR strInsert = m_strText.GetAt(d);
|
| + if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
|
| + strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
|
| + bMatch = FALSE;
|
| + break;
|
| + }
|
| + }
|
| + } else if (bSpaceStart) {
|
| + // The query began with an empty piece: the character before this
|
| + // occurrence must be whitespace, and it then becomes the match start.
|
| + if (nResultPos > 0) {
|
| + FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
|
| + if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
|
| + strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
|
| + bMatch = FALSE;
|
| + m_resStart = nResultPos;
|
| + } else {
|
| + m_resStart = nResultPos - 1;
|
| + }
|
| + }
|
| + }
|
| + if (m_bMatchWholeWord && bMatch) {
|
| + bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
|
| + }
|
| + // The next piece is searched for right after this occurrence.
|
| + nStartPos = endIndex + 1;
|
| + if (!bMatch) {
|
| + // Restart from the first piece, past the rejected candidate. With
|
| + // bSpaceStart, entry 0 is the empty piece, so entry 1 supplies the length.
|
| + iWord = -1;
|
| + if (bSpaceStart)
|
| + nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
|
| + else
|
| + nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
|
| + }
|
| + }
|
| + m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
|
| + m_IsFind = TRUE;
|
| + // Convert the text range to page character indexes to fetch the rectangles.
|
| + int resStart = GetCharIndex(m_resStart);
|
| + int resEnd = GetCharIndex(m_resEnd);
|
| + m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
|
| + // With FPDFTEXT_CONSECUTIVE the next search may overlap this match;
|
| + // otherwise it resumes outside of it.
|
| + if (m_flags & FPDFTEXT_CONSECUTIVE) {
|
| + m_findNextStart = m_resStart + 1;
|
| + m_findPreStart = m_resEnd - 1;
|
| + } else {
|
| + m_findNextStart = m_resEnd + 1;
|
| + m_findPreStart = m_resStart - 1;
|
| + }
|
| + return m_IsFind;
|
| +}
|
| +
|
| +// Finds the match preceding the current position. A second finder replays the
|
| +// forward search from its default start, and the last match whose final
|
| +// position does not exceed m_findPreStart becomes the result. On success
|
| +// m_resStart/m_resEnd, m_resArray and both search cursors are updated and TRUE
|
| +// is returned; otherwise m_IsFind is cleared and FALSE is returned. Without a
|
| +// text page the function returns FALSE and touches nothing.
|
| +FX_BOOL CPDF_TextPageFind::FindPrev() {
|
| +  if (!m_pTextPage)
|
| +    return FALSE;
|
| +
|
| +  m_resArray.clear();
|
| +  // Every failure path below leaves m_IsFind cleared.
|
| +  m_IsFind = FALSE;
|
| +  if (m_strText.IsEmpty() || m_findPreStart < 0)
|
| +    return FALSE;
|
| +
|
| +  CPDF_TextPageFind scanner(m_pTextPage);
|
| +  if (!scanner.FindFirst(m_findWhat, m_flags))
|
| +    return FALSE;
|
| +
|
| +  // Remember the most recent match that still ends at or before the limit.
|
| +  // NOTE(review): the sum below is built from page character indexes
|
| +  // (GetCurOrder/GetMatchedCount) while m_findPreStart is derived from text
|
| +  // indexes elsewhere in this file - confirm the comparison is intended.
|
| +  int lastOrder = -1;
|
| +  int lastCount = 0;
|
| +  while (scanner.FindNext()) {
|
| +    const int curOrder = scanner.GetCurOrder();
|
| +    const int curCount = scanner.GetMatchedCount();
|
| +    if (curOrder + curCount - 1 > m_findPreStart)
|
| +      break;
|
| +    lastOrder = curOrder;
|
| +    lastCount = curCount;
|
| +  }
|
| +  if (lastOrder == -1)
|
| +    return FALSE;
|
| +
|
| +  m_resStart = m_pTextPage->TextIndexFromCharIndex(lastOrder);
|
| +  m_resEnd = m_pTextPage->TextIndexFromCharIndex(lastOrder + lastCount - 1);
|
| +  m_IsFind = TRUE;
|
| +  m_resArray = m_pTextPage->GetRectArray(lastOrder, lastCount);
|
| +
|
| +  // With FPDFTEXT_CONSECUTIVE the following search may overlap this match.
|
| +  const bool bConsecutive = (m_flags & FPDFTEXT_CONSECUTIVE) != 0;
|
| +  m_findNextStart = bConsecutive ? m_resStart + 1 : m_resEnd + 1;
|
| +  m_findPreStart = bConsecutive ? m_resEnd - 1 : m_resStart - 1;
|
| +  return m_IsFind;
|
| +}
|
| +
|
| +// Splits |findwhat| into the pieces FindNext() matches one by one and appends
|
| +// them to m_csFindWhatArray. The query is cut at spaces via
|
| +// ExtractSubString(); an empty substring is stored as an empty piece. Inside a
|
| +// word, every character for which IsIgnoreSpaceCharacter() is TRUE becomes a
|
| +// piece of its own, with the text before it stored as a separate piece.
|
| +// Does nothing when |findwhat| is empty.
|
| +void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
|
| + if (findwhat.IsEmpty())
|
| + return;
|
| + // Ordinal of the space-separated substring to extract next.
|
| + int index = 0;
|
| + while (1) {
|
| + CFX_WideString csWord = TEXT_EMPTY;
|
| + int ret =
|
| + ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
|
| + if (csWord.IsEmpty()) {
|
| + if (ret) {
|
| + // The substring exists but is empty: keep it as an empty piece.
|
| + m_csFindWhatArray.push_back(L"");
|
| + index++;
|
| + continue;
|
| + } else {
|
| + // No substring with this ordinal: the whole query has been consumed.
|
| + break;
|
| + }
|
| + }
|
| + int pos = 0;
|
| + while (pos < csWord.GetLength()) {
|
| + CFX_WideString curStr = csWord.Mid(pos, 1);
|
| + FX_WCHAR curChar = csWord.GetAt(pos);
|
| + if (IsIgnoreSpaceCharacter(curChar)) {
|
| + // 0x2019 after the first position stays inside the current word.
|
| + // NOTE(review): IsIgnoreSpaceCharacter() in this file returns FALSE for
|
| + // 0x2000-0x206F, which contains 0x2019, so this branch looks
|
| + // unreachable - confirm.
|
| + if (pos > 0 && curChar == 0x2019) {
|
| + pos++;
|
| + continue;
|
| + }
|
| + // Store the text before the character, then the character itself.
|
| + if (pos > 0)
|
| + m_csFindWhatArray.push_back(csWord.Mid(0, pos));
|
| + m_csFindWhatArray.push_back(curStr);
|
| + if (pos == csWord.GetLength() - 1) {
|
| + // It was the last character: nothing is left of this word.
|
| + csWord.clear();
|
| + break;
|
| + }
|
| + // Continue scanning with the remainder of the word.
|
| + csWord = csWord.Right(csWord.GetLength() - pos - 1);
|
| + pos = 0;
|
| + continue;
|
| + }
|
| + pos++;
|
| + }
|
| + // Whatever is left of the word is stored as one piece.
|
| + if (!csWord.IsEmpty())
|
| + m_csFindWhatArray.push_back(csWord);
|
| + index++;
|
| + }
|
| +}
|
| +
|
| +// Decides whether csPageText[startPos..endPos] stands alone as a word.
|
| +//   csPageText - the text that contains the candidate match.
|
| +//   startPos   - index of the first matched character.
|
| +//   endPos     - index of the last matched character (inclusive).
|
| +// Returns FALSE for an empty range, or when the character directly before or
|
| +// after the range is a word character as defined below. A one-character match
|
| +// whose code is above 255 is always accepted.
|
| +FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
|
| +                                            int startPos,
|
| +                                            int endPos) {
|
| +  const int matchLen = endPos - startPos + 1;
|
| +  if (matchLen < 1)
|
| +    return FALSE;
|
| +  if (matchLen == 1 && csPageText.GetAt(startPos) > 255)
|
| +    return TRUE;
|
| +
|
| +  // Neighbours of the match; 0 means there is none at that edge of the text.
|
| +  FX_WCHAR before = 0;
|
| +  if (startPos > 0)
|
| +    before = csPageText.GetAt(startPos - 1);
|
| +  FX_WCHAR after = 0;
|
| +  if (endPos + 1 < csPageText.GetLength())
|
| +    after = csPageText.GetAt(endPos + 1);
|
| +
|
| +  // A neighbour blocks the match when it lies in 'A'..'z' (a span that also
|
| +  // holds the six ASCII symbols between 'Z' and 'a'), strictly between 0xfb00
|
| +  // and 0xfb06, or is a digit.
|
| +  const bool beforeBlocks = (before >= 'A' && before <= 'z') ||
|
| +                            (before > 0xfb00 && before < 0xfb06) ||
|
| +                            std::iswdigit(before);
|
| +  const bool afterBlocks = (after >= 'A' && after <= 'z') ||
|
| +                           (after > 0xfb00 && after < 0xfb06) ||
|
| +                           std::iswdigit(after);
|
| +  if (beforeBlocks || afterBlocks)
|
| +    return FALSE;
|
| +
|
| +  // A digit at either end of the match must not touch another digit.
|
| +  // NOTE(review): a digit neighbour already fails the iswdigit() test above,
|
| +  // so these two checks look redundant; they are kept as they were.
|
| +  const FX_WCHAR firstMatched = csPageText.GetAt(startPos);
|
| +  if (firstMatched >= L'0' && firstMatched <= L'9' && before >= L'0' &&
|
| +      before <= L'9') {
|
| +    return FALSE;
|
| +  }
|
| +  const FX_WCHAR lastMatched = csPageText.GetAt(endPos);
|
| +  if (lastMatched >= L'0' && lastMatched <= L'9' && after >= L'0' &&
|
| +      after <= L'9') {
|
| +    return FALSE;
|
| +  }
|
| +  return TRUE;
|
| +}
|
| +
|
| +// Copies one |chSep|-separated substring of |lpszFullString| into |rString|.
|
| +//   rString        - receives the substring; cleared when it does not exist.
|
| +//   lpszFullString - NUL-terminated text to cut; may be null.
|
| +//   iSubString     - zero-based ordinal of the wanted substring. Runs of
|
| +//                    consecutive separators count as a single boundary once
|
| +//                    the first substring has been skipped.
|
| +//   chSep          - the separator character.
|
| +// Returns FALSE when |lpszFullString| is null or holds fewer substrings than
|
| +// requested, TRUE otherwise (the copied substring may be empty).
|
| +FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
|
| +                                            const FX_WCHAR* lpszFullString,
|
| +                                            int iSubString,
|
| +                                            FX_WCHAR chSep) {
|
| +  if (!lpszFullString)
|
| +    return FALSE;
|
| +
|
| +  // Step over |iSubString| separator runs to reach the wanted substring.
|
| +  const FX_WCHAR* pCursor = lpszFullString;
|
| +  for (int remaining = iSubString; remaining != 0; --remaining) {
|
| +    pCursor = std::wcschr(pCursor, chSep);
|
| +    if (!pCursor) {
|
| +      rString.clear();
|
| +      return FALSE;
|
| +    }
|
| +    // Move past the separator that was found and any that directly follow.
|
| +    do {
|
| +      ++pCursor;
|
| +    } while (*pCursor == chSep);
|
| +  }
|
| +
|
| +  // The substring runs to the next separator, or to the end of the text.
|
| +  const FX_WCHAR* pNextSep = std::wcschr(pCursor, chSep);
|
| +  int nLen;
|
| +  if (pNextSep)
|
| +    nLen = static_cast<int>(pNextSep - pCursor);
|
| +  else
|
| +    nLen = static_cast<int>(FXSYS_wcslen(pCursor));
|
| +  ASSERT(nLen >= 0);
|
| +
|
| +  FXSYS_memcpy(rString.GetBuffer(nLen), pCursor, nLen * sizeof(FX_WCHAR));
|
| +  rString.ReleaseBuffer();
|
| +  return TRUE;
|
| +}
|
| +
|
| +// Returns a copy of |str| with its characters in reverse order; an empty
|
| +// input gives an empty result.
|
| +CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
|
| +  CFX_WideString reversed;
|
| +  // Walk from the last character down to the first, appending each one.
|
| +  int pos = str.GetLength();
|
| +  while (pos > 0) {
|
| +    --pos;
|
| +    reversed += str.GetAt(pos);
|
| +  }
|
| +  return reversed;
|
| +}
|
| +
|
| +// Returns the page character index that GetCharIndex() reports for the start
|
| +// of the current result (m_resStart).
|
| +int CPDF_TextPageFind::GetCurOrder() const {
|
| +  const int matchStart = GetCharIndex(m_resStart);
|
| +  return matchStart;
|
| +}
|
| +
|
| +// Returns how many page characters the current result spans: the distance
|
| +// between the character indexes of m_resStart and m_resEnd, both inclusive.
|
| +int CPDF_TextPageFind::GetMatchedCount() const {
|
| +  const int firstChar = GetCharIndex(m_resStart);
|
| +  const int lastChar = GetCharIndex(m_resEnd);
|
| +  return 1 + (lastChar - firstChar);
|
| +}
|
|
|