core/fpdftext/cpdf_textpage.cpp - Issue 2286723003: Split fpdf_text_int into classes

Unified Diff: core/fpdftext/cpdf_textpage.cpp

Issue 2286723003: Split fpdf_text_int into classes (Closed)

Patch Set: "Fix bots" (created 4 years, 4 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: core/fpdftext/cpdf_textpage.cpp

diff --git a/core/fpdftext/fpdf_text_int.cpp b/core/fpdftext/cpdf_textpage.cpp

similarity index 75%

rename from core/fpdftext/fpdf_text_int.cpp

rename to core/fpdftext/cpdf_textpage.cpp

index fbd9c9c8c1cfbbc40da724d7632f8145ffc5a350..3981cfee40128d95a6457e906894fe3f2001e0ef 100644

--- a/core/fpdftext/fpdf_text_int.cpp

+++ b/core/fpdftext/cpdf_textpage.cpp

@@ -4,10 +4,9 @@

+#include "core/fpdftext/include/cpdf_textpage.h"

#include <algorithm>

-#include <cctype>

-#include <cwctype>

-#include <memory>

#include <utility>

#include <vector>

@@ -19,35 +18,12 @@

#include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h"

#include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h"

#include "core/fpdfapi/fpdf_parser/include/cpdf_string.h"

-#include "core/fpdftext/include/cpdf_linkextract.h"

-#include "core/fpdftext/include/cpdf_textpage.h"

-#include "core/fpdftext/include/cpdf_textpagefind.h"

#include "core/fpdftext/unicodenormalizationdata.h"

#include "core/fxcrt/fx_bidi.h"

#include "core/fxcrt/include/fx_ext.h"

#include "core/fxcrt/include/fx_ucd.h"

#include "third_party/base/stl_util.h"

-#define FPDFTEXT_MATCHCASE 0x00000001

-#define FPDFTEXT_MATCHWHOLEWORD 0x00000002

-#define FPDFTEXT_CONSECUTIVE 0x00000004

-#define FPDFTEXT_CHAR_ERROR -1

-#define FPDFTEXT_CHAR_NORMAL 0

-#define FPDFTEXT_CHAR_GENERATED 1

-#define FPDFTEXT_CHAR_UNUNICODE 2

-#define FPDFTEXT_CHAR_HYPHEN 3

-#define FPDFTEXT_CHAR_PIECE 4

-#define TEXT_SPACE_CHAR L' '

-#define TEXT_LINEFEED_CHAR L'\n'

-#define TEXT_RETURN_CHAR L'\r'

-#define TEXT_EMPTY L""

-#define TEXT_SPACE L" "

-#define TEXT_RETURN_LINEFEED L"\r\n"

-#define TEXT_LINEFEED L"\n"

-#define TEXT_CHARRATIO_GAPDELTA 0.070

namespace {

const FX_FLOAT kDefaultFontSize = 1.0f;

@@ -55,22 +31,6 @@ const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {

nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,

g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};

-FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) {

- if (curChar < 255)

- return FALSE;

- if ((curChar >= 0x0600 && curChar <= 0x06FF) ||

- (curChar >= 0xFE70 && curChar <= 0xFEFF) ||

- (curChar >= 0xFB50 && curChar <= 0xFDFF) ||

- (curChar >= 0x0400 && curChar <= 0x04FF) ||

- (curChar >= 0x0500 && curChar <= 0x052F) ||

- (curChar >= 0xA640 && curChar <= 0xA69F) ||

- (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||

- (curChar >= 0x2000 && curChar <= 0x206F)) {

- return FALSE;

- }

- return TRUE;

FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {

if (threshold < 300)

return threshold / 2.0f;

@@ -1587,563 +1547,3 @@ FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,

rect.Intersect(rect2);

return !rect.IsEmpty();

}

-CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)

- : m_pTextPage(pTextPage),

- m_flags(0),

- m_findNextStart(-1),

- m_findPreStart(-1),

- m_bMatchCase(FALSE),

- m_bMatchWholeWord(FALSE),

- m_resStart(0),

- m_resEnd(-1),

- m_IsFind(FALSE) {

- m_strText = m_pTextPage->GetPageText();

- int nCount = pTextPage->CountChars();

- if (nCount) {

- m_CharIndex.push_back(0);

- }

- for (int i = 0; i < nCount; i++) {

- FPDF_CHAR_INFO info;

- pTextPage->GetCharInfo(i, &info);

- int indexSize = pdfium::CollectionSize<int>(m_CharIndex);

- if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||

- info.m_Flag == FPDFTEXT_CHAR_GENERATED) {

- if (indexSize % 2) {

- m_CharIndex.push_back(1);

- } else {

- if (indexSize <= 0) {

- continue;

- }

- m_CharIndex[indexSize - 1] += 1;

- }

- } else {

- if (indexSize % 2) {

- if (indexSize <= 0) {

- continue;

- }

- m_CharIndex[indexSize - 1] = i + 1;

- } else {

- m_CharIndex.push_back(i + 1);

- }

- int indexSize = pdfium::CollectionSize<int>(m_CharIndex);

- if (indexSize % 2) {

- m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);

- }

-CPDF_TextPageFind::~CPDF_TextPageFind() {}

-int CPDF_TextPageFind::GetCharIndex(int index) const {

- return m_pTextPage->CharIndexFromTextIndex(index);

-FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,

- int flags,

- int startPos) {

- if (!m_pTextPage) {

- return FALSE;

- }

- if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {

- m_strText = m_pTextPage->GetPageText();

- }

- CFX_WideString findwhatStr = findwhat;

- m_findWhat = findwhatStr;

- m_flags = flags;

- m_bMatchCase = flags & FPDFTEXT_MATCHCASE;

- if (m_strText.IsEmpty()) {

- m_IsFind = FALSE;

- return TRUE;

- }

- FX_STRSIZE len = findwhatStr.GetLength();

- if (!m_bMatchCase) {

- findwhatStr.MakeLower();

- m_strText.MakeLower();

- }

- m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;

- m_findNextStart = startPos;

- if (startPos == -1) {

- m_findPreStart = m_strText.GetLength() - 1;

- } else {

- m_findPreStart = startPos;

- }

- m_csFindWhatArray.clear();

- int i = 0;

- while (i < len) {

- if (findwhatStr.GetAt(i) != ' ') {

- break;

- }

- i++;

- }

- if (i < len) {

- ExtractFindWhat(findwhatStr);

- } else {

- m_csFindWhatArray.push_back(findwhatStr);

- }

- if (m_csFindWhatArray.empty()) {

- return FALSE;

- }

- m_IsFind = TRUE;

- m_resStart = 0;

- m_resEnd = -1;

- return TRUE;

-FX_BOOL CPDF_TextPageFind::FindNext() {

- if (!m_pTextPage) {

- return FALSE;

- }

- m_resArray.clear();

- if (m_findNextStart == -1) {

- return FALSE;

- }

- if (m_strText.IsEmpty()) {

- m_IsFind = FALSE;

- return m_IsFind;

- }

- int strLen = m_strText.GetLength();

- if (m_findNextStart > strLen - 1) {

- m_IsFind = FALSE;

- return m_IsFind;

- }

- int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);

- int nResultPos = 0;

- int nStartPos = 0;

- nStartPos = m_findNextStart;

- FX_BOOL bSpaceStart = FALSE;

- for (int iWord = 0; iWord < nCount; iWord++) {

- CFX_WideString csWord = m_csFindWhatArray[iWord];

- if (csWord.IsEmpty()) {

- if (iWord == nCount - 1) {

- FX_WCHAR strInsert = m_strText.GetAt(nStartPos);

- if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||

- strInsert == TEXT_RETURN_CHAR || strInsert == 160) {

- nResultPos = nStartPos + 1;

- break;

- }

- iWord = -1;

- } else if (iWord == 0) {

- bSpaceStart = TRUE;

- }

- continue;

- }

- int endIndex;

- nResultPos = m_strText.Find(csWord.c_str(), nStartPos);

- if (nResultPos == -1) {

- m_IsFind = FALSE;

- return m_IsFind;

- }

- endIndex = nResultPos + csWord.GetLength() - 1;

- if (iWord == 0) {

- m_resStart = nResultPos;

- }

- FX_BOOL bMatch = TRUE;

- if (iWord != 0 && !bSpaceStart) {

- int PreResEndPos = nStartPos;

- int curChar = csWord.GetAt(0);

- CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];

- int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);

- if (nStartPos == nResultPos &&

- !(IsIgnoreSpaceCharacter(lastChar) ||

- IsIgnoreSpaceCharacter(curChar))) {

- bMatch = FALSE;

- }

- for (int d = PreResEndPos; d < nResultPos; d++) {

- FX_WCHAR strInsert = m_strText.GetAt(d);

- if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&

- strInsert != TEXT_RETURN_CHAR && strInsert != 160) {

- bMatch = FALSE;

- break;

- }

- } else if (bSpaceStart) {

- if (nResultPos > 0) {

- FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);

- if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&

- strInsert != TEXT_RETURN_CHAR && strInsert != 160) {

- bMatch = FALSE;

- m_resStart = nResultPos;

- } else {

- m_resStart = nResultPos - 1;

- }

- if (m_bMatchWholeWord && bMatch) {

- bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);

- }

- nStartPos = endIndex + 1;

- if (!bMatch) {

- iWord = -1;

- if (bSpaceStart) {

- nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();

- } else {

- nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();

- }

- m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;

- m_IsFind = TRUE;

- int resStart = GetCharIndex(m_resStart);

- int resEnd = GetCharIndex(m_resEnd);

- m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);

- if (m_flags & FPDFTEXT_CONSECUTIVE) {

- m_findNextStart = m_resStart + 1;

- m_findPreStart = m_resEnd - 1;

- } else {

- m_findNextStart = m_resEnd + 1;

- m_findPreStart = m_resStart - 1;

- }

- return m_IsFind;

-FX_BOOL CPDF_TextPageFind::FindPrev() {

- if (!m_pTextPage) {

- return FALSE;

- }

- m_resArray.clear();

- if (m_strText.IsEmpty() || m_findPreStart < 0) {

- m_IsFind = FALSE;

- return m_IsFind;

- }

- CPDF_TextPageFind findEngine(m_pTextPage);

- FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);

- if (!ret) {

- m_IsFind = FALSE;

- return m_IsFind;

- }

- int order = -1, MatchedCount = 0;

- while (ret) {

- ret = findEngine.FindNext();

- if (ret) {

- int order1 = findEngine.GetCurOrder();

- int MatchedCount1 = findEngine.GetMatchedCount();

- if (((order1 + MatchedCount1) - 1) > m_findPreStart) {

- break;

- }

- order = order1;

- MatchedCount = MatchedCount1;

- }

- if (order == -1) {

- m_IsFind = FALSE;

- return m_IsFind;

- }

- m_resStart = m_pTextPage->TextIndexFromCharIndex(order);

- m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);

- m_IsFind = TRUE;

- m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);

- if (m_flags & FPDFTEXT_CONSECUTIVE) {

- m_findNextStart = m_resStart + 1;

- m_findPreStart = m_resEnd - 1;

- } else {

- m_findNextStart = m_resEnd + 1;

- m_findPreStart = m_resStart - 1;

- }

- return m_IsFind;

-void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {

- if (findwhat.IsEmpty()) {

- return;

- }

- int index = 0;

- while (1) {

- CFX_WideString csWord = TEXT_EMPTY;

- int ret =

- ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);

- if (csWord.IsEmpty()) {

- if (ret) {

- m_csFindWhatArray.push_back(L"");

- index++;

- continue;

- } else {

- break;

- }

- int pos = 0;

- while (pos < csWord.GetLength()) {

- CFX_WideString curStr = csWord.Mid(pos, 1);

- FX_WCHAR curChar = csWord.GetAt(pos);

- if (IsIgnoreSpaceCharacter(curChar)) {

- if (pos > 0 && curChar == 0x2019) {

- pos++;

- continue;

- }

- if (pos > 0) {

- m_csFindWhatArray.push_back(csWord.Mid(0, pos));

- }

- m_csFindWhatArray.push_back(curStr);

- if (pos == csWord.GetLength() - 1) {

- csWord.clear();

- break;

- }

- csWord = csWord.Right(csWord.GetLength() - pos - 1);

- pos = 0;

- continue;

- }

- pos++;

- }

- if (!csWord.IsEmpty()) {

- m_csFindWhatArray.push_back(csWord);

- }

- index++;

- }

-FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,

- int startPos,

- int endPos) {

- FX_WCHAR char_left = 0;

- FX_WCHAR char_right = 0;

- int char_count = endPos - startPos + 1;

- if (char_count < 1) {

- return FALSE;

- }

- if (char_count == 1 && csPageText.GetAt(startPos) > 255) {

- return TRUE;

- }

- if (startPos - 1 >= 0) {

- char_left = csPageText.GetAt(startPos - 1);

- }

- if (startPos + char_count < csPageText.GetLength()) {

- char_right = csPageText.GetAt(startPos + char_count);

- }

- if ((char_left > 'A' && char_left < 'a') ||

- (char_left > 'a' && char_left < 'z') ||

- (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||

- (char_right > 'A' && char_right < 'a') ||

- (char_right > 'a' && char_right < 'z') ||

- (char_right > 0xfb00 && char_right < 0xfb06) ||

- std::iswdigit(char_right)) {

- return FALSE;

- }

- if (!(('A' > char_left || char_left > 'Z') &&

- ('a' > char_left || char_left > 'z') &&

- ('A' > char_right || char_right > 'Z') &&

- ('a' > char_right || char_right > 'z'))) {

- return FALSE;

- }

- if (char_count > 0) {

- if (csPageText.GetAt(startPos) >= L'0' &&

- csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&

- char_left <= L'9') {

- return FALSE;

- }

- if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&

- char_right >= L'0' && char_right <= L'9') {

- return FALSE;

- }

- return TRUE;

-FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,

- const FX_WCHAR* lpszFullString,

- int iSubString,

- FX_WCHAR chSep) {

- if (!lpszFullString) {

- return FALSE;

- }

- while (iSubString--) {

- lpszFullString = wcschr(lpszFullString, chSep);

- if (!lpszFullString) {

- rString.clear();

- return FALSE;

- }

- lpszFullString++;

- while (*lpszFullString == chSep) {

- lpszFullString++;

- }

- const FX_WCHAR* lpchEnd = wcschr(lpszFullString, chSep);

- int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)

- : (int)FXSYS_wcslen(lpszFullString);

- ASSERT(nLen >= 0);

- FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,

- nLen * sizeof(FX_WCHAR));

- rString.ReleaseBuffer();

- return TRUE;

-CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {

- CFX_WideString str2;

- str2.clear();

- int nlen = str.GetLength();

- for (int i = nlen - 1; i >= 0; i--) {

- str2 += str.GetAt(i);

- }

- return str2;

-int CPDF_TextPageFind::GetCurOrder() const {

- return GetCharIndex(m_resStart);

-int CPDF_TextPageFind::GetMatchedCount() const {

- int resStart = GetCharIndex(m_resStart);

- int resEnd = GetCharIndex(m_resEnd);

- return resEnd - resStart + 1;

-CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)

- : m_pTextPage(pTextPage) {}

-CPDF_LinkExtract::~CPDF_LinkExtract() {

-void CPDF_LinkExtract::ExtractLinks() {

- m_LinkArray.clear();

- if (!m_pTextPage->IsParsed())

- return;

- m_strPageText = m_pTextPage->GetPageText(0, -1);

- if (m_strPageText.IsEmpty())

- return;

- ParseLink();

-void CPDF_LinkExtract::ParseLink() {

- int start = 0, pos = 0;

- int TotalChar = m_pTextPage->CountChars();

- while (pos < TotalChar) {

- FPDF_CHAR_INFO pageChar;

- m_pTextPage->GetCharInfo(pos, &pageChar);

- if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||

- pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {

- int nCount = pos - start;

- if (pos == TotalChar - 1) {

- nCount++;

- }

- CFX_WideString strBeCheck;

- strBeCheck = m_pTextPage->GetPageText(start, nCount);

- if (strBeCheck.GetLength() > 5) {

- while (strBeCheck.GetLength() > 0) {

- FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);

- if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {

- strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);

- nCount--;

- } else {

- break;

- }

- if (nCount > 5 &&

- (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {

- m_LinkArray.push_back({start, nCount, strBeCheck});

- }

- start = ++pos;

- } else {

- pos++;

- }

-bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {

- CFX_WideString str = strBeCheck;

- str.MakeLower();

- if (str.Find(L"http://www.") != -1) {

- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));

- return true;

- }

- if (str.Find(L"http://") != -1) {

- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));

- return true;

- }

- if (str.Find(L"https://www.") != -1) {

- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));

- return true;

- }

- if (str.Find(L"https://") != -1) {

- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));

- return true;

- }

- if (str.Find(L"www.") != -1) {

- strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));

- strBeCheck = L"http://" + strBeCheck;

- return true;

- }

- return false;

-bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {

- int aPos = str.Find(L'@');

- // Invalid when no '@'.

- if (aPos < 1)

- return false;

- // Check the local part.

- int pPos = aPos; // Used to track the position of '@' or '.'.

- for (int i = aPos - 1; i >= 0; i--) {

- FX_WCHAR ch = str.GetAt(i);

- if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))

- continue;

- if (ch != L'.' || i == pPos - 1 || i == 0) {

- if (i == aPos - 1) {

- // There is '.' or invalid char before '@'.

- return FALSE;

- }

- // End extracting for other invalid chars, '.' at the beginning, or

- // consecutive '.'.

- int removed_len = i == pPos - 1 ? i + 2 : i + 1;

- str = str.Right(str.GetLength() - removed_len);

- break;

- }

- // Found a valid '.'.

- pPos = i;

- }

- // Check the domain name part.

- aPos = str.Find(L'@');

- if (aPos < 1)

- return false;

- str.TrimRight(L'.');

- // At least one '.' in domain name, but not at the beginning.

- // TODO(weili): RFC5322 allows domain names to be a local name without '.'.

- // Check whether we should remove this check.

- int ePos = str.Find(L'.', aPos + 1);

- if (ePos == -1 || ePos == aPos + 1)

- return false;

- // Validate all other chars in domain name.

- int nLen = str.GetLength();

- pPos = 0; // Used to track the position of '.'.

- for (int i = aPos + 1; i < nLen; i++) {

- FX_WCHAR wch = str.GetAt(i);

- if (wch == L'-' || FXSYS_iswalnum(wch))

- continue;

- if (wch != L'.' || i == pPos + 1) {

- // Domain name should end before invalid char.

- int host_end = i == pPos + 1 ? i - 2 : i - 1;

- if (pPos > 0 && host_end - aPos >= 3) {

- // Trim the ending invalid chars if there is at least one '.' and name.

- str = str.Left(host_end + 1);

- break;

- }

- return false;

- }

- pPos = i;

- }

- if (str.Find(L"mailto:") == -1)

- str = L"mailto:" + str;

- return true;

-CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {

- return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";

-std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {

- if (index >= m_LinkArray.size())

- return std::vector<CFX_FloatRect>();

- return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,

- m_LinkArray[index].m_Count);

Navigation: no previous file with comments | previous file: core/fpdftext/cpdf_linkextract.cpp (press 'k') | next file: core/fpdftext/cpdf_textpagefind.cpp (press 'j') | no next file with comments