| Index: core/fpdftext/cpdf_textpage.cpp
|
| diff --git a/core/fpdftext/fpdf_text_int.cpp b/core/fpdftext/cpdf_textpage.cpp
|
| similarity index 75%
|
| rename from core/fpdftext/fpdf_text_int.cpp
|
| rename to core/fpdftext/cpdf_textpage.cpp
|
| index fbd9c9c8c1cfbbc40da724d7632f8145ffc5a350..3981cfee40128d95a6457e906894fe3f2001e0ef 100644
|
| --- a/core/fpdftext/fpdf_text_int.cpp
|
| +++ b/core/fpdftext/cpdf_textpage.cpp
|
| @@ -4,10 +4,9 @@
|
|
|
| // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
|
|
|
| +#include "core/fpdftext/include/cpdf_textpage.h"
|
| +
|
| #include <algorithm>
|
| -#include <cctype>
|
| -#include <cwctype>
|
| -#include <memory>
|
| #include <utility>
|
| #include <vector>
|
|
|
| @@ -19,35 +18,12 @@
|
| #include "core/fpdfapi/fpdf_page/include/cpdf_textobject.h"
|
| #include "core/fpdfapi/fpdf_parser/include/cpdf_dictionary.h"
|
| #include "core/fpdfapi/fpdf_parser/include/cpdf_string.h"
|
| -#include "core/fpdftext/include/cpdf_linkextract.h"
|
| -#include "core/fpdftext/include/cpdf_textpage.h"
|
| -#include "core/fpdftext/include/cpdf_textpagefind.h"
|
| #include "core/fpdftext/unicodenormalizationdata.h"
|
| #include "core/fxcrt/fx_bidi.h"
|
| #include "core/fxcrt/include/fx_ext.h"
|
| #include "core/fxcrt/include/fx_ucd.h"
|
| #include "third_party/base/stl_util.h"
|
|
|
| -#define FPDFTEXT_MATCHCASE 0x00000001
|
| -#define FPDFTEXT_MATCHWHOLEWORD 0x00000002
|
| -#define FPDFTEXT_CONSECUTIVE 0x00000004
|
| -
|
| -#define FPDFTEXT_CHAR_ERROR -1
|
| -#define FPDFTEXT_CHAR_NORMAL 0
|
| -#define FPDFTEXT_CHAR_GENERATED 1
|
| -#define FPDFTEXT_CHAR_UNUNICODE 2
|
| -#define FPDFTEXT_CHAR_HYPHEN 3
|
| -#define FPDFTEXT_CHAR_PIECE 4
|
| -
|
| -#define TEXT_SPACE_CHAR L' '
|
| -#define TEXT_LINEFEED_CHAR L'\n'
|
| -#define TEXT_RETURN_CHAR L'\r'
|
| -#define TEXT_EMPTY L""
|
| -#define TEXT_SPACE L" "
|
| -#define TEXT_RETURN_LINEFEED L"\r\n"
|
| -#define TEXT_LINEFEED L"\n"
|
| -#define TEXT_CHARRATIO_GAPDELTA 0.070
|
| -
|
| namespace {
|
|
|
| const FX_FLOAT kDefaultFontSize = 1.0f;
|
| @@ -55,22 +31,6 @@ const uint16_t* const g_UnicodeData_Normalization_Maps[5] = {
|
| nullptr, g_UnicodeData_Normalization_Map1, g_UnicodeData_Normalization_Map2,
|
| g_UnicodeData_Normalization_Map3, g_UnicodeData_Normalization_Map4};
|
|
|
| -FX_BOOL IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
|
| - if (curChar < 255)
|
| - return FALSE;
|
| - if ((curChar >= 0x0600 && curChar <= 0x06FF) ||
|
| - (curChar >= 0xFE70 && curChar <= 0xFEFF) ||
|
| - (curChar >= 0xFB50 && curChar <= 0xFDFF) ||
|
| - (curChar >= 0x0400 && curChar <= 0x04FF) ||
|
| - (curChar >= 0x0500 && curChar <= 0x052F) ||
|
| - (curChar >= 0xA640 && curChar <= 0xA69F) ||
|
| - (curChar >= 0x2DE0 && curChar <= 0x2DFF) || curChar == 8467 ||
|
| - (curChar >= 0x2000 && curChar <= 0x206F)) {
|
| - return FALSE;
|
| - }
|
| - return TRUE;
|
| -}
|
| -
|
| FX_FLOAT NormalizeThreshold(FX_FLOAT threshold) {
|
| if (threshold < 300)
|
| return threshold / 2.0f;
|
| @@ -1587,563 +1547,3 @@ FX_BOOL CPDF_TextPage::IsRectIntersect(const CFX_FloatRect& rect1,
|
| rect.Intersect(rect2);
|
| return !rect.IsEmpty();
|
| }
|
| -
|
| -CPDF_TextPageFind::CPDF_TextPageFind(const CPDF_TextPage* pTextPage)
|
| - : m_pTextPage(pTextPage),
|
| - m_flags(0),
|
| - m_findNextStart(-1),
|
| - m_findPreStart(-1),
|
| - m_bMatchCase(FALSE),
|
| - m_bMatchWholeWord(FALSE),
|
| - m_resStart(0),
|
| - m_resEnd(-1),
|
| - m_IsFind(FALSE) {
|
| - m_strText = m_pTextPage->GetPageText();
|
| - int nCount = pTextPage->CountChars();
|
| - if (nCount) {
|
| - m_CharIndex.push_back(0);
|
| - }
|
| - for (int i = 0; i < nCount; i++) {
|
| - FPDF_CHAR_INFO info;
|
| - pTextPage->GetCharInfo(i, &info);
|
| - int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
|
| - if (info.m_Flag == FPDFTEXT_CHAR_NORMAL ||
|
| - info.m_Flag == FPDFTEXT_CHAR_GENERATED) {
|
| - if (indexSize % 2) {
|
| - m_CharIndex.push_back(1);
|
| - } else {
|
| - if (indexSize <= 0) {
|
| - continue;
|
| - }
|
| - m_CharIndex[indexSize - 1] += 1;
|
| - }
|
| - } else {
|
| - if (indexSize % 2) {
|
| - if (indexSize <= 0) {
|
| - continue;
|
| - }
|
| - m_CharIndex[indexSize - 1] = i + 1;
|
| - } else {
|
| - m_CharIndex.push_back(i + 1);
|
| - }
|
| - }
|
| - }
|
| - int indexSize = pdfium::CollectionSize<int>(m_CharIndex);
|
| - if (indexSize % 2) {
|
| - m_CharIndex.erase(m_CharIndex.begin() + indexSize - 1);
|
| - }
|
| -}
|
| -
|
| -CPDF_TextPageFind::~CPDF_TextPageFind() {}
|
| -
|
| -int CPDF_TextPageFind::GetCharIndex(int index) const {
|
| - return m_pTextPage->CharIndexFromTextIndex(index);
|
| -}
|
| -
|
| -FX_BOOL CPDF_TextPageFind::FindFirst(const CFX_WideString& findwhat,
|
| - int flags,
|
| - int startPos) {
|
| - if (!m_pTextPage) {
|
| - return FALSE;
|
| - }
|
| - if (m_strText.IsEmpty() || m_bMatchCase != (flags & FPDFTEXT_MATCHCASE)) {
|
| - m_strText = m_pTextPage->GetPageText();
|
| - }
|
| - CFX_WideString findwhatStr = findwhat;
|
| - m_findWhat = findwhatStr;
|
| - m_flags = flags;
|
| - m_bMatchCase = flags & FPDFTEXT_MATCHCASE;
|
| - if (m_strText.IsEmpty()) {
|
| - m_IsFind = FALSE;
|
| - return TRUE;
|
| - }
|
| - FX_STRSIZE len = findwhatStr.GetLength();
|
| - if (!m_bMatchCase) {
|
| - findwhatStr.MakeLower();
|
| - m_strText.MakeLower();
|
| - }
|
| - m_bMatchWholeWord = flags & FPDFTEXT_MATCHWHOLEWORD;
|
| - m_findNextStart = startPos;
|
| - if (startPos == -1) {
|
| - m_findPreStart = m_strText.GetLength() - 1;
|
| - } else {
|
| - m_findPreStart = startPos;
|
| - }
|
| - m_csFindWhatArray.clear();
|
| - int i = 0;
|
| - while (i < len) {
|
| - if (findwhatStr.GetAt(i) != ' ') {
|
| - break;
|
| - }
|
| - i++;
|
| - }
|
| - if (i < len) {
|
| - ExtractFindWhat(findwhatStr);
|
| - } else {
|
| - m_csFindWhatArray.push_back(findwhatStr);
|
| - }
|
| - if (m_csFindWhatArray.empty()) {
|
| - return FALSE;
|
| - }
|
| - m_IsFind = TRUE;
|
| - m_resStart = 0;
|
| - m_resEnd = -1;
|
| - return TRUE;
|
| -}
|
| -
|
| -FX_BOOL CPDF_TextPageFind::FindNext() {
|
| - if (!m_pTextPage) {
|
| - return FALSE;
|
| - }
|
| - m_resArray.clear();
|
| - if (m_findNextStart == -1) {
|
| - return FALSE;
|
| - }
|
| - if (m_strText.IsEmpty()) {
|
| - m_IsFind = FALSE;
|
| - return m_IsFind;
|
| - }
|
| - int strLen = m_strText.GetLength();
|
| - if (m_findNextStart > strLen - 1) {
|
| - m_IsFind = FALSE;
|
| - return m_IsFind;
|
| - }
|
| - int nCount = pdfium::CollectionSize<int>(m_csFindWhatArray);
|
| - int nResultPos = 0;
|
| - int nStartPos = 0;
|
| - nStartPos = m_findNextStart;
|
| - FX_BOOL bSpaceStart = FALSE;
|
| - for (int iWord = 0; iWord < nCount; iWord++) {
|
| - CFX_WideString csWord = m_csFindWhatArray[iWord];
|
| - if (csWord.IsEmpty()) {
|
| - if (iWord == nCount - 1) {
|
| - FX_WCHAR strInsert = m_strText.GetAt(nStartPos);
|
| - if (strInsert == TEXT_LINEFEED_CHAR || strInsert == TEXT_SPACE_CHAR ||
|
| - strInsert == TEXT_RETURN_CHAR || strInsert == 160) {
|
| - nResultPos = nStartPos + 1;
|
| - break;
|
| - }
|
| - iWord = -1;
|
| - } else if (iWord == 0) {
|
| - bSpaceStart = TRUE;
|
| - }
|
| - continue;
|
| - }
|
| - int endIndex;
|
| - nResultPos = m_strText.Find(csWord.c_str(), nStartPos);
|
| - if (nResultPos == -1) {
|
| - m_IsFind = FALSE;
|
| - return m_IsFind;
|
| - }
|
| - endIndex = nResultPos + csWord.GetLength() - 1;
|
| - if (iWord == 0) {
|
| - m_resStart = nResultPos;
|
| - }
|
| - FX_BOOL bMatch = TRUE;
|
| - if (iWord != 0 && !bSpaceStart) {
|
| - int PreResEndPos = nStartPos;
|
| - int curChar = csWord.GetAt(0);
|
| - CFX_WideString lastWord = m_csFindWhatArray[iWord - 1];
|
| - int lastChar = lastWord.GetAt(lastWord.GetLength() - 1);
|
| - if (nStartPos == nResultPos &&
|
| - !(IsIgnoreSpaceCharacter(lastChar) ||
|
| - IsIgnoreSpaceCharacter(curChar))) {
|
| - bMatch = FALSE;
|
| - }
|
| - for (int d = PreResEndPos; d < nResultPos; d++) {
|
| - FX_WCHAR strInsert = m_strText.GetAt(d);
|
| - if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
|
| - strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
|
| - bMatch = FALSE;
|
| - break;
|
| - }
|
| - }
|
| - } else if (bSpaceStart) {
|
| - if (nResultPos > 0) {
|
| - FX_WCHAR strInsert = m_strText.GetAt(nResultPos - 1);
|
| - if (strInsert != TEXT_LINEFEED_CHAR && strInsert != TEXT_SPACE_CHAR &&
|
| - strInsert != TEXT_RETURN_CHAR && strInsert != 160) {
|
| - bMatch = FALSE;
|
| - m_resStart = nResultPos;
|
| - } else {
|
| - m_resStart = nResultPos - 1;
|
| - }
|
| - }
|
| - }
|
| - if (m_bMatchWholeWord && bMatch) {
|
| - bMatch = IsMatchWholeWord(m_strText, nResultPos, endIndex);
|
| - }
|
| - nStartPos = endIndex + 1;
|
| - if (!bMatch) {
|
| - iWord = -1;
|
| - if (bSpaceStart) {
|
| - nStartPos = m_resStart + m_csFindWhatArray[1].GetLength();
|
| - } else {
|
| - nStartPos = m_resStart + m_csFindWhatArray[0].GetLength();
|
| - }
|
| - }
|
| - }
|
| - m_resEnd = nResultPos + m_csFindWhatArray.back().GetLength() - 1;
|
| - m_IsFind = TRUE;
|
| - int resStart = GetCharIndex(m_resStart);
|
| - int resEnd = GetCharIndex(m_resEnd);
|
| - m_resArray = m_pTextPage->GetRectArray(resStart, resEnd - resStart + 1);
|
| - if (m_flags & FPDFTEXT_CONSECUTIVE) {
|
| - m_findNextStart = m_resStart + 1;
|
| - m_findPreStart = m_resEnd - 1;
|
| - } else {
|
| - m_findNextStart = m_resEnd + 1;
|
| - m_findPreStart = m_resStart - 1;
|
| - }
|
| - return m_IsFind;
|
| -}
|
| -
|
| -FX_BOOL CPDF_TextPageFind::FindPrev() {
|
| - if (!m_pTextPage) {
|
| - return FALSE;
|
| - }
|
| - m_resArray.clear();
|
| - if (m_strText.IsEmpty() || m_findPreStart < 0) {
|
| - m_IsFind = FALSE;
|
| - return m_IsFind;
|
| - }
|
| - CPDF_TextPageFind findEngine(m_pTextPage);
|
| - FX_BOOL ret = findEngine.FindFirst(m_findWhat, m_flags);
|
| - if (!ret) {
|
| - m_IsFind = FALSE;
|
| - return m_IsFind;
|
| - }
|
| - int order = -1, MatchedCount = 0;
|
| - while (ret) {
|
| - ret = findEngine.FindNext();
|
| - if (ret) {
|
| - int order1 = findEngine.GetCurOrder();
|
| - int MatchedCount1 = findEngine.GetMatchedCount();
|
| - if (((order1 + MatchedCount1) - 1) > m_findPreStart) {
|
| - break;
|
| - }
|
| - order = order1;
|
| - MatchedCount = MatchedCount1;
|
| - }
|
| - }
|
| - if (order == -1) {
|
| - m_IsFind = FALSE;
|
| - return m_IsFind;
|
| - }
|
| - m_resStart = m_pTextPage->TextIndexFromCharIndex(order);
|
| - m_resEnd = m_pTextPage->TextIndexFromCharIndex(order + MatchedCount - 1);
|
| - m_IsFind = TRUE;
|
| - m_resArray = m_pTextPage->GetRectArray(order, MatchedCount);
|
| - if (m_flags & FPDFTEXT_CONSECUTIVE) {
|
| - m_findNextStart = m_resStart + 1;
|
| - m_findPreStart = m_resEnd - 1;
|
| - } else {
|
| - m_findNextStart = m_resEnd + 1;
|
| - m_findPreStart = m_resStart - 1;
|
| - }
|
| - return m_IsFind;
|
| -}
|
| -
|
| -void CPDF_TextPageFind::ExtractFindWhat(const CFX_WideString& findwhat) {
|
| - if (findwhat.IsEmpty()) {
|
| - return;
|
| - }
|
| - int index = 0;
|
| - while (1) {
|
| - CFX_WideString csWord = TEXT_EMPTY;
|
| - int ret =
|
| - ExtractSubString(csWord, findwhat.c_str(), index, TEXT_SPACE_CHAR);
|
| - if (csWord.IsEmpty()) {
|
| - if (ret) {
|
| - m_csFindWhatArray.push_back(L"");
|
| - index++;
|
| - continue;
|
| - } else {
|
| - break;
|
| - }
|
| - }
|
| - int pos = 0;
|
| - while (pos < csWord.GetLength()) {
|
| - CFX_WideString curStr = csWord.Mid(pos, 1);
|
| - FX_WCHAR curChar = csWord.GetAt(pos);
|
| - if (IsIgnoreSpaceCharacter(curChar)) {
|
| - if (pos > 0 && curChar == 0x2019) {
|
| - pos++;
|
| - continue;
|
| - }
|
| - if (pos > 0) {
|
| - m_csFindWhatArray.push_back(csWord.Mid(0, pos));
|
| - }
|
| - m_csFindWhatArray.push_back(curStr);
|
| - if (pos == csWord.GetLength() - 1) {
|
| - csWord.clear();
|
| - break;
|
| - }
|
| - csWord = csWord.Right(csWord.GetLength() - pos - 1);
|
| - pos = 0;
|
| - continue;
|
| - }
|
| - pos++;
|
| - }
|
| - if (!csWord.IsEmpty()) {
|
| - m_csFindWhatArray.push_back(csWord);
|
| - }
|
| - index++;
|
| - }
|
| -}
|
| -
|
| -FX_BOOL CPDF_TextPageFind::IsMatchWholeWord(const CFX_WideString& csPageText,
|
| - int startPos,
|
| - int endPos) {
|
| - FX_WCHAR char_left = 0;
|
| - FX_WCHAR char_right = 0;
|
| - int char_count = endPos - startPos + 1;
|
| - if (char_count < 1) {
|
| - return FALSE;
|
| - }
|
| - if (char_count == 1 && csPageText.GetAt(startPos) > 255) {
|
| - return TRUE;
|
| - }
|
| - if (startPos - 1 >= 0) {
|
| - char_left = csPageText.GetAt(startPos - 1);
|
| - }
|
| - if (startPos + char_count < csPageText.GetLength()) {
|
| - char_right = csPageText.GetAt(startPos + char_count);
|
| - }
|
| - if ((char_left > 'A' && char_left < 'a') ||
|
| - (char_left > 'a' && char_left < 'z') ||
|
| - (char_left > 0xfb00 && char_left < 0xfb06) || std::iswdigit(char_left) ||
|
| - (char_right > 'A' && char_right < 'a') ||
|
| - (char_right > 'a' && char_right < 'z') ||
|
| - (char_right > 0xfb00 && char_right < 0xfb06) ||
|
| - std::iswdigit(char_right)) {
|
| - return FALSE;
|
| - }
|
| - if (!(('A' > char_left || char_left > 'Z') &&
|
| - ('a' > char_left || char_left > 'z') &&
|
| - ('A' > char_right || char_right > 'Z') &&
|
| - ('a' > char_right || char_right > 'z'))) {
|
| - return FALSE;
|
| - }
|
| - if (char_count > 0) {
|
| - if (csPageText.GetAt(startPos) >= L'0' &&
|
| - csPageText.GetAt(startPos) <= L'9' && char_left >= L'0' &&
|
| - char_left <= L'9') {
|
| - return FALSE;
|
| - }
|
| - if (csPageText.GetAt(endPos) >= L'0' && csPageText.GetAt(endPos) <= L'9' &&
|
| - char_right >= L'0' && char_right <= L'9') {
|
| - return FALSE;
|
| - }
|
| - }
|
| - return TRUE;
|
| -}
|
| -
|
| -FX_BOOL CPDF_TextPageFind::ExtractSubString(CFX_WideString& rString,
|
| - const FX_WCHAR* lpszFullString,
|
| - int iSubString,
|
| - FX_WCHAR chSep) {
|
| - if (!lpszFullString) {
|
| - return FALSE;
|
| - }
|
| - while (iSubString--) {
|
| - lpszFullString = wcschr(lpszFullString, chSep);
|
| - if (!lpszFullString) {
|
| - rString.clear();
|
| - return FALSE;
|
| - }
|
| - lpszFullString++;
|
| - while (*lpszFullString == chSep) {
|
| - lpszFullString++;
|
| - }
|
| - }
|
| - const FX_WCHAR* lpchEnd = wcschr(lpszFullString, chSep);
|
| - int nLen = lpchEnd ? (int)(lpchEnd - lpszFullString)
|
| - : (int)FXSYS_wcslen(lpszFullString);
|
| - ASSERT(nLen >= 0);
|
| - FXSYS_memcpy(rString.GetBuffer(nLen), lpszFullString,
|
| - nLen * sizeof(FX_WCHAR));
|
| - rString.ReleaseBuffer();
|
| - return TRUE;
|
| -}
|
| -
|
| -CFX_WideString CPDF_TextPageFind::MakeReverse(const CFX_WideString& str) {
|
| - CFX_WideString str2;
|
| - str2.clear();
|
| - int nlen = str.GetLength();
|
| - for (int i = nlen - 1; i >= 0; i--) {
|
| - str2 += str.GetAt(i);
|
| - }
|
| - return str2;
|
| -}
|
| -
|
| -int CPDF_TextPageFind::GetCurOrder() const {
|
| - return GetCharIndex(m_resStart);
|
| -}
|
| -
|
| -int CPDF_TextPageFind::GetMatchedCount() const {
|
| - int resStart = GetCharIndex(m_resStart);
|
| - int resEnd = GetCharIndex(m_resEnd);
|
| - return resEnd - resStart + 1;
|
| -}
|
| -
|
| -CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
|
| - : m_pTextPage(pTextPage) {}
|
| -
|
| -CPDF_LinkExtract::~CPDF_LinkExtract() {
|
| -}
|
| -
|
| -void CPDF_LinkExtract::ExtractLinks() {
|
| - m_LinkArray.clear();
|
| - if (!m_pTextPage->IsParsed())
|
| - return;
|
| -
|
| - m_strPageText = m_pTextPage->GetPageText(0, -1);
|
| - if (m_strPageText.IsEmpty())
|
| - return;
|
| -
|
| - ParseLink();
|
| -}
|
| -
|
| -void CPDF_LinkExtract::ParseLink() {
|
| - int start = 0, pos = 0;
|
| - int TotalChar = m_pTextPage->CountChars();
|
| - while (pos < TotalChar) {
|
| - FPDF_CHAR_INFO pageChar;
|
| - m_pTextPage->GetCharInfo(pos, &pageChar);
|
| - if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
|
| - pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
|
| - int nCount = pos - start;
|
| - if (pos == TotalChar - 1) {
|
| - nCount++;
|
| - }
|
| - CFX_WideString strBeCheck;
|
| - strBeCheck = m_pTextPage->GetPageText(start, nCount);
|
| - if (strBeCheck.GetLength() > 5) {
|
| - while (strBeCheck.GetLength() > 0) {
|
| - FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
|
| - if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
|
| - strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
|
| - nCount--;
|
| - } else {
|
| - break;
|
| - }
|
| - }
|
| - if (nCount > 5 &&
|
| - (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
|
| - m_LinkArray.push_back({start, nCount, strBeCheck});
|
| - }
|
| - }
|
| - start = ++pos;
|
| - } else {
|
| - pos++;
|
| - }
|
| - }
|
| -}
|
| -
|
| -bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
|
| - CFX_WideString str = strBeCheck;
|
| - str.MakeLower();
|
| - if (str.Find(L"http://www.") != -1) {
|
| - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));
|
| - return true;
|
| - }
|
| - if (str.Find(L"http://") != -1) {
|
| - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));
|
| - return true;
|
| - }
|
| - if (str.Find(L"https://www.") != -1) {
|
| - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));
|
| - return true;
|
| - }
|
| - if (str.Find(L"https://") != -1) {
|
| - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
|
| - return true;
|
| - }
|
| - if (str.Find(L"www.") != -1) {
|
| - strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
|
| - strBeCheck = L"http://" + strBeCheck;
|
| - return true;
|
| - }
|
| - return false;
|
| -}
|
| -
|
| -bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
|
| - int aPos = str.Find(L'@');
|
| - // Invalid when no '@'.
|
| - if (aPos < 1)
|
| - return false;
|
| -
|
| - // Check the local part.
|
| - int pPos = aPos; // Used to track the position of '@' or '.'.
|
| - for (int i = aPos - 1; i >= 0; i--) {
|
| - FX_WCHAR ch = str.GetAt(i);
|
| - if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
|
| - continue;
|
| -
|
| - if (ch != L'.' || i == pPos - 1 || i == 0) {
|
| - if (i == aPos - 1) {
|
| - // There is '.' or invalid char before '@'.
|
| - return FALSE;
|
| - }
|
| - // End extracting for other invalid chars, '.' at the beginning, or
|
| - // consecutive '.'.
|
| - int removed_len = i == pPos - 1 ? i + 2 : i + 1;
|
| - str = str.Right(str.GetLength() - removed_len);
|
| - break;
|
| - }
|
| - // Found a valid '.'.
|
| - pPos = i;
|
| - }
|
| -
|
| - // Check the domain name part.
|
| - aPos = str.Find(L'@');
|
| - if (aPos < 1)
|
| - return false;
|
| -
|
| - str.TrimRight(L'.');
|
| - // At least one '.' in domain name, but not at the beginning.
|
| - // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
|
| - // Check whether we should remove this check.
|
| - int ePos = str.Find(L'.', aPos + 1);
|
| - if (ePos == -1 || ePos == aPos + 1)
|
| - return false;
|
| -
|
| - // Validate all other chars in domain name.
|
| - int nLen = str.GetLength();
|
| - pPos = 0; // Used to track the position of '.'.
|
| - for (int i = aPos + 1; i < nLen; i++) {
|
| - FX_WCHAR wch = str.GetAt(i);
|
| - if (wch == L'-' || FXSYS_iswalnum(wch))
|
| - continue;
|
| -
|
| - if (wch != L'.' || i == pPos + 1) {
|
| - // Domain name should end before invalid char.
|
| - int host_end = i == pPos + 1 ? i - 2 : i - 1;
|
| - if (pPos > 0 && host_end - aPos >= 3) {
|
| - // Trim the ending invalid chars if there is at least one '.' and name.
|
| - str = str.Left(host_end + 1);
|
| - break;
|
| - }
|
| - return false;
|
| - }
|
| - pPos = i;
|
| - }
|
| -
|
| - if (str.Find(L"mailto:") == -1)
|
| - str = L"mailto:" + str;
|
| -
|
| - return true;
|
| -}
|
| -
|
| -CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
|
| - return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
|
| -}
|
| -
|
| -std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
|
| - if (index >= m_LinkArray.size())
|
| - return std::vector<CFX_FloatRect>();
|
| -
|
| - return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
|
| - m_LinkArray[index].m_Count);
|
| -}
|
|
|