core/fpdftext/cpdf_linkextract.cpp - Issue 2286723003: Split fpdf_text_int into classes

Unified Diff: core/fpdftext/cpdf_linkextract.cpp

Issue 2286723003: Split fpdf_text_int into classes (Closed)

Patch Set: Fix bots Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: core/fpdftext/cpdf_linkextract.cpp

diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp

new file mode 100644

index 0000000000000000000000000000000000000000..1677b67b55535b794979ced8a02cf6852d02da5a

--- /dev/null

+++ b/core/fpdftext/cpdf_linkextract.cpp

@@ -0,0 +1,173 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "core/fpdftext/include/cpdf_linkextract.h"

+#include <vector>

+#include "core/fpdftext/include/cpdf_textpage.h"

+#include "core/fxcrt/include/fx_ext.h"

+#include "core/fxcrt/include/fx_string.h"

+#include "core/fxcrt/include/fx_system.h"

+// Creates an extractor bound to |pTextPage|. Only the pointer is stored in
+// m_pTextPage; nothing is extracted until ExtractLinks() is called, and
+// ExtractLinks() dereferences the pointer without a null check.
+CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
+    : m_pTextPage(pTextPage) {}

+// The destructor has no explicit work to do; members clean up after
+// themselves.
+CPDF_LinkExtract::~CPDF_LinkExtract() = default;

+// Rebuilds m_LinkArray from the text of the associated page.
+//
+// Links left over from a previous call are always discarded first. If the
+// text page reports that it has not been parsed, or its text is empty, the
+// function returns early and the link list stays empty. Otherwise the page
+// text is cached in m_strPageText and ParseLink() does the actual scan.
+void CPDF_LinkExtract::ExtractLinks() {
+  m_LinkArray.clear();
+  if (!m_pTextPage->IsParsed())
+    return;
+
+  // Start 0 with count -1; presumably "all text on the page" - confirm
+  // against CPDF_TextPage::GetPageText().
+  m_strPageText = m_pTextPage->GetPageText(0, -1);
+  if (m_strPageText.IsEmpty())
+    return;
+
+  ParseLink();
+}

+// Splits the page's characters into candidate tokens and records every token
+// that looks like a web or mail link.
+//
+// A token ends at a generated character, at a space (U+0020), or at the last
+// character of the page. Tokens longer than five characters have any trailing
+// ')', ',', '>' and '.' stripped; if more than five characters remain, the
+// token is offered to CheckWebLink() and then CheckMailLink(). The first one
+// that accepts it yields an entry {start, count, url} in m_LinkArray.
+void CPDF_LinkExtract::ParseLink() {
+  int start = 0;
+  int pos = 0;
+  int TotalChar = m_pTextPage->CountChars();
+  while (pos < TotalChar) {
+    FPDF_CHAR_INFO pageChar;
+    m_pTextPage->GetCharInfo(pos, &pageChar);
+    if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED ||
+        pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) {
+      int nCount = pos - start;
+      // The final character has no separator after it, so it belongs to the
+      // token itself.
+      if (pos == TotalChar - 1)
+        nCount++;
+
+      CFX_WideString strBeCheck = m_pTextPage->GetPageText(start, nCount);
+      if (strBeCheck.GetLength() > 5) {
+        // Drop trailing punctuation that commonly follows a link in running
+        // text, keeping nCount in step with the shortened string.
+        while (strBeCheck.GetLength() > 0) {
+          FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);
+          if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') {
+            strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);
+            nCount--;
+          } else {
+            break;
+          }
+        }
+        // NOTE(review): both checkers may trim |strBeCheck| further, but
+        // |start| and |nCount| are stored as computed above.
+        if (nCount > 5 &&
+            (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) {
+          m_LinkArray.push_back({start, nCount, strBeCheck});
+        }
+      }
+      // The next token begins right after the separator.
+      start = ++pos;
+    } else {
+      pos++;
+    }
+  }
+}

+// Looks for a web address inside |strBeCheck|, ignoring case.
+//
+// The patterns are tried in this order: "http://www.", "http://",
+// "https://www.", "https://" and finally a bare "www.". On the first hit,
+// |strBeCheck| is rewritten in place so that it starts at the match (the
+// original letter case is kept), and true is returned. A bare "www." match
+// additionally gets "http://" prepended. Returns false and leaves
+// |strBeCheck| unchanged when no pattern is found.
+bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
+  // Search a lower-cased copy so the match is case-insensitive while the
+  // caller's string keeps its case.
+  CFX_WideString str = strBeCheck;
+  str.MakeLower();
+
+  int pos = str.Find(L"http://www.");
+  if (pos != -1) {
+    strBeCheck = strBeCheck.Right(str.GetLength() - pos);
+    return true;
+  }
+
+  pos = str.Find(L"http://");
+  if (pos != -1) {
+    strBeCheck = strBeCheck.Right(str.GetLength() - pos);
+    return true;
+  }
+
+  pos = str.Find(L"https://www.");
+  if (pos != -1) {
+    strBeCheck = strBeCheck.Right(str.GetLength() - pos);
+    return true;
+  }
+
+  pos = str.Find(L"https://");
+  if (pos != -1) {
+    strBeCheck = strBeCheck.Right(str.GetLength() - pos);
+    return true;
+  }
+
+  pos = str.Find(L"www.");
+  if (pos != -1) {
+    strBeCheck = strBeCheck.Right(str.GetLength() - pos);
+    // No scheme was present, so supply one.
+    strBeCheck = L"http://" + strBeCheck;
+    return true;
+  }
+
+  return false;
+}

+// Decides whether |str| contains an e-mail address and, if so, rewrites |str|
+// in place to "mailto:<address>".
+//
+// The local part (before '@') may contain alphanumerics, '_', '-' and single
+// interior '.' characters; anything before the first offending character is
+// cut off. The domain part (after '@') may contain alphanumerics, '-' and
+// single '.' separators, must contain at least one '.', and is cut at the
+// first offending character when enough of a valid domain precedes it.
+//
+// Returns true when an address was recognised. Returns false otherwise; note
+// that |str| may already have been trimmed by the time false is returned.
+bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
+  int aPos = str.Find(L'@');
+  // Invalid when no '@', or when '@' is the very first character.
+  if (aPos < 1)
+    return false;
+
+  // Check the local part, scanning backwards from the character before '@'.
+  int pPos = aPos;  // Used to track the position of '@' or '.'.
+  for (int i = aPos - 1; i >= 0; i--) {
+    FX_WCHAR ch = str.GetAt(i);
+    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
+      continue;
+
+    if (ch != L'.' || i == pPos - 1 || i == 0) {
+      if (i == aPos - 1) {
+        // There is '.' or invalid char before '@'.
+        return false;
+      }
+      // End extracting for other invalid chars, '.' at the beginning, or
+      // consecutive '.'. When the offending char sits directly before an
+      // already accepted '.', that '.' is dropped as well.
+      int removed_len = i == pPos - 1 ? i + 2 : i + 1;
+      str = str.Right(str.GetLength() - removed_len);
+      break;
+    }
+    // Found a valid '.'.
+    pPos = i;
+  }
+
+  // Check the domain name part. The string may have been shortened above, so
+  // locate '@' again.
+  aPos = str.Find(L'@');
+  if (aPos < 1)
+    return false;
+
+  str.TrimRight(L'.');
+  // At least one '.' in domain name, but not at the beginning.
+  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
+  // Check whether we should remove this check.
+  int ePos = str.Find(L'.', aPos + 1);
+  if (ePos == -1 || ePos == aPos + 1)
+    return false;
+
+  // Validate all other chars in domain name.
+  int nLen = str.GetLength();
+  pPos = 0;  // Used to track the position of '.'.
+  for (int i = aPos + 1; i < nLen; i++) {
+    FX_WCHAR wch = str.GetAt(i);
+    if (wch == L'-' || FXSYS_iswalnum(wch))
+      continue;
+
+    if (wch != L'.' || i == pPos + 1) {
+      // Domain name should end before invalid char. For consecutive '.', the
+      // preceding '.' is excluded too.
+      int host_end = i == pPos + 1 ? i - 2 : i - 1;
+      if (pPos > 0 && host_end - aPos >= 3) {
+        // Trim the ending invalid chars if there is at least one '.' and name.
+        str = str.Left(host_end + 1);
+        break;
+      }
+      return false;
+    }
+    pPos = i;
+  }
+
+  if (str.Find(L"mailto:") == -1)
+    str = L"mailto:" + str;
+  return true;
+}

+// Returns the URL recorded for the link at |index| in m_LinkArray, or an
+// empty string when |index| is out of range.
+CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
+  return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";
+}

+// Returns the rectangles that the text page reports for the character range
+// (m_Start, m_Count) of the link at |index|, or an empty vector when |index|
+// is out of range.
+std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
+  if (index >= m_LinkArray.size())
+    return std::vector<CFX_FloatRect>();
+
+  return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,
+                                   m_LinkArray[index].m_Count);
+}

« no previous file with comments | « BUILD.gn ('k') | core/fpdftext/cpdf_textpage.cpp » ('j') | no next file with comments »