core/fpdftext/cpdf_linkextract.cpp - Issue 2286723003: Split fpdf_text_int into classes

Side by Side Diff: core/fpdftext/cpdf_linkextract.cpp

Issue 2286723003: Split fpdf_text_int into classes (Closed)

Patch Set: Fix bots Created 4 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 // Copyright 2016 PDFium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

	6

	7 #include "core/fpdftext/include/cpdf_linkextract.h"

	8

	9 #include <vector>

	10

	11 #include "core/fpdftext/include/cpdf_textpage.h"

	12 #include "core/fxcrt/include/fx_ext.h"

	13 #include "core/fxcrt/include/fx_string.h"

	14 #include "core/fxcrt/include/fx_system.h"

	15

	16 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)

	17 : m_pTextPage(pTextPage) {}

	18

	19 CPDF_LinkExtract::~CPDF_LinkExtract() {}

	20

	21 void CPDF_LinkExtract::ExtractLinks() {

	22 m_LinkArray.clear();

	23 if (!m_pTextPage->IsParsed())

	24 return;

	25

	26 m_strPageText = m_pTextPage->GetPageText(0, -1);

	27 if (m_strPageText.IsEmpty())

	28 return;

	29

	30 ParseLink();

	31 }

	32

	33 void CPDF_LinkExtract::ParseLink() {

	34 int start = 0, pos = 0;

	35 int TotalChar = m_pTextPage->CountChars();

	36 while (pos < TotalChar) {

	37 FPDF_CHAR_INFO pageChar;

	38 m_pTextPage->GetCharInfo(pos, &pageChar);

	39 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED \|\|

	40 pageChar.m_Unicode == 0x20 \|\| pos == TotalChar - 1) {

	41 int nCount = pos - start;

	42 if (pos == TotalChar - 1)

	43 nCount++;

	44 CFX_WideString strBeCheck;

	45 strBeCheck = m_pTextPage->GetPageText(start, nCount);

	46 if (strBeCheck.GetLength() > 5) {

	47 while (strBeCheck.GetLength() > 0) {

	48 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1);

	49 if (ch == L')' \|\| ch == L',' \|\| ch == L'>' \|\| ch == L'.') {

	50 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1);

	51 nCount--;

	52 } else {

	53 break;

	54 }

	55 }

	56 if (nCount > 5 &&

	57 (CheckWebLink(strBeCheck) \|\| CheckMailLink(strBeCheck))) {

	58 m_LinkArray.push_back({start, nCount, strBeCheck});

	59 }

	60 }

	61 start = ++pos;

	62 } else {

	63 pos++;

	64 }

	65 }

	66 }

	67

	68 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {

	69 CFX_WideString str = strBeCheck;

	70 str.MakeLower();

	71 if (str.Find(L"http://www.") != -1) {

	72 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www."));

	73 return true;

	74 }

	75 if (str.Find(L"http://") != -1) {

	76 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://"));

	77 return true;

	78 }

	79 if (str.Find(L"https://www.") != -1) {

	80 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www."));

	81 return true;

	82 }

	83 if (str.Find(L"https://") != -1) {

	84 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));

	85 return true;

	86 }

	87 if (str.Find(L"www.") != -1) {

	88 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));

	89 strBeCheck = L"http://" + strBeCheck;

	90 return true;

	91 }

	92 return false;

	93 }

	94

	95 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {

	96 int aPos = str.Find(L'@');

	97 // Invalid when no '@'.

	98 if (aPos < 1)

	99 return false;

	100

	101 // Check the local part.

	102 int pPos = aPos; // Used to track the position of '@' or '.'.

	103 for (int i = aPos - 1; i >= 0; i--) {

	104 FX_WCHAR ch = str.GetAt(i);

	105 if (ch == L'_' \|\| ch == L'-' \|\| FXSYS_iswalnum(ch))

	106 continue;

	107

	108 if (ch != L'.' \|\| i == pPos - 1 \|\| i == 0) {

	109 if (i == aPos - 1) {

	110 // There is '.' or invalid char before '@'.

	111 return FALSE;

	112 }

	113 // End extracting for other invalid chars, '.' at the beginning, or

	114 // consecutive '.'.

	115 int removed_len = i == pPos - 1 ? i + 2 : i + 1;

	116 str = str.Right(str.GetLength() - removed_len);

	117 break;

	118 }

	119 // Found a valid '.'.

	120 pPos = i;

	121 }

	122

	123 // Check the domain name part.

	124 aPos = str.Find(L'@');

	125 if (aPos < 1)

	126 return false;

	127

	128 str.TrimRight(L'.');

	129 // At least one '.' in domain name, but not at the beginning.

	130 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.

	131 // Check whether we should remove this check.

	132 int ePos = str.Find(L'.', aPos + 1);

	133 if (ePos == -1 \|\| ePos == aPos + 1)

	134 return false;

	135

	136 // Validate all other chars in domain name.

	137 int nLen = str.GetLength();

	138 pPos = 0; // Used to track the position of '.'.

	139 for (int i = aPos + 1; i < nLen; i++) {

	140 FX_WCHAR wch = str.GetAt(i);

	141 if (wch == L'-' \|\| FXSYS_iswalnum(wch))

	142 continue;

	143

	144 if (wch != L'.' \|\| i == pPos + 1) {

	145 // Domain name should end before invalid char.

	146 int host_end = i == pPos + 1 ? i - 2 : i - 1;

	147 if (pPos > 0 && host_end - aPos >= 3) {

	148 // Trim the ending invalid chars if there is at least one '.' and name.

	149 str = str.Left(host_end + 1);

	150 break;

	151 }

	152 return false;

	153 }

	154 pPos = i;

	155 }

	156

	157 if (str.Find(L"mailto:") == -1)

	158 str = L"mailto:" + str;

	159

	160 return true;

	161 }

	162

	163 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {

	164 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L"";

	165 }

	166

	167 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {

	168 if (index >= m_LinkArray.size())

	169 return std::vector<CFX_FloatRect>();

	170

	171 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start,

	172 m_LinkArray[index].m_Count);

	173 }

OLD	NEW

« no previous file with comments | « BUILD.gn ('k') | core/fpdftext/cpdf_textpage.cpp » ('j') | no next file with comments »