| Index: core/fpdftext/cpdf_linkextract.cpp
|
| diff --git a/core/fpdftext/cpdf_linkextract.cpp b/core/fpdftext/cpdf_linkextract.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..1677b67b55535b794979ced8a02cf6852d02da5a
|
| --- /dev/null
|
| +++ b/core/fpdftext/cpdf_linkextract.cpp
|
| @@ -0,0 +1,173 @@
|
| +// Copyright 2016 PDFium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +// Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
|
| +
|
| +#include "core/fpdftext/include/cpdf_linkextract.h"
|
| +
|
| +#include <vector>
|
| +
|
| +#include "core/fpdftext/include/cpdf_textpage.h"
|
| +#include "core/fxcrt/include/fx_ext.h"
|
| +#include "core/fxcrt/include/fx_string.h"
|
| +#include "core/fxcrt/include/fx_system.h"
|
| +
|
| +CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage)
|
| + : m_pTextPage(pTextPage) {}
|
| +
|
| +// The destructor has no work of its own to do.
|
| +CPDF_LinkExtract::~CPDF_LinkExtract() = default;
|
| +
|
| +// Rebuilds the list of links for the text page. Previously extracted links
|
| +// are always discarded first. Nothing further happens when the text page has
|
| +// not been parsed or when it yields no text.
|
| +void CPDF_LinkExtract::ExtractLinks() {
|
| +  m_LinkArray.clear();
|
| +  if (m_pTextPage->IsParsed()) {
|
| +    // Cache the whole page text (start 0, count -1) before scanning it.
|
| +    m_strPageText = m_pTextPage->GetPageText(0, -1);
|
| +    if (!m_strPageText.IsEmpty())
|
| +      ParseLink();
|
| +  }
|
| +}
|
| +
|
| +// Walks every character of the text page, cutting it into segments at spaces
|
| +// (0x20), generated characters, and the final character. Each segment longer
|
| +// than five characters has trailing ')', ',', '>' and '.' stripped and is then
|
| +// appended to m_LinkArray if CheckWebLink() or CheckMailLink() accepts it.
|
| +void CPDF_LinkExtract::ParseLink() {
|
| +  const int char_count = m_pTextPage->CountChars();
|
| +  int segment_start = 0;
|
| +  for (int index = 0; index < char_count; ++index) {
|
| +    FPDF_CHAR_INFO char_info;
|
| +    m_pTextPage->GetCharInfo(index, &char_info);
|
| +
|
| +    const bool is_last_char = index == char_count - 1;
|
| +    const bool ends_segment = char_info.m_Flag == FPDFTEXT_CHAR_GENERATED ||
|
| +                              char_info.m_Unicode == 0x20 || is_last_char;
|
| +    if (!ends_segment)
|
| +      continue;
|
| +
|
| +    // The delimiter itself is excluded, except for the final character of the
|
| +    // page, which is always counted as part of the last segment.
|
| +    int segment_len = index - segment_start;
|
| +    if (is_last_char)
|
| +      ++segment_len;
|
| +
|
| +    CFX_WideString candidate =
|
| +        m_pTextPage->GetPageText(segment_start, segment_len);
|
| +    if (candidate.GetLength() > 5) {
|
| +      // Drop trailing punctuation, shrinking the recorded length to match.
|
| +      while (candidate.GetLength() > 0) {
|
| +        const FX_WCHAR tail = candidate.GetAt(candidate.GetLength() - 1);
|
| +        if (tail != L')' && tail != L',' && tail != L'>' && tail != L'.')
|
| +          break;
|
| +        candidate = candidate.Mid(0, candidate.GetLength() - 1);
|
| +        --segment_len;
|
| +      }
|
| +      const bool is_link = segment_len > 5 && (CheckWebLink(candidate) ||
|
| +                                               CheckMailLink(candidate));
|
| +      if (is_link)
|
| +        m_LinkArray.push_back({segment_start, segment_len, candidate});
|
| +    }
|
| +    // The next segment begins right after the delimiter.
|
| +    segment_start = index + 1;
|
| +  }
|
| +}
|
| +
|
| +// Looks for a web address inside |strBeCheck|, ignoring letter case. The
|
| +// patterns are tried in this order: "http://www.", "http://", "https://www.",
|
| +// "https://" and finally "www.".
|
| +//
|
| +// On a match, |strBeCheck| is reduced to the text from the match onward, a
|
| +// bare "www." match additionally gets "http://" prepended, and true is
|
| +// returned. Without a match, |strBeCheck| is left untouched and false is
|
| +// returned.
|
| +bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
|
| +  // Search a lowered copy so the caller's spelling is preserved in the result.
|
| +  CFX_WideString lowered = strBeCheck;
|
| +  lowered.MakeLower();
|
| +
|
| +  int match_pos = lowered.Find(L"http://www.");
|
| +  if (match_pos == -1)
|
| +    match_pos = lowered.Find(L"http://");
|
| +  if (match_pos == -1)
|
| +    match_pos = lowered.Find(L"https://www.");
|
| +  if (match_pos == -1)
|
| +    match_pos = lowered.Find(L"https://");
|
| +  if (match_pos != -1) {
|
| +    strBeCheck = strBeCheck.Right(lowered.GetLength() - match_pos);
|
| +    return true;
|
| +  }
|
| +
|
| +  // No explicit scheme: accept a bare "www." host and supply the scheme.
|
| +  match_pos = lowered.Find(L"www.");
|
| +  if (match_pos == -1)
|
| +    return false;
|
| +
|
| +  strBeCheck = strBeCheck.Right(lowered.GetLength() - match_pos);
|
| +  strBeCheck = L"http://" + strBeCheck;
|
| +  return true;
|
| +}
|
| +
|
| +// Checks whether |str| holds an e-mail address around its first '@'.
|
| +//
|
| +// The local part (before '@') may contain alphanumerics, '_', '-' and single,
|
| +// non-leading '.'; text up to and including the first offending character is
|
| +// cut off. The domain part (after '@') may contain alphanumerics, '-' and
|
| +// single '.', must contain at least one '.' that does not directly follow
|
| +// '@', and is cut off before the first offending character when enough of it
|
| +// remains.
|
| +//
|
| +// Returns true on success, with |str| reduced to the address and prefixed
|
| +// with "mailto:" unless it already contains that text. Returns false
|
| +// otherwise; |str| may already have been trimmed in that case.
|
| +bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
|
| +  int aPos = str.Find(L'@');
|
| +  // Invalid when no '@', or when '@' is the very first character.
|
| +  if (aPos < 1)
|
| +    return false;
|
| +
|
| +  // Check the local part, scanning backwards from the char before '@'.
|
| +  int pPos = aPos;  // Used to track the position of '@' or '.'.
|
| +  for (int i = aPos - 1; i >= 0; i--) {
|
| +    FX_WCHAR ch = str.GetAt(i);
|
| +    if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch))
|
| +      continue;
|
| +
|
| +    if (ch != L'.' || i == pPos - 1 || i == 0) {
|
| +      if (i == aPos - 1) {
|
| +        // There is '.' or invalid char before '@'.
|
| +        return false;
|
| +      }
|
| +      // End extracting for other invalid chars, '.' at the beginning, or
|
| +      // consecutive '.'. For consecutive '.', both dots are removed.
|
| +      int removed_len = i == pPos - 1 ? i + 2 : i + 1;
|
| +      str = str.Right(str.GetLength() - removed_len);
|
| +      break;
|
| +    }
|
| +    // Found a valid '.'.
|
| +    pPos = i;
|
| +  }
|
| +
|
| +  // Check the domain name part. The '@' position is looked up again because
|
| +  // |str| may have been shortened above.
|
| +  aPos = str.Find(L'@');
|
| +  if (aPos < 1)
|
| +    return false;
|
| +
|
| +  str.TrimRight(L'.');
|
| +  // At least one '.' in domain name, but not at the beginning.
|
| +  // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
|
| +  // Check whether we should remove this check.
|
| +  int ePos = str.Find(L'.', aPos + 1);
|
| +  if (ePos == -1 || ePos == aPos + 1)
|
| +    return false;
|
| +
|
| +  // Validate all other chars in domain name.
|
| +  int nLen = str.GetLength();
|
| +  pPos = 0;  // Used to track the position of '.'.
|
| +  for (int i = aPos + 1; i < nLen; i++) {
|
| +    FX_WCHAR wch = str.GetAt(i);
|
| +    if (wch == L'-' || FXSYS_iswalnum(wch))
|
| +      continue;
|
| +
|
| +    if (wch != L'.' || i == pPos + 1) {
|
| +      // Domain name should end before invalid char. For consecutive '.', the
|
| +      // preceding '.' is dropped as well.
|
| +      int host_end = i == pPos + 1 ? i - 2 : i - 1;
|
| +      if (pPos > 0 && host_end - aPos >= 3) {
|
| +        // Trim the ending invalid chars if there is at least one '.' and name.
|
| +        str = str.Left(host_end + 1);
|
| +        break;
|
| +      }
|
| +      return false;
|
| +    }
|
| +    pPos = i;
|
| +  }
|
| +
|
| +  if (str.Find(L"mailto:") == -1)
|
| +    str = L"mailto:" + str;
|
| +
|
| +  return true;
|
| +}
|
| +
|
| +// Returns the URL of the link at |index|, or an empty string when |index| is
|
| +// not a valid position in the extracted link list.
|
| +CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const {
|
| +  if (index >= m_LinkArray.size())
|
| +    return L"";
|
| +
|
| +  return m_LinkArray[index].m_strUrl;
|
| +}
|
| +
|
| +// Returns the rectangles the text page reports for the character range of
|
| +// the link at |index|, or an empty vector when |index| is out of range.
|
| +std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const {
|
| +  if (index < m_LinkArray.size()) {
|
| +    const auto& link = m_LinkArray[index];
|
| +    return m_pTextPage->GetRectArray(link.m_Start, link.m_Count);
|
| +  }
|
| +
|
| +  return {};
|
| +}
|
|
|