OLD | NEW |
(Empty) | |
| 1 // Copyright 2016 PDFium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
| 6 |
| 7 #include "core/fpdftext/include/cpdf_linkextract.h" |
| 8 |
| 9 #include <vector> |
| 10 |
| 11 #include "core/fpdftext/include/cpdf_textpage.h" |
| 12 #include "core/fxcrt/include/fx_ext.h" |
| 13 #include "core/fxcrt/include/fx_string.h" |
| 14 #include "core/fxcrt/include/fx_system.h" |
| 15 |
| 16 CPDF_LinkExtract::CPDF_LinkExtract(const CPDF_TextPage* pTextPage) |
| 17 : m_pTextPage(pTextPage) {} |
| 18 |
| 19 CPDF_LinkExtract::~CPDF_LinkExtract() {} |
| 20 |
| 21 void CPDF_LinkExtract::ExtractLinks() { |
| 22 m_LinkArray.clear(); |
| 23 if (!m_pTextPage->IsParsed()) |
| 24 return; |
| 25 |
| 26 m_strPageText = m_pTextPage->GetPageText(0, -1); |
| 27 if (m_strPageText.IsEmpty()) |
| 28 return; |
| 29 |
| 30 ParseLink(); |
| 31 } |
| 32 |
| 33 void CPDF_LinkExtract::ParseLink() { |
| 34 int start = 0, pos = 0; |
| 35 int TotalChar = m_pTextPage->CountChars(); |
| 36 while (pos < TotalChar) { |
| 37 FPDF_CHAR_INFO pageChar; |
| 38 m_pTextPage->GetCharInfo(pos, &pageChar); |
| 39 if (pageChar.m_Flag == FPDFTEXT_CHAR_GENERATED || |
| 40 pageChar.m_Unicode == 0x20 || pos == TotalChar - 1) { |
| 41 int nCount = pos - start; |
| 42 if (pos == TotalChar - 1) |
| 43 nCount++; |
| 44 CFX_WideString strBeCheck; |
| 45 strBeCheck = m_pTextPage->GetPageText(start, nCount); |
| 46 if (strBeCheck.GetLength() > 5) { |
| 47 while (strBeCheck.GetLength() > 0) { |
| 48 FX_WCHAR ch = strBeCheck.GetAt(strBeCheck.GetLength() - 1); |
| 49 if (ch == L')' || ch == L',' || ch == L'>' || ch == L'.') { |
| 50 strBeCheck = strBeCheck.Mid(0, strBeCheck.GetLength() - 1); |
| 51 nCount--; |
| 52 } else { |
| 53 break; |
| 54 } |
| 55 } |
| 56 if (nCount > 5 && |
| 57 (CheckWebLink(strBeCheck) || CheckMailLink(strBeCheck))) { |
| 58 m_LinkArray.push_back({start, nCount, strBeCheck}); |
| 59 } |
| 60 } |
| 61 start = ++pos; |
| 62 } else { |
| 63 pos++; |
| 64 } |
| 65 } |
| 66 } |
| 67 |
| 68 bool CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) { |
| 69 CFX_WideString str = strBeCheck; |
| 70 str.MakeLower(); |
| 71 if (str.Find(L"http://www.") != -1) { |
| 72 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://www.")); |
| 73 return true; |
| 74 } |
| 75 if (str.Find(L"http://") != -1) { |
| 76 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"http://")); |
| 77 return true; |
| 78 } |
| 79 if (str.Find(L"https://www.") != -1) { |
| 80 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://www.")); |
| 81 return true; |
| 82 } |
| 83 if (str.Find(L"https://") != -1) { |
| 84 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); |
| 85 return true; |
| 86 } |
| 87 if (str.Find(L"www.") != -1) { |
| 88 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); |
| 89 strBeCheck = L"http://" + strBeCheck; |
| 90 return true; |
| 91 } |
| 92 return false; |
| 93 } |
| 94 |
| 95 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { |
| 96 int aPos = str.Find(L'@'); |
| 97 // Invalid when no '@'. |
| 98 if (aPos < 1) |
| 99 return false; |
| 100 |
| 101 // Check the local part. |
| 102 int pPos = aPos; // Used to track the position of '@' or '.'. |
| 103 for (int i = aPos - 1; i >= 0; i--) { |
| 104 FX_WCHAR ch = str.GetAt(i); |
| 105 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) |
| 106 continue; |
| 107 |
| 108 if (ch != L'.' || i == pPos - 1 || i == 0) { |
| 109 if (i == aPos - 1) { |
| 110 // There is '.' or invalid char before '@'. |
| 111 return FALSE; |
| 112 } |
| 113 // End extracting for other invalid chars, '.' at the beginning, or |
| 114 // consecutive '.'. |
| 115 int removed_len = i == pPos - 1 ? i + 2 : i + 1; |
| 116 str = str.Right(str.GetLength() - removed_len); |
| 117 break; |
| 118 } |
| 119 // Found a valid '.'. |
| 120 pPos = i; |
| 121 } |
| 122 |
| 123 // Check the domain name part. |
| 124 aPos = str.Find(L'@'); |
| 125 if (aPos < 1) |
| 126 return false; |
| 127 |
| 128 str.TrimRight(L'.'); |
| 129 // At least one '.' in domain name, but not at the beginning. |
| 130 // TODO(weili): RFC5322 allows domain names to be a local name without '.'. |
| 131 // Check whether we should remove this check. |
| 132 int ePos = str.Find(L'.', aPos + 1); |
| 133 if (ePos == -1 || ePos == aPos + 1) |
| 134 return false; |
| 135 |
| 136 // Validate all other chars in domain name. |
| 137 int nLen = str.GetLength(); |
| 138 pPos = 0; // Used to track the position of '.'. |
| 139 for (int i = aPos + 1; i < nLen; i++) { |
| 140 FX_WCHAR wch = str.GetAt(i); |
| 141 if (wch == L'-' || FXSYS_iswalnum(wch)) |
| 142 continue; |
| 143 |
| 144 if (wch != L'.' || i == pPos + 1) { |
| 145 // Domain name should end before invalid char. |
| 146 int host_end = i == pPos + 1 ? i - 2 : i - 1; |
| 147 if (pPos > 0 && host_end - aPos >= 3) { |
| 148 // Trim the ending invalid chars if there is at least one '.' and name. |
| 149 str = str.Left(host_end + 1); |
| 150 break; |
| 151 } |
| 152 return false; |
| 153 } |
| 154 pPos = i; |
| 155 } |
| 156 |
| 157 if (str.Find(L"mailto:") == -1) |
| 158 str = L"mailto:" + str; |
| 159 |
| 160 return true; |
| 161 } |
| 162 |
| 163 CFX_WideString CPDF_LinkExtract::GetURL(size_t index) const { |
| 164 return index < m_LinkArray.size() ? m_LinkArray[index].m_strUrl : L""; |
| 165 } |
| 166 |
| 167 std::vector<CFX_FloatRect> CPDF_LinkExtract::GetRects(size_t index) const { |
| 168 if (index >= m_LinkArray.size()) |
| 169 return std::vector<CFX_FloatRect>(); |
| 170 |
| 171 return m_pTextPage->GetRectArray(m_LinkArray[index].m_Start, |
| 172 m_LinkArray[index].m_Count); |
| 173 } |
OLD | NEW |