OLD | NEW |
1 // Copyright 2014 PDFium Authors. All rights reserved. | 1 // Copyright 2014 PDFium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com | 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com |
6 | 6 |
7 #include <cctype> | 7 #include <cctype> |
8 #include <cwctype> | 8 #include <cwctype> |
9 #include <algorithm> | 9 #include <algorithm> |
10 | 10 |
11 #include "core/include/fpdfapi/fpdf_module.h" | 11 #include "core/include/fpdfapi/fpdf_module.h" |
12 #include "core/include/fpdfapi/fpdf_page.h" | 12 #include "core/include/fpdfapi/fpdf_page.h" |
13 #include "core/include/fpdfapi/fpdf_pageobj.h" | 13 #include "core/include/fpdfapi/fpdf_pageobj.h" |
14 #include "core/include/fpdfapi/fpdf_resource.h" | 14 #include "core/include/fpdfapi/fpdf_resource.h" |
15 #include "core/include/fpdftext/fpdf_text.h" | 15 #include "core/include/fpdftext/fpdf_text.h" |
16 #include "core/include/fxcrt/fx_bidi.h" | 16 #include "core/include/fxcrt/fx_bidi.h" |
| 17 #include "core/include/fxcrt/fx_ext.h" |
17 #include "core/include/fxcrt/fx_ucd.h" | 18 #include "core/include/fxcrt/fx_ucd.h" |
18 #include "text_int.h" | 19 #include "text_int.h" |
19 #include "third_party/base/nonstd_unique_ptr.h" | 20 #include "third_party/base/nonstd_unique_ptr.h" |
20 | 21 |
21 namespace { | 22 namespace { |
22 | 23 |
23 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { | 24 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { |
24 if (curChar < 255) { | 25 if (curChar < 255) { |
25 return FALSE; | 26 return FALSE; |
26 } | 27 } |
(...skipping 2573 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2600 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); | 2601 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); |
2601 return TRUE; | 2602 return TRUE; |
2602 } | 2603 } |
2603 if (str.Find(L"www.") != -1) { | 2604 if (str.Find(L"www.") != -1) { |
2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); | 2605 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); |
2605 strBeCheck = L"http://" + strBeCheck; | 2606 strBeCheck = L"http://" + strBeCheck; |
2606 return TRUE; | 2607 return TRUE; |
2607 } | 2608 } |
2608 return FALSE; | 2609 return FALSE; |
2609 } | 2610 } |
2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { | 2611 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { |
2611 str.MakeLower(); | |
2612 int aPos = str.Find(L'@'); | 2612 int aPos = str.Find(L'@'); |
| 2613 // Invalid when no '@'. |
2613 if (aPos < 1) { | 2614 if (aPos < 1) { |
2614 return FALSE; | 2615 return FALSE; |
2615 } | 2616 } |
2616 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') { | 2617 |
2617 return FALSE; | 2618 // Check the local part. |
2618 } | 2619 int pPos = aPos; // Used to track the position of '@' or '.'. |
2619 int i; | 2620 for (int i = aPos - 1; i >= 0; i--) { |
2620 for (i = aPos - 1; i >= 0; i--) { | |
2621 FX_WCHAR ch = str.GetAt(i); | 2621 FX_WCHAR ch = str.GetAt(i); |
2622 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || | 2622 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) { |
2623 (ch >= L'0' && ch <= L'9')) { | |
2624 continue; | 2623 continue; |
2625 } else { | 2624 } |
| 2625 if (ch != L'.' || i == pPos - 1 || i == 0) { |
2626 if (i == aPos - 1) { | 2626 if (i == aPos - 1) { |
| 2627 // There is '.' or invalid char before '@'. |
2627 return FALSE; | 2628 return FALSE; |
2628 } | 2629 } |
2629 str = str.Right(str.GetLength() - i - 1); | 2630 // End extracting for other invalid chars, '.' at the beginning, or |
| 2631 // consecutive '.'. |
| 2632 int removed_len = i == pPos - 1 ? i + 2 : i + 1; |
| 2633 str = str.Right(str.GetLength() - removed_len); |
2630 break; | 2634 break; |
2631 } | 2635 } |
| 2636 // Found a valid '.'. |
| 2637 pPos = i; |
2632 } | 2638 } |
2633 aPos = str.Find(L'@'); | 2639 |
2634 if (aPos < 1) { | 2640 // Check the domain name part. |
2635 return FALSE; | |
2636 } | |
2637 CFX_WideString strtemp = L""; | |
2638 for (i = 0; i < aPos; i++) { | |
2639 FX_WCHAR wch = str.GetAt(i); | |
2640 if (wch >= L'a' && wch <= L'z') { | |
2641 break; | |
2642 } else { | |
2643 strtemp = str.Right(str.GetLength() - i + 1); | |
2644 } | |
2645 } | |
2646 if (strtemp != L"") { | |
2647 str = strtemp; | |
2648 } | |
2649 aPos = str.Find(L'@'); | 2641 aPos = str.Find(L'@'); |
2650 if (aPos < 1) { | 2642 if (aPos < 1) { |
2651 return FALSE; | 2643 return FALSE; |
2652 } | 2644 } |
2653 str.TrimRight(L'.'); | 2645 str.TrimRight(L'.'); |
2654 strtemp = str; | 2646 // At least one '.' in domain name, but not at the beginning. |
2655 int ePos = str.Find(L'.'); | 2647 // TODO(weili): RFC5322 allows domain names to be a local name without '.'. |
2656 if (ePos == -1) { | 2648 // Check whether we should remove this check. |
| 2649 int ePos = str.Find(L'.', aPos + 1); |
| 2650 if (ePos == -1 || ePos == aPos + 1) { |
2657 return FALSE; | 2651 return FALSE; |
2658 } | 2652 } |
2659 while (ePos != -1) { | 2653 // Validate all other chars in domain name. |
2660 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1); | 2654 int nLen = str.GetLength(); |
2661 ePos = strtemp.Find('.'); | 2655 pPos = 0; // Used to track the position of '.'. |
2662 } | 2656 for (int i = aPos + 1; i < nLen; i++) { |
2663 ePos = strtemp.GetLength(); | |
2664 for (i = 0; i < ePos; i++) { | |
2665 FX_WCHAR wch = str.GetAt(i); | 2657 FX_WCHAR wch = str.GetAt(i); |
2666 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { | 2658 if (wch == L'-' || FXSYS_iswalnum(wch)) { |
2667 continue; | 2659 continue; |
2668 } else { | |
2669 str = str.Left(str.GetLength() - ePos + i + 1); | |
2670 ePos = ePos - i - 1; | |
2671 break; | |
2672 } | 2660 } |
2673 } | 2661 if (wch != L'.' || i == pPos + 1) { |
2674 int nLen = str.GetLength(); | 2662 // Domain name should end before invalid char. |
2675 for (i = aPos + 1; i < nLen - ePos; i++) { | 2663 int host_end = i == pPos + 1 ? i - 2 : i - 1; |
2676 FX_WCHAR wch = str.GetAt(i); | 2664 if (pPos > 0 && host_end - aPos >= 3) { |
2677 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') || | 2665 // Trim the ending invalid chars if there is at least one '.' and name. |
2678 (wch >= L'0' && wch <= L'9')) { | 2666 str = str.Left(host_end + 1); |
2679 continue; | 2667 break; |
2680 } else { | 2668 } |
2681 return FALSE; | 2669 return FALSE; |
2682 } | 2670 } |
| 2671 pPos = i; |
2683 } | 2672 } |
| 2673 |
2684 if (str.Find(L"mailto:") == -1) { | 2674 if (str.Find(L"mailto:") == -1) { |
2685 str = L"mailto:" + str; | 2675 str = L"mailto:" + str; |
2686 } | 2676 } |
2687 return TRUE; | 2677 return TRUE; |
2688 } | 2678 } |
2689 | 2679 |
2690 void CPDF_LinkExtract::AppendToLinkList(int start, | 2680 void CPDF_LinkExtract::AppendToLinkList(int start, |
2691 int count, | 2681 int count, |
2692 const CFX_WideString& strUrl) { | 2682 const CFX_WideString& strUrl) { |
2693 CPDF_LinkExt* linkInfo = new CPDF_LinkExt; | 2683 CPDF_LinkExt* linkInfo = new CPDF_LinkExt; |
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
2726 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { | 2716 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { |
2727 return; | 2717 return; |
2728 } | 2718 } |
2729 CPDF_LinkExt* link = NULL; | 2719 CPDF_LinkExt* link = NULL; |
2730 link = m_LinkList.GetAt(index); | 2720 link = m_LinkList.GetAt(index); |
2731 if (!link) { | 2721 if (!link) { |
2732 return; | 2722 return; |
2733 } | 2723 } |
2734 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); | 2724 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); |
2735 } | 2725 } |
OLD | NEW |