Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(205)

Side by Side Diff: core/src/fpdftext/fpdf_text_int.cpp

Issue 1530763005: Correctly extracting email addresses (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master
Patch Set: more comments Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « core/include/fxcrt/fx_ext.h ('k') | core/src/fpdftext/fpdf_text_int_unittest.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 PDFium Authors. All rights reserved. 1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 6
7 #include <cctype> 7 #include <cctype>
8 #include <cwctype> 8 #include <cwctype>
9 #include <algorithm> 9 #include <algorithm>
10 10
11 #include "core/include/fpdfapi/fpdf_module.h" 11 #include "core/include/fpdfapi/fpdf_module.h"
12 #include "core/include/fpdfapi/fpdf_page.h" 12 #include "core/include/fpdfapi/fpdf_page.h"
13 #include "core/include/fpdfapi/fpdf_pageobj.h" 13 #include "core/include/fpdfapi/fpdf_pageobj.h"
14 #include "core/include/fpdfapi/fpdf_resource.h" 14 #include "core/include/fpdfapi/fpdf_resource.h"
15 #include "core/include/fpdftext/fpdf_text.h" 15 #include "core/include/fpdftext/fpdf_text.h"
16 #include "core/include/fxcrt/fx_bidi.h" 16 #include "core/include/fxcrt/fx_bidi.h"
17 #include "core/include/fxcrt/fx_ext.h"
17 #include "core/include/fxcrt/fx_ucd.h" 18 #include "core/include/fxcrt/fx_ucd.h"
18 #include "text_int.h" 19 #include "text_int.h"
19 #include "third_party/base/nonstd_unique_ptr.h" 20 #include "third_party/base/nonstd_unique_ptr.h"
20 21
21 namespace { 22 namespace {
22 23
23 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) { 24 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {
24 if (curChar < 255) { 25 if (curChar < 255) {
25 return FALSE; 26 return FALSE;
26 } 27 }
(...skipping 2573 matching lines...) Expand 10 before | Expand all | Expand 10 after
2600 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://")); 2601 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));
2601 return TRUE; 2602 return TRUE;
2602 } 2603 }
2603 if (str.Find(L"www.") != -1) { 2604 if (str.Find(L"www.") != -1) {
2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 2605 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2605 strBeCheck = L"http://" + strBeCheck; 2606 strBeCheck = L"http://" + strBeCheck;
2606 return TRUE; 2607 return TRUE;
2607 } 2608 }
2608 return FALSE; 2609 return FALSE;
2609 } 2610 }
2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { 2611 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2611 str.MakeLower();
2612 int aPos = str.Find(L'@'); 2612 int aPos = str.Find(L'@');
2613 // Invalid when no '@'.
2613 if (aPos < 1) { 2614 if (aPos < 1) {
2614 return FALSE; 2615 return FALSE;
2615 } 2616 }
2616 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') { 2617
2617 return FALSE; 2618 // Check the local part.
2618 } 2619 int pPos = aPos; // Used to track the position of '@' or '.'.
2619 int i; 2620 for (int i = aPos - 1; i >= 0; i--) {
2620 for (i = aPos - 1; i >= 0; i--) {
2621 FX_WCHAR ch = str.GetAt(i); 2621 FX_WCHAR ch = str.GetAt(i);
2622 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || 2622 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {
2623 (ch >= L'0' && ch <= L'9')) {
2624 continue; 2623 continue;
2625 } else { 2624 }
2625 if (ch != L'.' || i == pPos - 1 || i == 0) {
2626 if (i == aPos - 1) { 2626 if (i == aPos - 1) {
2627 // There is '.' or invalid char before '@'.
2627 return FALSE; 2628 return FALSE;
2628 } 2629 }
2629 str = str.Right(str.GetLength() - i - 1); 2630 // End extracting for other invalid chars, '.' at the beginning, or
2631 // consecutive '.'.
2632 int removed_len = i == pPos - 1 ? i + 2 : i + 1;
2633 str = str.Right(str.GetLength() - removed_len);
2630 break; 2634 break;
2631 } 2635 }
2636 // Found a valid '.'.
2637 pPos = i;
2632 } 2638 }
2633 aPos = str.Find(L'@'); 2639
2634 if (aPos < 1) { 2640 // Check the domain name part.
2635 return FALSE;
2636 }
2637 CFX_WideString strtemp = L"";
2638 for (i = 0; i < aPos; i++) {
2639 FX_WCHAR wch = str.GetAt(i);
2640 if (wch >= L'a' && wch <= L'z') {
2641 break;
2642 } else {
2643 strtemp = str.Right(str.GetLength() - i + 1);
2644 }
2645 }
2646 if (strtemp != L"") {
2647 str = strtemp;
2648 }
2649 aPos = str.Find(L'@'); 2641 aPos = str.Find(L'@');
2650 if (aPos < 1) { 2642 if (aPos < 1) {
2651 return FALSE; 2643 return FALSE;
2652 } 2644 }
2653 str.TrimRight(L'.'); 2645 str.TrimRight(L'.');
2654 strtemp = str; 2646 // At least one '.' in domain name, but not at the beginning.
2655 int ePos = str.Find(L'.'); 2647 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
2656 if (ePos == -1) { 2648 // Check whether we should remove this check.
2649 int ePos = str.Find(L'.', aPos + 1);
2650 if (ePos == -1 || ePos == aPos + 1) {
2657 return FALSE; 2651 return FALSE;
2658 } 2652 }
2659 while (ePos != -1) { 2653 // Validate all other chars in domain name.
2660 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1); 2654 int nLen = str.GetLength();
2661 ePos = strtemp.Find('.'); 2655 pPos = 0; // Used to track the position of '.'.
2662 } 2656 for (int i = aPos + 1; i < nLen; i++) {
2663 ePos = strtemp.GetLength();
2664 for (i = 0; i < ePos; i++) {
2665 FX_WCHAR wch = str.GetAt(i); 2657 FX_WCHAR wch = str.GetAt(i);
2666 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { 2658 if (wch == L'-' || FXSYS_iswalnum(wch)) {
2667 continue; 2659 continue;
2668 } else {
2669 str = str.Left(str.GetLength() - ePos + i + 1);
2670 ePos = ePos - i - 1;
2671 break;
2672 } 2660 }
2673 } 2661 if (wch != L'.' || i == pPos + 1) {
2674 int nLen = str.GetLength(); 2662 // Domain name should end before invalid char.
2675 for (i = aPos + 1; i < nLen - ePos; i++) { 2663 int host_end = i == pPos + 1 ? i - 2 : i - 1;
2676 FX_WCHAR wch = str.GetAt(i); 2664 if (pPos > 0 && host_end - aPos >= 3) {
2677 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') || 2665 // Trim the ending invalid chars if there is at least one '.' and name.
2678 (wch >= L'0' && wch <= L'9')) { 2666 str = str.Left(host_end + 1);
2679 continue; 2667 break;
2680 } else { 2668 }
2681 return FALSE; 2669 return FALSE;
2682 } 2670 }
2671 pPos = i;
2683 } 2672 }
2673
2684 if (str.Find(L"mailto:") == -1) { 2674 if (str.Find(L"mailto:") == -1) {
2685 str = L"mailto:" + str; 2675 str = L"mailto:" + str;
2686 } 2676 }
2687 return TRUE; 2677 return TRUE;
2688 } 2678 }
2689 2679
2690 void CPDF_LinkExtract::AppendToLinkList(int start, 2680 void CPDF_LinkExtract::AppendToLinkList(int start,
2691 int count, 2681 int count,
2692 const CFX_WideString& strUrl) { 2682 const CFX_WideString& strUrl) {
2693 CPDF_LinkExt* linkInfo = new CPDF_LinkExt; 2683 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
2726 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { 2716 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2727 return; 2717 return;
2728 } 2718 }
2729 CPDF_LinkExt* link = NULL; 2719 CPDF_LinkExt* link = NULL;
2730 link = m_LinkList.GetAt(index); 2720 link = m_LinkList.GetAt(index);
2731 if (!link) { 2721 if (!link) {
2732 return; 2722 return;
2733 } 2723 }
2734 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); 2724 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2735 } 2725 }
OLDNEW
« no previous file with comments | « core/include/fxcrt/fx_ext.h ('k') | core/src/fpdftext/fpdf_text_int_unittest.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698