Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname in Settings | Help | Chromium Project | Gerrit Changes | Sign out
(218)

Side by Side Diff: core/src/fpdftext/fpdf_text_int.cpp

Issue 1530763005: Correctly extracting email addresses (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master
Patch Set: Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 PDFium Authors. All rights reserved. 1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 6
7 #include <cctype> 7 #include <cctype>
8 #include <cwctype> 8 #include <cwctype>
9 #include <algorithm> 9 #include <algorithm>
10 10
(...skipping 2590 matching lines...) Expand 10 before | Expand all | Expand 10 after
2601 return TRUE; 2601 return TRUE;
2602 } 2602 }
2603 if (str.Find(L"www.") != -1) { 2603 if (str.Find(L"www.") != -1) {
2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2605 strBeCheck = L"http://" + strBeCheck; 2605 strBeCheck = L"http://" + strBeCheck;
2606 return TRUE; 2606 return TRUE;
2607 } 2607 }
2608 return FALSE; 2608 return FALSE;
2609 } 2609 }
2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { 2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2611 str.MakeLower();
2612 int aPos = str.Find(L'@'); 2611 int aPos = str.Find(L'@');
2612 // Invalid when no '@'.
2613 if (aPos < 1) { 2613 if (aPos < 1) {
2614 return FALSE; 2614 return FALSE;
2615 } 2615 }
2616 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') { 2616
2617 return FALSE; 2617 // Check the local part.
2618 } 2618 int pPos = aPos; // Used to track the position of '@' or '.'.
2619 int i; 2619 for (int i = aPos - 1; i >= 0; i--) {
2620 for (i = aPos - 1; i >= 0; i--) {
2621 FX_WCHAR ch = str.GetAt(i); 2620 FX_WCHAR ch = str.GetAt(i);
2622 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || 2621 if (ch == L'_' || ch == L'-' || FX_ISWALPHA(ch) || FX_ISWDIGIT(ch)) {
2623 (ch >= L'0' && ch <= L'9')) {
2624 continue; 2622 continue;
2625 } else { 2623 }
2624 if (ch != L'.' || i == pPos - 1 || i == 0) {
2626 if (i == aPos - 1) { 2625 if (i == aPos - 1) {
2626 // There is '.' or invalid char before '@'.
2627 return FALSE; 2627 return FALSE;
2628 } 2628 }
2629 str = str.Right(str.GetLength() - i - 1); 2629 // End extracting for other invalid chars, '.' at the beginning, or
2630 // consecutive '.'.
2631 int removed_len = i == pPos - 1 ? i + 2 : i + 1;
2632 str = str.Right(str.GetLength() - removed_len);
2630 break; 2633 break;
2631 } 2634 }
2635 // Found a valid '.'.
2636 pPos = i;
2632 } 2637 }
2633 aPos = str.Find(L'@'); 2638
2634 if (aPos < 1) { 2639 // Check the domain name part.
2635 return FALSE;
2636 }
2637 CFX_WideString strtemp = L"";
2638 for (i = 0; i < aPos; i++) {
2639 FX_WCHAR wch = str.GetAt(i);
2640 if (wch >= L'a' && wch <= L'z') {
2641 break;
2642 } else {
2643 strtemp = str.Right(str.GetLength() - i + 1);
2644 }
2645 }
2646 if (strtemp != L"") {
2647 str = strtemp;
2648 }
2649 aPos = str.Find(L'@'); 2640 aPos = str.Find(L'@');
2650 if (aPos < 1) { 2641 if (aPos < 1) {
2651 return FALSE; 2642 return FALSE;
2652 } 2643 }
2653 str.TrimRight(L'.'); 2644 str.TrimRight(L'.');
2654 strtemp = str; 2645 CFX_WideString strtemp = str;
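Lei Zhang 2015/12/18 00:24:09 Not used? [refers to strtemp on new line 2645: it is assigned from str but never read afterwards in the new code]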
Wei Li 2015/12/18 01:12:21 Done, thank you for your keen eyes. :)
2655 int ePos = str.Find(L'.'); 2646 // At least one '.' in domain name, but not at the beginning.
2656 if (ePos == -1) { 2647 int ePos = str.Find(L'.', aPos + 1);
2648 if (ePos == -1 || ePos == aPos + 1) {
2657 return FALSE; 2649 return FALSE;
2658 } 2650 }
2659 while (ePos != -1) { 2651 // Validate all other chars in domain name.
2660 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1); 2652 int nLen = str.GetLength();
2661 ePos = strtemp.Find('.'); 2653 pPos = 0; // Used to track the position of '.'.
2662 } 2654 for (int i = aPos + 1; i < nLen; i++) {
2663 ePos = strtemp.GetLength();
2664 for (i = 0; i < ePos; i++) {
2665 FX_WCHAR wch = str.GetAt(i); 2655 FX_WCHAR wch = str.GetAt(i);
2666 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { 2656 if (wch == L'-' || FX_ISWALPHA(wch) || FX_ISWDIGIT(wch)) {
2667 continue; 2657 continue;
2668 } else {
2669 str = str.Left(str.GetLength() - ePos + i + 1);
2670 ePos = ePos - i - 1;
2671 break;
2672 } 2658 }
2673 } 2659 if (wch != L'.' || i == pPos + 1) {
2674 int nLen = str.GetLength(); 2660 // Domain name should end before invalid char.
2675 for (i = aPos + 1; i < nLen - ePos; i++) { 2661 int host_end = i == pPos + 1 ? i - 2 : i - 1;
2676 FX_WCHAR wch = str.GetAt(i); 2662 if (pPos > 0 && host_end - aPos >= 3) {
2677 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') || 2663 // Trim the ending invalid chars if there is at least one '.' and name.
2678 (wch >= L'0' && wch <= L'9')) { 2664 str = str.Left(host_end + 1);
2679 continue; 2665 break;
2680 } else { 2666 }
2681 return FALSE; 2667 return FALSE;
2682 } 2668 }
2669 pPos = i;
2683 } 2670 }
2671
2684 if (str.Find(L"mailto:") == -1) { 2672 if (str.Find(L"mailto:") == -1) {
2685 str = L"mailto:" + str; 2673 str = L"mailto:" + str;
2686 } 2674 }
2687 return TRUE; 2675 return TRUE;
2688 } 2676 }
2689 2677
2690 void CPDF_LinkExtract::AppendToLinkList(int start, 2678 void CPDF_LinkExtract::AppendToLinkList(int start,
2691 int count, 2679 int count,
2692 const CFX_WideString& strUrl) { 2680 const CFX_WideString& strUrl) {
2693 CPDF_LinkExt* linkInfo = new CPDF_LinkExt; 2681 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
2726 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { 2714 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2727 return; 2715 return;
2728 } 2716 }
2729 CPDF_LinkExt* link = NULL; 2717 CPDF_LinkExt* link = NULL;
2730 link = m_LinkList.GetAt(index); 2718 link = m_LinkList.GetAt(index);
2731 if (!link) { 2719 if (!link) {
2732 return; 2720 return;
2733 } 2721 }
2734 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); 2722 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2735 } 2723 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698