Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(148)

Side by Side Diff: core/src/fpdftext/fpdf_text_int.cpp

Issue 1530763005: Correctly extracting email addresses (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master
Patch Set: Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « BUILD.gn ('k') | core/src/fpdftext/fpdf_text_int_unittest.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2014 PDFium Authors. All rights reserved. 1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 6
7 #include <cctype> 7 #include <cctype>
8 #include <cwctype> 8 #include <cwctype>
9 #include <algorithm> 9 #include <algorithm>
10 10
(...skipping 2592 matching lines...) Expand 10 before | Expand all | Expand 10 after
2603 if (str.Find(L"www.") != -1) { 2603 if (str.Find(L"www.") != -1) {
2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2605 strBeCheck = L"http://" + strBeCheck; 2605 strBeCheck = L"http://" + strBeCheck;
2606 return TRUE; 2606 return TRUE;
2607 } 2607 }
2608 return FALSE; 2608 return FALSE;
2609 } 2609 }
2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { 2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2611 str.MakeLower(); 2611 str.MakeLower();
2612 int aPos = str.Find(L'@'); 2612 int aPos = str.Find(L'@');
2613 // Invalid when no '@'.
2613 if (aPos < 1) { 2614 if (aPos < 1) {
2614 return FALSE; 2615 return FALSE;
2615 } 2616 }
2616 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') {
2617 return FALSE;
2618 }
2619 int i; 2617 int i;
jun_fang 2015/12/17 13:30:45 nit: prefer "for (int i = aPos - 1; i >= 0; i--)"
Wei Li 2015/12/17 17:41:41 Done. I was not sure what style I should follow w…
2618 int pPos = aPos; // Used to track the position of '@' or '.'.
2620 for (i = aPos - 1; i >= 0; i--) { 2619 for (i = aPos - 1; i >= 0; i--) {
2621 FX_WCHAR ch = str.GetAt(i); 2620 FX_WCHAR ch = str.GetAt(i);
2622 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || 2621 if (ch == L'_' || ch == L'-' || (ch >= L'a' && ch <= L'z') ||
jun_fang 2015/12/17 13:30:45 Should we handle upper-case letters in email addre…
Wei Li 2015/12/17 17:41:41 Done. I don't know why the original code chose to…
2623 (ch >= L'0' && ch <= L'9')) { 2622 (ch >= L'0' && ch <= L'9')) {
2624 continue; 2623 continue;
2625 } else { 2624 }
2625 if (ch != L'.' || i == pPos - 1 || i == 0) {
2626 if (i == aPos - 1) { 2626 if (i == aPos - 1) {
2627 // There is '.' or invalid char before '@'.
2627 return FALSE; 2628 return FALSE;
2628 } 2629 }
2629 str = str.Right(str.GetLength() - i - 1); 2630 // End extracting for other invalid chars, '.' at the beginning, or
2631 // consecutive '.'.
2632 int removed_len = i == pPos - 1 ? i + 2 : i + 1;
2633 str = str.Right(str.GetLength() - removed_len);
2630 break; 2634 break;
2635 } else {
2636 // Found a valid '.'.
2637 pPos = i;
2631 } 2638 }
2632 } 2639 }
2633 aPos = str.Find(L'@'); 2640
2634 if (aPos < 1) { 2641 // Check the host name part.
2635 return FALSE;
2636 }
2637 CFX_WideString strtemp = L"";
2638 for (i = 0; i < aPos; i++) {
2639 FX_WCHAR wch = str.GetAt(i);
2640 if (wch >= L'a' && wch <= L'z') {
2641 break;
2642 } else {
2643 strtemp = str.Right(str.GetLength() - i + 1);
2644 }
2645 }
2646 if (strtemp != L"") {
2647 str = strtemp;
2648 }
2649 aPos = str.Find(L'@'); 2642 aPos = str.Find(L'@');
2650 if (aPos < 1) { 2643 if (aPos < 1) {
2651 return FALSE; 2644 return FALSE;
2652 } 2645 }
2653 str.TrimRight(L'.'); 2646 str.TrimRight(L'.');
2654 strtemp = str; 2647 CFX_WideString strtemp = str;
2655 int ePos = str.Find(L'.'); 2648 // At least one '.' in host name, but not at the beginning.
2656 if (ePos == -1) { 2649 int ePos = str.Find(L'.', aPos + 1);
2650 if (ePos == -1 || ePos == aPos + 1) {
2657 return FALSE; 2651 return FALSE;
2658 } 2652 }
2659 while (ePos != -1) { 2653 // Validate all other chars in host name.
2660 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1); 2654 int nLen = str.GetLength();
2661 ePos = strtemp.Find('.'); 2655 pPos = 0; // Used to track the position of '.'.
2662 } 2656 for (i = aPos + 1; i < nLen; i++) {
jun_fang 2015/12/17 13:30:45 nit: for (int i = aPos + 1; i < nLen; i++).
Wei Li 2015/12/17 17:41:41 Done.
2663 ePos = strtemp.GetLength();
2664 for (i = 0; i < ePos; i++) {
2665 FX_WCHAR wch = str.GetAt(i); 2657 FX_WCHAR wch = str.GetAt(i);
2666 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { 2658 if (wch == L'-' || (wch >= L'a' && wch <= L'z') ||
jun_fang 2015/12/17 13:30:45 How about upper-case letters?
Wei Li 2015/12/17 17:41:41 See above.
2659 (wch >= L'0' && wch <= L'9')) {
2667 continue; 2660 continue;
2661 }
2662 if (wch != L'.' || i == pPos + 1) {
2663 // Host name should end before invalid char.
2664 int host_end = i == pPos + 1 ? i - 2 : i - 1;
2665 if (pPos > 0 && host_end - aPos >= 3) {
2666 // Trim the ending invalid chars if there is at least one '.' and name.
2667 str = str.Left(host_end + 1);
2668 break;
2669 }
2670 return FALSE;
2668 } else { 2671 } else {
2669 str = str.Left(str.GetLength() - ePos + i + 1); 2672 pPos = i;
2670 ePos = ePos - i - 1;
2671 break;
2672 } 2673 }
2673 } 2674 }
2674 int nLen = str.GetLength(); 2675
2675 for (i = aPos + 1; i < nLen - ePos; i++) {
2676 FX_WCHAR wch = str.GetAt(i);
2677 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') ||
2678 (wch >= L'0' && wch <= L'9')) {
2679 continue;
2680 } else {
2681 return FALSE;
2682 }
2683 }
2684 if (str.Find(L"mailto:") == -1) { 2676 if (str.Find(L"mailto:") == -1) {
2685 str = L"mailto:" + str; 2677 str = L"mailto:" + str;
2686 } 2678 }
2687 return TRUE; 2679 return TRUE;
2688 } 2680 }
2689 2681
2690 void CPDF_LinkExtract::AppendToLinkList(int start, 2682 void CPDF_LinkExtract::AppendToLinkList(int start,
2691 int count, 2683 int count,
2692 const CFX_WideString& strUrl) { 2684 const CFX_WideString& strUrl) {
2693 CPDF_LinkExt* linkInfo = new CPDF_LinkExt; 2685 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
2726 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { 2718 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2727 return; 2719 return;
2728 } 2720 }
2729 CPDF_LinkExt* link = NULL; 2721 CPDF_LinkExt* link = NULL;
2730 link = m_LinkList.GetAt(index); 2722 link = m_LinkList.GetAt(index);
2731 if (!link) { 2723 if (!link) {
2732 return; 2724 return;
2733 } 2725 }
2734 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); 2726 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2735 } 2727 }
OLD | NEW
« no previous file with comments | « BUILD.gn ('k') | core/src/fpdftext/fpdf_text_int_unittest.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698