Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(106)

Side by Side Diff: core/src/fpdftext/fpdf_text_int.cpp

Issue 1530763005: Correctly extracting email addresses (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master
Patch Set: Make case sensitive Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 PDFium Authors. All rights reserved. 1 // Copyright 2014 PDFium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com 5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com
6 6
7 #include <cctype> 7 #include <cctype>
8 #include <cwctype> 8 #include <cwctype>
9 #include <algorithm> 9 #include <algorithm>
10 10
(...skipping 2590 matching lines...) Expand 10 before | Expand all | Expand 10 after
2601 return TRUE; 2601 return TRUE;
2602 } 2602 }
2603 if (str.Find(L"www.") != -1) { 2603 if (str.Find(L"www.") != -1) {
2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www.")); 2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));
2605 strBeCheck = L"http://" + strBeCheck; 2605 strBeCheck = L"http://" + strBeCheck;
2606 return TRUE; 2606 return TRUE;
2607 } 2607 }
2608 return FALSE; 2608 return FALSE;
2609 } 2609 }
2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) { 2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
2611 str.MakeLower();
2612 int aPos = str.Find(L'@'); 2611 int aPos = str.Find(L'@');
2612 // Invalid when no '@'.
2613 if (aPos < 1) { 2613 if (aPos < 1) {
2614 return FALSE; 2614 return FALSE;
2615 } 2615 }
2616 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') { 2616
Lei Zhang 2015/12/17 19:13:29 Maybe add a comment to say checking the local part
Wei Li 2015/12/17 19:42:27 Done.
2617 return FALSE; 2617 int pPos = aPos; // Used to track the position of '@' or '.'.
2618 } 2618 for (int i = aPos - 1; i >= 0; i--) {
2619 int i;
2620 for (i = aPos - 1; i >= 0; i--) {
2621 FX_WCHAR ch = str.GetAt(i); 2619 FX_WCHAR ch = str.GetAt(i);
2622 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') || 2620 if (ch == L'_' || ch == L'-' || (ch >= L'A' && ch <= L'Z') ||
2623 (ch >= L'0' && ch <= L'9')) { 2621 (ch >= L'a' && ch <= L'z') || (ch >= L'0' && ch <= L'9')) {
2624 continue; 2622 continue;
2625 } else { 2623 }
2624 if (ch != L'.' || i == pPos - 1 || i == 0) {
2626 if (i == aPos - 1) { 2625 if (i == aPos - 1) {
2626 // There is '.' or invalid char before '@'.
2627 return FALSE; 2627 return FALSE;
2628 } 2628 }
2629 str = str.Right(str.GetLength() - i - 1); 2629 // End extracting for other invalid chars, '.' at the beginning, or
2630 // consecutive '.'.
2631 int removed_len = i == pPos - 1 ? i + 2 : i + 1;
2632 str = str.Right(str.GetLength() - removed_len);
2630 break; 2633 break;
2634 } else {
Lei Zhang 2015/12/17 19:13:29 no need for else after a break.
Wei Li 2015/12/17 19:42:27 Done.
2635 // Found a valid '.'.
2636 pPos = i;
2631 } 2637 }
2632 } 2638 }
2633 aPos = str.Find(L'@'); 2639
2634 if (aPos < 1) { 2640 // Check the host name part.
2635 return FALSE;
2636 }
2637 CFX_WideString strtemp = L"";
2638 for (i = 0; i < aPos; i++) {
2639 FX_WCHAR wch = str.GetAt(i);
2640 if (wch >= L'a' && wch <= L'z') {
2641 break;
2642 } else {
2643 strtemp = str.Right(str.GetLength() - i + 1);
2644 }
2645 }
2646 if (strtemp != L"") {
2647 str = strtemp;
2648 }
2649 aPos = str.Find(L'@'); 2641 aPos = str.Find(L'@');
2650 if (aPos < 1) { 2642 if (aPos < 1) {
2651 return FALSE; 2643 return FALSE;
2652 } 2644 }
2653 str.TrimRight(L'.'); 2645 str.TrimRight(L'.');
2654 strtemp = str; 2646 CFX_WideString strtemp = str;
2655 int ePos = str.Find(L'.'); 2647 // At least one '.' in host name, but not at the beginning.
2656 if (ePos == -1) { 2648 int ePos = str.Find(L'.', aPos + 1);
2649 if (ePos == -1 || ePos == aPos + 1) {
2657 return FALSE; 2650 return FALSE;
2658 } 2651 }
2659 while (ePos != -1) { 2652 // Validate all other chars in host name.
2660 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1); 2653 int nLen = str.GetLength();
2661 ePos = strtemp.Find('.'); 2654 pPos = 0; // Used to track the position of '.'.
2662 } 2655 for (int i = aPos + 1; i < nLen; i++) {
2663 ePos = strtemp.GetLength();
2664 for (i = 0; i < ePos; i++) {
2665 FX_WCHAR wch = str.GetAt(i); 2656 FX_WCHAR wch = str.GetAt(i);
2666 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) { 2657 if (wch == L'-' || (wch >= L'A' && wch <= L'Z') ||
Lei Zhang 2015/12/17 19:13:29 Maybe add some helpers like IsLetter() and IsNumber()...
Wei Li 2015/12/17 19:42:27 Done.
2658 (wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {
2667 continue; 2659 continue;
2660 }
2661 if (wch != L'.' || i == pPos + 1) {
2662 // Host name should end before invalid char.
2663 int host_end = i == pPos + 1 ? i - 2 : i - 1;
2664 if (pPos > 0 && host_end - aPos >= 3) {
2665 // Trim the ending invalid chars if there is at least one '.' and name.
2666 str = str.Left(host_end + 1);
2667 break;
2668 }
2669 return FALSE;
2668 } else { 2670 } else {
Lei Zhang 2015/12/17 19:13:29 ditto
Wei Li 2015/12/17 19:42:27 Done.
2669 str = str.Left(str.GetLength() - ePos + i + 1); 2671 pPos = i;
2670 ePos = ePos - i - 1;
2671 break;
2672 } 2672 }
2673 } 2673 }
2674 int nLen = str.GetLength(); 2674
2675 for (i = aPos + 1; i < nLen - ePos; i++) {
2676 FX_WCHAR wch = str.GetAt(i);
2677 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') ||
2678 (wch >= L'0' && wch <= L'9')) {
2679 continue;
2680 } else {
2681 return FALSE;
2682 }
2683 }
2684 if (str.Find(L"mailto:") == -1) { 2675 if (str.Find(L"mailto:") == -1) {
2685 str = L"mailto:" + str; 2676 str = L"mailto:" + str;
2686 } 2677 }
2687 return TRUE; 2678 return TRUE;
2688 } 2679 }
2689 2680
2690 void CPDF_LinkExtract::AppendToLinkList(int start, 2681 void CPDF_LinkExtract::AppendToLinkList(int start,
2691 int count, 2682 int count,
2692 const CFX_WideString& strUrl) { 2683 const CFX_WideString& strUrl) {
2693 CPDF_LinkExt* linkInfo = new CPDF_LinkExt; 2684 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;
(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after
2726 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) { 2717 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {
2727 return; 2718 return;
2728 } 2719 }
2729 CPDF_LinkExt* link = NULL; 2720 CPDF_LinkExt* link = NULL;
2730 link = m_LinkList.GetAt(index); 2721 link = m_LinkList.GetAt(index);
2731 if (!link) { 2722 if (!link) {
2732 return; 2723 return;
2733 } 2724 }
2734 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects); 2725 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);
2735 } 2726 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698