core/src/fpdftext/fpdf_text_int.cpp - Issue 1530763005: Correctly extracting email addresses

Side by Side Diff: core/src/fpdftext/fpdf_text_int.cpp

Issue 1530763005: Correctly extracting email addresses (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master

Patch Set: Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 PDFium Authors. All rights reserved.	1 // Copyright 2014 PDFium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com	5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

6	6

7 #include <cctype>	7 #include <cctype>

8 #include <cwctype>	8 #include <cwctype>

9 #include <algorithm>	9 #include <algorithm>

10	10

(...skipping 2592 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2603 if (str.Find(L"www.") != -1) {	2603 if (str.Find(L"www.") != -1) {

2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));	2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));

2605 strBeCheck = L"http://" + strBeCheck;	2605 strBeCheck = L"http://" + strBeCheck;

2606 return TRUE;	2606 return TRUE;

2607 }	2607 }

2608 return FALSE;	2608 return FALSE;

2609 }	2609 }

2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {	2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {

2611 str.MakeLower();	2611 str.MakeLower();

2612 int aPos = str.Find(L'@');	2612 int aPos = str.Find(L'@');

	2613 // Invalid when no '@'.

2613 if (aPos < 1) {	2614 if (aPos < 1) {

2614 return FALSE;	2615 return FALSE;

2615 }	2616 }

2616 if (str.GetAt(aPos - 1) == L'.' \|\| str.GetAt(aPos - 1) == L'_') {

2617 return FALSE;

2618 }

2619 int i;	2617 int i;
	jun_fang 2015/12/17 13:30:45: nit: prefer "for (int i = aPos - 1; i >= 0; i--)"
	Wei Li 2015/12/17 17:41:41: On 2015/12/17 13:30:45, jun_fang wrote: > nit: prefer "for (int i = aPos - 1; i >= 0; i--)" Done. I was not sure what style I should follow when I changed the original code. But glad to change this one. :)
	2618 int pPos = aPos; // Used to track the position of '@' or '.'.

2620 for (i = aPos - 1; i >= 0; i--) {	2619 for (i = aPos - 1; i >= 0; i--) {

2621 FX_WCHAR ch = str.GetAt(i);	2620 FX_WCHAR ch = str.GetAt(i);

2622 if (ch == L'_' \|\| ch == L'.' \|\| (ch >= L'a' && ch <= L'z') \|\|	2621 if (ch == L'_' \|\| ch == L'-' \|\| (ch >= L'a' && ch <= L'z') \|\|
	jun_fang 2015/12/17 13:30:45: Should we handle upper-case letters in email address?
	Wei Li 2015/12/17 17:41:41: On 2015/12/17 13:30:45, jun_fang wrote: > Should we handle upper-case letters in email address? Done. I don't know why the original code chose to be case insensitive. Since it seems no harm, I kept it. But keeping the case is better.
2623 (ch >= L'0' && ch <= L'9')) {	2622 (ch >= L'0' && ch <= L'9')) {

2624 continue;	2623 continue;

2625 } else {	2624 }

	2625 if (ch != L'.' \|\| i == pPos - 1 \|\| i == 0) {

2626 if (i == aPos - 1) {	2626 if (i == aPos - 1) {

	2627 // There is '.' or invalid char before '@'.

2627 return FALSE;	2628 return FALSE;

2628 }	2629 }

2629 str = str.Right(str.GetLength() - i - 1);	2630 // End extracting for other invalid chars, '.' at the beginning, or

	2631 // consecutive '.'.

	2632 int removed_len = i == pPos - 1 ? i + 2 : i + 1;

	2633 str = str.Right(str.GetLength() - removed_len);

2630 break;	2634 break;

	2635 } else {

	2636 // Found a valid '.'.

	2637 pPos = i;

2631 }	2638 }

2632 }	2639 }

2633 aPos = str.Find(L'@');	2640

2634 if (aPos < 1) {	2641 // Check the host name part.

2635 return FALSE;

2636 }

2637 CFX_WideString strtemp = L"";

2638 for (i = 0; i < aPos; i++) {

2639 FX_WCHAR wch = str.GetAt(i);

2640 if (wch >= L'a' && wch <= L'z') {

2641 break;

2642 } else {

2643 strtemp = str.Right(str.GetLength() - i + 1);

2644 }

2645 }

2646 if (strtemp != L"") {

2647 str = strtemp;

2648 }

2649 aPos = str.Find(L'@');	2642 aPos = str.Find(L'@');

2650 if (aPos < 1) {	2643 if (aPos < 1) {

2651 return FALSE;	2644 return FALSE;

2652 }	2645 }

2653 str.TrimRight(L'.');	2646 str.TrimRight(L'.');

2654 strtemp = str;	2647 CFX_WideString strtemp = str;

2655 int ePos = str.Find(L'.');	2648 // At least one '.' in host name, but not at the beginning.

2656 if (ePos == -1) {	2649 int ePos = str.Find(L'.', aPos + 1);

	2650 if (ePos == -1 \|\| ePos == aPos + 1) {

2657 return FALSE;	2651 return FALSE;

2658 }	2652 }

2659 while (ePos != -1) {	2653 // Validate all other chars in host name.

2660 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1);	2654 int nLen = str.GetLength();

2661 ePos = strtemp.Find('.');	2655 pPos = 0; // Used to track the position of '.'.

2662 }	2656 for (i = aPos + 1; i < nLen; i++) {
	jun_fang 2015/12/17 13:30:45: nit: for (int i = aPos + 1; i < nLen; i++).
	Wei Li 2015/12/17 17:41:41: On 2015/12/17 13:30:45, jun_fang wrote: > nit: for (int i = aPos + 1; i < nLen; i++). Done.
2663 ePos = strtemp.GetLength();

2664 for (i = 0; i < ePos; i++) {

2665 FX_WCHAR wch = str.GetAt(i);	2657 FX_WCHAR wch = str.GetAt(i);

2666 if ((wch >= L'a' && wch <= L'z') \|\| (wch >= L'0' && wch <= L'9')) {	2658 if (wch == L'-' \|\| (wch >= L'a' && wch <= L'z') \|\|
	jun_fang 2015/12/17 13:30:45: How about upper-case letters?
	Wei Li 2015/12/17 17:41:41: On 2015/12/17 13:30:45, jun_fang wrote: > How about upper-case letters? See above.
	2659 (wch >= L'0' && wch <= L'9')) {

2667 continue;	2660 continue;

	2661 }

	2662 if (wch != L'.' \|\| i == pPos + 1) {

	2663 // Host name should end before invalid char.

	2664 int host_end = i == pPos + 1 ? i - 2 : i - 1;

	2665 if (pPos > 0 && host_end - aPos >= 3) {

	2666 // Trim the ending invalid chars if there is at least one '.' and name.

	2667 str = str.Left(host_end + 1);

	2668 break;

	2669 }

	2670 return FALSE;

2668 } else {	2671 } else {

2669 str = str.Left(str.GetLength() - ePos + i + 1);	2672 pPos = i;

2670 ePos = ePos - i - 1;

2671 break;

2672 }	2673 }

2673 }	2674 }

2674 int nLen = str.GetLength();	2675

2675 for (i = aPos + 1; i < nLen - ePos; i++) {

2676 FX_WCHAR wch = str.GetAt(i);

2677 if (wch == L'-' \|\| wch == L'.' \|\| (wch >= L'a' && wch <= L'z') \|\|

2678 (wch >= L'0' && wch <= L'9')) {

2679 continue;

2680 } else {

2681 return FALSE;

2682 }

2683 }

2684 if (str.Find(L"mailto:") == -1) {	2676 if (str.Find(L"mailto:") == -1) {

2685 str = L"mailto:" + str;	2677 str = L"mailto:" + str;

2686 }	2678 }

2687 return TRUE;	2679 return TRUE;

2688 }	2680 }

2689	2681

2690 void CPDF_LinkExtract::AppendToLinkList(int start,	2682 void CPDF_LinkExtract::AppendToLinkList(int start,

2691 int count,	2683 int count,

2692 const CFX_WideString& strUrl) {	2684 const CFX_WideString& strUrl) {

2693 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;	2685 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
2726 if (!m_bIsParsed \|\| index < 0 \|\| index >= m_LinkList.GetSize()) {	2718 if (!m_bIsParsed \|\| index < 0 \|\| index >= m_LinkList.GetSize()) {

2727 return;	2719 return;

2728 }	2720 }

2729 CPDF_LinkExt* link = NULL;	2721 CPDF_LinkExt* link = NULL;

2730 link = m_LinkList.GetAt(index);	2722 link = m_LinkList.GetAt(index);

2731 if (!link) {	2723 if (!link) {

2732 return;	2724 return;

2733 }	2725 }

2734 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);	2726 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);

2735 }	2727 }

OLD	NEW

« no previous file with comments | « BUILD.gn ('k') | core/src/fpdftext/fpdf_text_int_unittest.cpp » ('j') | no next file with comments »