core/src/fpdftext/fpdf_text_int.cpp - Issue 1530763005: Correctly extracting email addresses

Side by Side Diff: core/src/fpdftext/fpdf_text_int.cpp

Issue 1530763005: Correctly extracting email addresses (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master

Patch Set: more comments Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 PDFium Authors. All rights reserved.	1 // Copyright 2014 PDFium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com	5 // Original code copyright 2014 Foxit Software Inc. http://www.foxitsoftware.com

6	6

7 #include <cctype>	7 #include <cctype>

8 #include <cwctype>	8 #include <cwctype>

9 #include <algorithm>	9 #include <algorithm>

10	10

11 #include "core/include/fpdfapi/fpdf_module.h"	11 #include "core/include/fpdfapi/fpdf_module.h"

12 #include "core/include/fpdfapi/fpdf_page.h"	12 #include "core/include/fpdfapi/fpdf_page.h"

13 #include "core/include/fpdfapi/fpdf_pageobj.h"	13 #include "core/include/fpdfapi/fpdf_pageobj.h"

14 #include "core/include/fpdfapi/fpdf_resource.h"	14 #include "core/include/fpdfapi/fpdf_resource.h"

15 #include "core/include/fpdftext/fpdf_text.h"	15 #include "core/include/fpdftext/fpdf_text.h"

16 #include "core/include/fxcrt/fx_bidi.h"	16 #include "core/include/fxcrt/fx_bidi.h"

	17 #include "core/include/fxcrt/fx_ext.h"

17 #include "core/include/fxcrt/fx_ucd.h"	18 #include "core/include/fxcrt/fx_ucd.h"

18 #include "text_int.h"	19 #include "text_int.h"

19 #include "third_party/base/nonstd_unique_ptr.h"	20 #include "third_party/base/nonstd_unique_ptr.h"

20	21

21 namespace {	22 namespace {

22	23

23 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {	24 FX_BOOL _IsIgnoreSpaceCharacter(FX_WCHAR curChar) {

24 if (curChar < 255) {	25 if (curChar < 255) {

25 return FALSE;	26 return FALSE;

26 }	27 }

(...skipping 2573 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2600 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));	2601 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"https://"));

2601 return TRUE;	2602 return TRUE;

2602 }	2603 }

2603 if (str.Find(L"www.") != -1) {	2604 if (str.Find(L"www.") != -1) {

2604 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));	2605 strBeCheck = strBeCheck.Right(str.GetLength() - str.Find(L"www."));

2605 strBeCheck = L"http://" + strBeCheck;	2606 strBeCheck = L"http://" + strBeCheck;

2606 return TRUE;	2607 return TRUE;

2607 }	2608 }

2608 return FALSE;	2609 return FALSE;

2609 }	2610 }

2610 FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {	2611 bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {

2611 str.MakeLower();

2612 int aPos = str.Find(L'@');	2612 int aPos = str.Find(L'@');

	2613 // Invalid when no '@'.

2613 if (aPos < 1) {	2614 if (aPos < 1) {

2614 return FALSE;	2615 return FALSE;

2615 }	2616 }

2616 if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') {	2617

2617 return FALSE;	2618 // Check the local part.

2618 }	2619 int pPos = aPos; // Used to track the position of '@' or '.'.

2619 int i;	2620 for (int i = aPos - 1; i >= 0; i--) {

2620 for (i = aPos - 1; i >= 0; i--) {

2621 FX_WCHAR ch = str.GetAt(i);	2621 FX_WCHAR ch = str.GetAt(i);

2622 if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') ||	2622 if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {

2623 (ch >= L'0' && ch <= L'9')) {

2624 continue;	2623 continue;

2625 } else {	2624 }

	2625 if (ch != L'.' || i == pPos - 1 || i == 0) {

2626 if (i == aPos - 1) {	2626 if (i == aPos - 1) {

	2627 // There is '.' or invalid char before '@'.

2627 return FALSE;	2628 return FALSE;

2628 }	2629 }

2629 str = str.Right(str.GetLength() - i - 1);	2630 // End extracting for other invalid chars, '.' at the beginning, or

	2631 // consecutive '.'.

	2632 int removed_len = i == pPos - 1 ? i + 2 : i + 1;

	2633 str = str.Right(str.GetLength() - removed_len);

2630 break;	2634 break;

2631 }	2635 }

	2636 // Found a valid '.'.

	2637 pPos = i;

2632 }	2638 }

2633 aPos = str.Find(L'@');	2639

2634 if (aPos < 1) {	2640 // Check the domain name part.

2635 return FALSE;

2636 }

2637 CFX_WideString strtemp = L"";

2638 for (i = 0; i < aPos; i++) {

2639 FX_WCHAR wch = str.GetAt(i);

2640 if (wch >= L'a' && wch <= L'z') {

2641 break;

2642 } else {

2643 strtemp = str.Right(str.GetLength() - i + 1);

2644 }

2645 }

2646 if (strtemp != L"") {

2647 str = strtemp;

2648 }

2649 aPos = str.Find(L'@');	2641 aPos = str.Find(L'@');

2650 if (aPos < 1) {	2642 if (aPos < 1) {

2651 return FALSE;	2643 return FALSE;

2652 }	2644 }

2653 str.TrimRight(L'.');	2645 str.TrimRight(L'.');

2654 strtemp = str;	2646 // At least one '.' in domain name, but not at the beginning.

2655 int ePos = str.Find(L'.');	2647 // TODO(weili): RFC5322 allows domain names to be a local name without '.'.

2656 if (ePos == -1) {	2648 // Check whether we should remove this check.

	2649 int ePos = str.Find(L'.', aPos + 1);

	2650 if (ePos == -1 || ePos == aPos + 1) {

2657 return FALSE;	2651 return FALSE;

2658 }	2652 }

2659 while (ePos != -1) {	2653 // Validate all other chars in domain name.

2660 strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1);	2654 int nLen = str.GetLength();

2661 ePos = strtemp.Find('.');	2655 pPos = 0; // Used to track the position of '.'.

2662 }	2656 for (int i = aPos + 1; i < nLen; i++) {

2663 ePos = strtemp.GetLength();

2664 for (i = 0; i < ePos; i++) {

2665 FX_WCHAR wch = str.GetAt(i);	2657 FX_WCHAR wch = str.GetAt(i);

2666 if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {	2658 if (wch == L'-' || FXSYS_iswalnum(wch)) {

2667 continue;	2659 continue;

2668 } else {

2669 str = str.Left(str.GetLength() - ePos + i + 1);

2670 ePos = ePos - i - 1;

2671 break;

2672 }	2660 }

2673 }	2661 if (wch != L'.' || i == pPos + 1) {

2674 int nLen = str.GetLength();	2662 // Domain name should end before invalid char.

2675 for (i = aPos + 1; i < nLen - ePos; i++) {	2663 int host_end = i == pPos + 1 ? i - 2 : i - 1;

2676 FX_WCHAR wch = str.GetAt(i);	2664 if (pPos > 0 && host_end - aPos >= 3) {

2677 if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') ||	2665 // Trim the ending invalid chars if there is at least one '.' and name.

2678 (wch >= L'0' && wch <= L'9')) {	2666 str = str.Left(host_end + 1);

2679 continue;	2667 break;

2680 } else {	2668 }

2681 return FALSE;	2669 return FALSE;

2682 }	2670 }

	2671 pPos = i;

2683 }	2672 }

	2673

2684 if (str.Find(L"mailto:") == -1) {	2674 if (str.Find(L"mailto:") == -1) {

2685 str = L"mailto:" + str;	2675 str = L"mailto:" + str;

2686 }	2676 }

2687 return TRUE;	2677 return TRUE;

2688 }	2678 }

2689	2679

2690 void CPDF_LinkExtract::AppendToLinkList(int start,	2680 void CPDF_LinkExtract::AppendToLinkList(int start,

2691 int count,	2681 int count,

2692 const CFX_WideString& strUrl) {	2682 const CFX_WideString& strUrl) {

2693 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;	2683 CPDF_LinkExt* linkInfo = new CPDF_LinkExt;

(...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
2726 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {	2716 if (!m_bIsParsed || index < 0 || index >= m_LinkList.GetSize()) {

2727 return;	2717 return;

2728 }	2718 }

2729 CPDF_LinkExt* link = NULL;	2719 CPDF_LinkExt* link = NULL;

2730 link = m_LinkList.GetAt(index);	2720 link = m_LinkList.GetAt(index);

2731 if (!link) {	2721 if (!link) {

2732 return;	2722 return;

2733 }	2723 }

2734 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);	2724 m_pTextPage->GetRectArray(link->m_Start, link->m_Count, rects);

2735 }	2725 }

OLD	NEW

« no previous file with comments | « core/include/fxcrt/fx_ext.h ('k') | core/src/fpdftext/fpdf_text_int_unittest.cpp » ('j') | no next file with comments »