Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(2458)

Unified Diff: core/src/fpdftext/fpdf_text_int.cpp

Issue 1530763005: Correctly extracting email addresses (Closed) Base URL: https://pdfium.googlesource.com/pdfium.git@master
Patch Set: more comments Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « core/include/fxcrt/fx_ext.h ('k') | core/src/fpdftext/fpdf_text_int_unittest.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: core/src/fpdftext/fpdf_text_int.cpp
diff --git a/core/src/fpdftext/fpdf_text_int.cpp b/core/src/fpdftext/fpdf_text_int.cpp
index 1e6d54d13398eff87710b2747300d9ad45f0b3f5..d7a9c47519e74db436647b1eed4907e7ccb313c3 100644
--- a/core/src/fpdftext/fpdf_text_int.cpp
+++ b/core/src/fpdftext/fpdf_text_int.cpp
@@ -14,6 +14,7 @@
#include "core/include/fpdfapi/fpdf_resource.h"
#include "core/include/fpdftext/fpdf_text.h"
#include "core/include/fxcrt/fx_bidi.h"
+#include "core/include/fxcrt/fx_ext.h"
#include "core/include/fxcrt/fx_ucd.h"
#include "text_int.h"
#include "third_party/base/nonstd_unique_ptr.h"
@@ -2607,80 +2608,69 @@ FX_BOOL CPDF_LinkExtract::CheckWebLink(CFX_WideString& strBeCheck) {
}
return FALSE;
}
-FX_BOOL CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
- str.MakeLower();
+bool CPDF_LinkExtract::CheckMailLink(CFX_WideString& str) {
int aPos = str.Find(L'@');
+ // Invalid when '@' is missing or is the first character.
if (aPos < 1) {
return FALSE;
}
- if (str.GetAt(aPos - 1) == L'.' || str.GetAt(aPos - 1) == L'_') {
- return FALSE;
- }
- int i;
- for (i = aPos - 1; i >= 0; i--) {
+
+ // Check the local part.
+ int pPos = aPos; // Used to track the position of '@' or '.'.
+ for (int i = aPos - 1; i >= 0; i--) {
FX_WCHAR ch = str.GetAt(i);
- if (ch == L'_' || ch == L'.' || (ch >= L'a' && ch <= L'z') ||
- (ch >= L'0' && ch <= L'9')) {
+ if (ch == L'_' || ch == L'-' || FXSYS_iswalnum(ch)) {
continue;
- } else {
+ }
+ if (ch != L'.' || i == pPos - 1 || i == 0) {
if (i == aPos - 1) {
+ // There is '.' or invalid char before '@'.
return FALSE;
}
- str = str.Right(str.GetLength() - i - 1);
+ // End extracting for other invalid chars, '.' at the beginning, or
+ // consecutive '.'.
+ int removed_len = i == pPos - 1 ? i + 2 : i + 1;
+ str = str.Right(str.GetLength() - removed_len);
break;
}
+ // Found a valid '.'.
+ pPos = i;
}
- aPos = str.Find(L'@');
- if (aPos < 1) {
- return FALSE;
- }
- CFX_WideString strtemp = L"";
- for (i = 0; i < aPos; i++) {
- FX_WCHAR wch = str.GetAt(i);
- if (wch >= L'a' && wch <= L'z') {
- break;
- } else {
- strtemp = str.Right(str.GetLength() - i + 1);
- }
- }
- if (strtemp != L"") {
- str = strtemp;
- }
+
+ // Check the domain name part.
aPos = str.Find(L'@');
if (aPos < 1) {
return FALSE;
}
str.TrimRight(L'.');
- strtemp = str;
- int ePos = str.Find(L'.');
- if (ePos == -1) {
+ // At least one '.' in domain name, but not at the beginning.
+ // TODO(weili): RFC5322 allows domain names to be a local name without '.'.
+ // Check whether we should remove this check.
+ int ePos = str.Find(L'.', aPos + 1);
+ if (ePos == -1 || ePos == aPos + 1) {
return FALSE;
}
- while (ePos != -1) {
- strtemp = strtemp.Right(strtemp.GetLength() - ePos - 1);
- ePos = strtemp.Find('.');
- }
- ePos = strtemp.GetLength();
- for (i = 0; i < ePos; i++) {
- FX_WCHAR wch = str.GetAt(i);
- if ((wch >= L'a' && wch <= L'z') || (wch >= L'0' && wch <= L'9')) {
- continue;
- } else {
- str = str.Left(str.GetLength() - ePos + i + 1);
- ePos = ePos - i - 1;
- break;
- }
- }
+ // Validate all other chars in domain name.
int nLen = str.GetLength();
- for (i = aPos + 1; i < nLen - ePos; i++) {
+ pPos = 0; // Used to track the position of '.'.
+ for (int i = aPos + 1; i < nLen; i++) {
FX_WCHAR wch = str.GetAt(i);
- if (wch == L'-' || wch == L'.' || (wch >= L'a' && wch <= L'z') ||
- (wch >= L'0' && wch <= L'9')) {
+ if (wch == L'-' || FXSYS_iswalnum(wch)) {
continue;
- } else {
+ }
+ if (wch != L'.' || i == pPos + 1) {
+ // Domain name should end before invalid char.
+ int host_end = i == pPos + 1 ? i - 2 : i - 1;
+ if (pPos > 0 && host_end - aPos >= 3) {
+ // Trim the ending invalid chars if there is at least one '.' and name.
+ str = str.Left(host_end + 1);
+ break;
+ }
return FALSE;
}
+ pPos = i;
}
+
if (str.Find(L"mailto:") == -1) {
str = L"mailto:" + str;
}
« no previous file with comments | « core/include/fxcrt/fx_ext.h ('k') | core/src/fpdftext/fpdf_text_int_unittest.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698