Index: net/base/net_util.cc |
=================================================================== |
--- net/base/net_util.cc (revision 52955) |
+++ net/base/net_util.cc (working copy) |
@@ -6,6 +6,7 @@ |
#include <algorithm> |
#include <map> |
+#include <unicode/regex.h> |
#include <unicode/ucnv.h> |
#include <unicode/uidna.h> |
#include <unicode/ulocdata.h> |
@@ -621,6 +622,13 @@ |
L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14" |
L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]" |
L"[\ufffa-\ufffd]]"), status); |
+ DCHECK(U_SUCCESS(status)); |
+ icu::RegexMatcher dangerous_patterns(icu::UnicodeString( |
+ // Lone katakana no, so, or n |
+ L"([^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]" |
+ // Repeating Japanese accent characters |
+ L"|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c])"), |
+ 0, status); |
#else |
icu::UnicodeSet dangerous_characters(icu::UnicodeString( |
"[[\\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338" |
@@ -631,13 +639,26 @@ |
"\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14" |
"\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]" |
"[\\ufffa-\\ufffd]]", -1, US_INV), status); |
+ DCHECK(U_SUCCESS(status)); |
+ icu::RegexMatcher dangerous_patterns(icu::UnicodeString( |
+ // Lone katakana no, so, or n (escapes are decoded by ICU, not by C++) |
+ "([^\\p{Katakana}][\\u30ce\\u30f3\\u30bd][^\\p{Katakana}]" |
+ // Repeating Japanese accent characters |
+ "|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c])"), |
+ 0, status); |
#endif |
DCHECK(U_SUCCESS(status)); |
icu::UnicodeSet component_characters; |
- component_characters.addAll(icu::UnicodeString(str, str_len)); |
+ icu::UnicodeString component_string(str, str_len); |
+ component_characters.addAll(component_string); |
if (dangerous_characters.containsSome(component_characters)) |
return false; |
+ DCHECK(U_SUCCESS(status)); |
+ dangerous_patterns.reset(component_string); |
+ if (dangerous_patterns.find()) |
+ return false; |
+ |
// If the language list is empty, the result is completely determined |
// by whether a component is a single script or not. This will block |
// even "safe" script mixing cases like <Chinese, Latin-ASCII> that are |