net/base/net_util.cc - Issue 3011012: Add URL filter to trigger punycode for Japanese homographic sequences....

Side by Side Diff: net/base/net_util.cc

Issue 3011012: Add URL filter to trigger punycode for Japanese homographic sequences.... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 10 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/base/net_util.h"	5 #include "net/base/net_util.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <map>	8 #include <map>

	9 #include <unicode/regex.h>

9 #include <unicode/ucnv.h>	10 #include <unicode/ucnv.h>

10 #include <unicode/uidna.h>	11 #include <unicode/uidna.h>

11 #include <unicode/ulocdata.h>	12 #include <unicode/ulocdata.h>

12 #include <unicode/uniset.h>	13 #include <unicode/uniset.h>

13 #include <unicode/uscript.h>	14 #include <unicode/uscript.h>

14 #include <unicode/uset.h>	15 #include <unicode/uset.h>

15	16

16 #include "build/build_config.h"	17 #include "build/build_config.h"

17	18

18 #if defined(OS_WIN)	19 #if defined(OS_WIN)

(...skipping 595 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
614 #ifdef U_WCHAR_IS_UTF16	615 #ifdef U_WCHAR_IS_UTF16

615 icu::UnicodeSet dangerous_characters(icu::UnicodeString(	616 icu::UnicodeSet dangerous_characters(icu::UnicodeString(

616 L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338"	617 L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338"

617 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"	618 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"

618 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"	619 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"

619 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"	620 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"

620 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"	621 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"

621 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"	622 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"

622 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"	623 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"

623 L"[\ufffa-\ufffd]]"), status);	624 L"[\ufffa-\ufffd]]"), status);

	625 DCHECK(U_SUCCESS(status));

	626 icu::RegexMatcher dangerous_patterns(icu::UnicodeString(

	627 // Lone katakana no, so, or n

	628 L"([^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]"

	629 // Repeating Japanese accent characters

	630 L"|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c])"),

	631 0, status);

624 #else	632 #else

625 icu::UnicodeSet dangerous_characters(icu::UnicodeString(	633 icu::UnicodeSet dangerous_characters(icu::UnicodeString(

626 "[[\\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338"	634 "[[\\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338"

627 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"	635 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"

628 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"	636 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"

629 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"	637 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"

630 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"	638 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"

631 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"	639 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"

632 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"	640 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"

633 "[\\ufffa-\\ufffd]]", -1, US_INV), status);	641 "[\\ufffa-\\ufffd]]", -1, US_INV), status);

	642 DCHECK(U_SUCCESS(status));

	643 icu::RegexMatcher dangerous_patterns(icu::UnicodeString(

	644 // Lone katakana no, so, or n

	645 "([^\\p{Katakana}][\\u30ce\\u30f3\u30bd][^\\p{Katakana}]"

	646 // Repeating Japanese accent characters

	647 "|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c])"),

	648 0, status);

634 #endif	649 #endif

635 DCHECK(U_SUCCESS(status));	650 DCHECK(U_SUCCESS(status));

636 icu::UnicodeSet component_characters;	651 icu::UnicodeSet component_characters;

637 component_characters.addAll(icu::UnicodeString(str, str_len));	652 icu::UnicodeString component_string(str, str_len);

	653 component_characters.addAll(component_string);

638 if (dangerous_characters.containsSome(component_characters))	654 if (dangerous_characters.containsSome(component_characters))

639 return false;	655 return false;

640	656

	657 DCHECK(U_SUCCESS(status));

	658 dangerous_patterns.reset(component_string);

	659 if (dangerous_patterns.find())

	660 return false;

	661

641 // If the language list is empty, the result is completely determined	662 // If the language list is empty, the result is completely determined

642 // by whether a component is a single script or not. This will block	663 // by whether a component is a single script or not. This will block

643 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are	664 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are

644 // allowed with |languages| (while it blocks Chinese + Latin letters with	665 // allowed with |languages| (while it blocks Chinese + Latin letters with

645 // an accent as should be the case), but we want to err on the safe side	666 // an accent as should be the case), but we want to err on the safe side

646 // when |languages| is empty.	667 // when |languages| is empty.

647 if (languages.empty())	668 if (languages.empty())

648 return IsIDNComponentInSingleScript(str, str_len);	669 return IsIDNComponentInSingleScript(str, str_len);

649	670

650 // |common_characters| is made up of ASCII numbers, hyphen, plus and	671 // |common_characters| is made up of ASCII numbers, hyphen, plus and

(...skipping 1265 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1916 }	1937 }

1917	1938

1918 int GetPortFromAddrinfo(const struct addrinfo* info) {	1939 int GetPortFromAddrinfo(const struct addrinfo* info) {

1919 uint16* port_field = GetPortFieldFromAddrinfo(info);	1940 uint16* port_field = GetPortFieldFromAddrinfo(info);

1920 if (!port_field)	1941 if (!port_field)

1921 return -1;	1942 return -1;

1922 return ntohs(*port_field);	1943 return ntohs(*port_field);

1923 }	1944 }

1924	1945

1925 } // namespace net	1946 } // namespace net

OLD	NEW

« no previous file with comments | « no previous file | net/base/net_util_unittest.cc » ('j') | no next file with comments »