components/url_formatter/url_formatter.cc - Issue 1258813002: Implement a new IDN display policy

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 1258813002: Implement a new IDN display policy (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: typo fix Created 5 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/url_formatter.h"	5 #include "components/url_formatter/url_formatter.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <map>

9 #include <utility>	8 #include <utility>

10	9

11 #include "base/lazy_instance.h"	10 #include "base/lazy_instance.h"

12 #include "base/logging.h"	11 #include "base/logging.h"

13 #include "base/macros.h"	12 #include "base/macros.h"

14 #include "base/memory/singleton.h"	13 #include "base/memory/scoped_ptr.h"

15 #include "base/stl_util.h"	14 #include "base/numerics/safe_conversions.h"

16 #include "base/strings/string_tokenizer.h"	15 #include "base/strings/string_piece.h"

17 #include "base/strings/string_util.h"	16 #include "base/strings/string_util.h"

18 #include "base/strings/utf_offset_string_conversions.h"	17 #include "base/strings/utf_offset_string_conversions.h"

19 #include "base/strings/utf_string_conversions.h"	18 #include "base/strings/utf_string_conversions.h"

20 #include "base/synchronization/lock.h"	19 #include "base/synchronization/lock.h"

21 #include "third_party/icu/source/common/unicode/uidna.h"	20 #include "third_party/icu/source/common/unicode/uidna.h"

22 #include "third_party/icu/source/common/unicode/uniset.h"	21 #include "third_party/icu/source/common/unicode/uniset.h"

23 #include "third_party/icu/source/common/unicode/uscript.h"

24 #include "third_party/icu/source/i18n/unicode/regex.h"	22 #include "third_party/icu/source/i18n/unicode/regex.h"

25 #include "third_party/icu/source/i18n/unicode/ulocdata.h"	23 #include "third_party/icu/source/i18n/unicode/uspoof.h"

26 #include "url/gurl.h"	24 #include "url/gurl.h"

27 #include "url/third_party/mozilla/url_parse.h"	25 #include "url/third_party/mozilla/url_parse.h"

28	26

29 namespace url_formatter {	27 namespace url_formatter {

30	28

31 namespace {	29 namespace {

32	30

33 base::string16 IDNToUnicodeWithAdjustments(	31 base::string16 IDNToUnicodeWithAdjustments(

34 const std::string& host,	32 const std::string& host,

35 const std::string& languages,

36 base::OffsetAdjuster::Adjustments* adjustments);	33 base::OffsetAdjuster::Adjustments* adjustments);

37 bool IDNToUnicodeOneComponent(const base::char16* comp,	34 bool IDNToUnicodeOneComponent(const base::char16* comp,

38 size_t comp_len,	35 size_t comp_len,

39 const std::string& languages,

40 base::string16* out);	36 base::string16* out);

41	37

42 class AppendComponentTransform {	38 class AppendComponentTransform {

43 public:	39 public:

44 AppendComponentTransform() {}	40 AppendComponentTransform() {}

45 virtual ~AppendComponentTransform() {}	41 virtual ~AppendComponentTransform() {}

46	42

47 virtual base::string16 Execute(	43 virtual base::string16 Execute(

48 const std::string& component_text,	44 const std::string& component_text,

49 base::OffsetAdjuster::Adjustments* adjustments) const = 0;	45 base::OffsetAdjuster::Adjustments* adjustments) const = 0;

50	46

51 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an	47 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an

52 // accessible copy constructor in order to call AppendFormattedComponent()	48 // accessible copy constructor in order to call AppendFormattedComponent()

53 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).	49 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).

54 };	50 };

55	51

56 class HostComponentTransform : public AppendComponentTransform {	52 class HostComponentTransform : public AppendComponentTransform {

57 public:	53 public:

58 explicit HostComponentTransform(const std::string& languages)	54 explicit HostComponentTransform() {}

59 : languages_(languages) {}

60	55

61 private:	56 private:

62 base::string16 Execute(	57 base::string16 Execute(

63 const std::string& component_text,	58 const std::string& component_text,

64 base::OffsetAdjuster::Adjustments* adjustments) const override {	59 base::OffsetAdjuster::Adjustments* adjustments) const override {

65 return IDNToUnicodeWithAdjustments(component_text, languages_, adjustments);	60 return IDNToUnicodeWithAdjustments(component_text, adjustments);

66 }	61 }

67

68 const std::string& languages_;

69 };	62 };

70	63

71 class NonHostComponentTransform : public AppendComponentTransform {	64 class NonHostComponentTransform : public AppendComponentTransform {

72 public:	65 public:

73 explicit NonHostComponentTransform(net::UnescapeRule::Type unescape_rules)	66 explicit NonHostComponentTransform(net::UnescapeRule::Type unescape_rules)

74 : unescape_rules_(unescape_rules) {}	67 : unescape_rules_(unescape_rules) {}

75	68

76 private:	69 private:

77 base::string16 Execute(	70 base::string16 Execute(

78 const std::string& component_text,	71 const std::string& component_text,

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
150 AdjustComponent(delta, &(parsed->host));	143 AdjustComponent(delta, &(parsed->host));

151 AdjustComponent(delta, &(parsed->port));	144 AdjustComponent(delta, &(parsed->port));

152 AdjustComponent(delta, &(parsed->path));	145 AdjustComponent(delta, &(parsed->path));

153 AdjustComponent(delta, &(parsed->query));	146 AdjustComponent(delta, &(parsed->query));

154 AdjustComponent(delta, &(parsed->ref));	147 AdjustComponent(delta, &(parsed->ref));

155 }	148 }

156	149

157 // Helper for FormatUrlWithOffsets().	150 // Helper for FormatUrlWithOffsets().

158 base::string16 FormatViewSourceUrl(	151 base::string16 FormatViewSourceUrl(

159 const GURL& url,	152 const GURL& url,

160 const std::string& languages,

161 FormatUrlTypes format_types,	153 FormatUrlTypes format_types,

162 net::UnescapeRule::Type unescape_rules,	154 net::UnescapeRule::Type unescape_rules,

163 url::Parsed* new_parsed,	155 url::Parsed* new_parsed,

164 size_t* prefix_end,	156 size_t* prefix_end,

165 base::OffsetAdjuster::Adjustments* adjustments) {	157 base::OffsetAdjuster::Adjustments* adjustments) {

166 DCHECK(new_parsed);	158 DCHECK(new_parsed);

167 const char kViewSource[] = "view-source:";	159 const char kViewSource[] = "view-source:";

168 const size_t kViewSourceLength = arraysize(kViewSource) - 1;	160 const size_t kViewSourceLength = arraysize(kViewSource) - 1;

169	161

170 // Format the underlying URL and record adjustments.	162 // Format the underlying URL and record adjustments.

171 const std::string& url_str(url.possibly_invalid_spec());	163 const std::string& url_str(url.possibly_invalid_spec());

172 adjustments->clear();	164 adjustments->clear();

173 base::string16 result(	165 base::string16 result(

174 base::ASCIIToUTF16(kViewSource) +	166 base::ASCIIToUTF16(kViewSource) +

175 FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)),	167 FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)),

176 languages, format_types, unescape_rules,	168 std::string(), format_types, unescape_rules,

177 new_parsed, prefix_end, adjustments));	169 new_parsed, prefix_end, adjustments));

178 // Revise \|adjustments\| by shifting to the offsets to prefix that the above	170 // Revise \|adjustments\| by shifting to the offsets to prefix that the above

179 // call to FormatUrl didn't get to see.	171 // call to FormatUrl didn't get to see.

180 for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin();	172 for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin();

181 it != adjustments->end(); ++it)	173 it != adjustments->end(); ++it)

182 it->original_offset += kViewSourceLength;	174 it->original_offset += kViewSourceLength;

183	175

184 // Adjust positions of the parsed components.	176 // Adjust positions of the parsed components.

185 if (new_parsed->scheme.is_nonempty()) {	177 if (new_parsed->scheme.is_nonempty()) {

186 // Assume "view-source:real-scheme" as a scheme.	178 // Assume "view-source:real-scheme" as a scheme.

187 new_parsed->scheme.len += kViewSourceLength;	179 new_parsed->scheme.len += kViewSourceLength;

188 } else {	180 } else {

189 new_parsed->scheme.begin = 0;	181 new_parsed->scheme.begin = 0;

190 new_parsed->scheme.len = kViewSourceLength - 1;	182 new_parsed->scheme.len = kViewSourceLength - 1;

191 }	183 }

192 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);	184 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);

193	185

194 if (prefix_end)	186 if (prefix_end)

195 *prefix_end += kViewSourceLength;	187 *prefix_end += kViewSourceLength;

196	188

197 return result;	189 return result;

198 }	190 }

199	191

200 // TODO(brettw) bug 734373: check the scripts for each host component and	192 // TODO(brettw) We may want to skip this step in the case of file URLs to

201 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for	193 // allow unicode UNC hostnames regardless of encodings.

202 // scripts that the user has installed. For now, just put the entire

203 // path through IDN. Maybe this feature can be implemented in ICU itself?

204 //

205 // We may want to skip this step in the case of file URLs to allow unicode

206 // UNC hostnames regardless of encodings.

207 base::string16 IDNToUnicodeWithAdjustments(	194 base::string16 IDNToUnicodeWithAdjustments(

208 const std::string& host,	195 const std::string& host,

209 const std::string& languages,

210 base::OffsetAdjuster::Adjustments* adjustments) {	196 base::OffsetAdjuster::Adjustments* adjustments) {

211 if (adjustments)	197 if (adjustments)

212 adjustments->clear();	198 adjustments->clear();

213 // Convert the ASCII input to a base::string16 for ICU.	199 // Convert the ASCII input to a base::string16 for ICU.

214 base::string16 input16;	200 base::string16 input16;

215 input16.reserve(host.length());	201 input16.reserve(host.length());

216 input16.insert(input16.end(), host.begin(), host.end());	202 input16.insert(input16.end(), host.begin(), host.end());

217	203

218 // Do each component of the host separately, since we enforce script matching	204 // Do each component of the host separately, since we enforce script matching

219 // on a per-component basis.	205 // on a per-component basis.

220 base::string16 out16;	206 base::string16 out16;

221 for (size_t component_start = 0, component_end;	207 for (size_t component_start = 0, component_end;

222 component_start < input16.length();	208 component_start < input16.length();

223 component_start = component_end + 1) {	209 component_start = component_end + 1) {

224 // Find the end of the component.	210 // Find the end of the component.

225 component_end = input16.find('.', component_start);	211 component_end = input16.find('.', component_start);

226 if (component_end == base::string16::npos)	212 if (component_end == base::string16::npos)

227 component_end = input16.length(); // For getting the last component.	213 component_end = input16.length(); // For getting the last component.

228 size_t component_length = component_end - component_start;	214 size_t component_length = component_end - component_start;

229 size_t new_component_start = out16.length();	215 size_t new_component_start = out16.length();

230 bool converted_idn = false;	216 bool converted_idn = false;

231 if (component_end > component_start) {	217 if (component_end > component_start) {

232 // Add the substring that we just found.	218 // Add the substring that we just found.

233 converted_idn =	219 converted_idn =

234 IDNToUnicodeOneComponent(input16.data() + component_start,	220 IDNToUnicodeOneComponent(input16.data() + component_start,

235 component_length, languages, &out16);	221 component_length, &out16);

236 }	222 }

237 size_t new_component_length = out16.length() - new_component_start;	223 size_t new_component_length = out16.length() - new_component_start;

238	224

239 if (converted_idn && adjustments) {	225 if (converted_idn && adjustments) {

240 adjustments->push_back(base::OffsetAdjuster::Adjustment(	226 adjustments->push_back(base::OffsetAdjuster::Adjustment(

241 component_start, component_length, new_component_length));	227 component_start, component_length, new_component_length));

242 }	228 }

243	229

244 // Need to add the dot we just found (if we found one).	230 // Need to add the dot we just found (if we found one).

245 if (component_end < input16.length())	231 if (component_end < input16.length())

246 out16.push_back('.');	232 out16.push_back('.');

247 }	233 }

248 return out16;	234 return out16;

249 }	235 }

250	236

251 // Does some simple normalization of scripts so we can allow certain scripts	237 // A helper class for IDN Spoof checker 1st pass. When created lazily

252 // to exist together.	238 // the first time it's necessary, it'll initialize and set up \|USpoofChecker\|

253 // TODO(brettw) bug 880223: we should allow some other languages to be	239 // with the level of script mixing, allowed characters, and types of checks.
	Ryan Sleevi 2015/09/08 21:45:58 Comment nit: I found this comment a bit hard to re Comment nit: I found this comment a bit hard to read. I've tried rewriting it, both to make sure I correctly understand and to offer feedback. // A helper class for the first pass of IDN spoof checking. I'm not clear what "level of script mixing, allowed characters, and types of checks" is meant to convey. As I read it, it suggests parameters / inputs, but perhaps you mean it as "The policies that the Chromium security and internationalization teams have decided on", in which case, it might be best set // A helper class for the first pass of IDN spoof checking, used to // ensure that no IDN input meets Chromium's standard of spoofability. // For a more thorough explanation of how spoof checking works in // Chromium, see [link to Markdown file explaining?] jungshik at Google 2016/03/11 20:05:37 Done. Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi wrote: > Comment nit: I found this comment a bit hard to read. I've tried rewriting it, > both to make sure I correctly understand and to offer feedback. > > // A helper class for the first pass of IDN spoof checking. > > I'm not clear what "level of script mixing, allowed characters, and types of > checks" is meant to convey. As I read it, it suggests parameters / inputs, but > perhaps you mean it as "The policies that the Chromium security and > internationalization teams have decided on", in which case, it might be best set > > // A helper class for the first pass of IDN spoof checking, used to > // ensure that no IDN input meets Chromium's standard of spoofability. > // For a more thorough explanation of how spoof checking works in > // Chromium, see [link to Markdown file explaining?] Done.
254 // combined such as Chinese and Latin. We will probably need a more	240 class IDNSpoofChecker {

255 // complicated system of language pairs to have more fine-grained control.	241 public:

256 UScriptCode NormalizeScript(UScriptCode code) {	242 IDNSpoofChecker();

257 switch (code) {	243 // Return true if \|label\| is deemed safe to display in Unicode. The check

258 case USCRIPT_KATAKANA:	244 // is done with \|checker_\|. In case \|checker_\| is not properly initialized,

259 case USCRIPT_HIRAGANA:	245 // all the IDN labels are regarded as unsafe and false is returned.

260 case USCRIPT_KATAKANA_OR_HIRAGANA:	246 // Besides, it internally calls IDNSpoofCheckerExtra::Check if additional

261 case USCRIPT_HANGUL: // This one is arguable.	247 // check is necessary.
	Ryan Sleevi 2015/09/08 21:45:58 Comment nit: Suggested rewording // Returns true Comment nit: Suggested rewording // Returns true if \|label\| is safe to display as Unicode. In the event // of library failure, all IDN inputs will be treated as unsafe. In this reword, I try to remove the documentation about the implementation ("is done with \|checker_\|", "Besides, it internally calls"), since that should be both obvious from the code and fragile to document. Instead, it just spells out the API contract. jungshik at Google 2016/03/11 20:05:37 Done. Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi wrote: > Comment nit: Suggested rewording > > // Returns true if \|label\| is safe to display as Unicode. In the event > // of library failure, all IDN inputs will be treated as unsafe. > > > In this reword, I try to remove the documentation about the implementation ("is > done with \|checker_\|", "Besides, it internally calls"), since that should be > both obvious from the code and fragile to document. Instead, it just spells out > the API contract. Done.
262 return USCRIPT_HAN;	248 bool Check(base::StringPiece16 label);

263 default:	249

264 return code;	250 private:

	251 USpoofChecker* checker_;

	252 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

	253 };

	254

	255 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =

	256 LAZY_INSTANCE_INITIALIZER;

	257

	258 // A helper class for IDN Spoof checker 2nd pass. It is lazily created

	259 // the first time it's necessary and initializes a Unicode set and

	260 // a regex necessary for extra checks done on top of checks done with

	261 // \|USpoofChecker\| in IDNSpoofChecker.
	Ryan Sleevi 2015/09/08 21:45:59 // A helper class for the second pass of IDN spoof // A helper class for the second pass of IDN spoof checking. // For a more thorough explanation of how spoof checking works in Chromium, // see [link to Markdown file explaining first and second passes] It's unclear to me why this is a separate helper object, as opposed to just being one spoof checker that does both passes, but I haven't read the rest of the code yet, I'm just basing it on "read this file and try to understand from header comments what will happen" jungshik at Google 2016/03/11 20:05:37 I removed this extra class and fold that into the Show quoted text On 2015/09/08 21:45:59, Ryan Sleevi wrote: > // A helper class for the second pass of IDN spoof checking. > // For a more thorough explanation of how spoof checking works in Chromium, > // see [link to Markdown file explaining first and second passes] > > It's unclear to me why this is a separate helper object, as opposed to just > being one spoof checker that does both passes, but I haven't read the rest of > the code yet, I'm just basing it on "read this file and try to understand from > header comments what will happen" I removed this extra class and fold that into the first class.
	262 class IDNSpoofCheckerExtra {

	263 public:

	264 IDNSpoofCheckerExtra();

	265 // Return true if extra checks on \|label\| find it to be safe to display

	266 // in Unicode. Called by IDNSpoofChecker::Check.
	Ryan Sleevi 2015/09/08 21:45:58 Comment nit: I would remove the second sentence - Comment nit: I would remove the second sentence - that's discussing implementation details of some 'other' implementation (even if it is in the same file) // Returns true if \|label\| is safe to display as Unicode according to // extended spoofability criteria. Because this extended criteria may // be expensive, it should only be used after first ensuring that \|label\| // is safe using IDNSpoofChecker::Check. Of course, the problem with the above suggested reword is you see how tightly coupled IDNSpoofChecker and IDNSpoofCheckerExtra are. If Extra is an implementation detail of IDNSpoofChecker, then SpoofChecker shouldn't be talking about it. If it's the caller's responsibility to call both, then we should document that. But if IDNSpoofChecker is calling IDNSpoofCheckerExtra and that's the API contract, then the comments should be clearer to that point, such as // Implementation detail of IDNSpoofChecker. // As some checks for spoofability are expensive to compute, IDNSpoofCheckerExtra // implements the secondary checks after a given \|label\| has passed // the initial spoof checking. // This class should never be called directly - spoofability should // be checked with IDNSpoofChecker - and just exists to encapsulate the // more expensive logic. jungshik at Google* 2016/03/11 20:05:37 Initially, I expected ExtraCheck to have more expe Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi wrote: > Comment nit: I would remove the second sentence - that's discussing > implementation details of some 'other' implementation (even if it is in the same > file) > > // Returns true if \|label\| is safe to display as Unicode according to > // extended spoofability criteria. Because this extended criteria may > // be expensive, it should only be used after first ensuring that \|label\| > // is safe using IDNSpoofChecker::Check. 
> > > Of course, the problem with the above suggested reword is you see how tightly > coupled IDNSpoofChecker and IDNSpoofCheckerExtra are. If *Extra is an > implementation detail of IDNSpoofChecker, then SpoofChecker shouldn't be talking > about it. If it's the caller's responsibility to call both, then we should > document that. But if IDNSpoofChecker is calling IDNSpoofCheckerExtra and that's > the API contract, then the comments should be clearer to that point, such as > > // Implementation detail of IDNSpoofChecker. > // As some checks for spoofability are expensive to compute, > IDNSpoofCheckerExtra > // implements the secondary checks after a given \|label\| has passed > // the initial spoof checking. > // This class should never be called directly - spoofability should > // be checked with IDNSpoofChecker - and just exists to encapsulate the > // more expensive logic. Initially, I expected ExtraCheck to have more expensive logic (such as checking against known good domains, etc) than what it has now, but I didn't do that (at least in this CL) so that folding ExtraCheck into IDNSpoofChecker is simpler. There's a bit more initialization to do for extra check, but it's not expensive. When we have more expensive logic, we can separate that out.
	267 bool Check(base::StringPiece16 label);

	268

	269 private:

	270 icu::UnicodeSet non_ascii_latin_;

	271 icu::RegexPattern* dangerous_pattern_;

	272 DISALLOW_COPY_AND_ASSIGN(IDNSpoofCheckerExtra);

	273 };

	274

	275 base::LazyInstance<IDNSpoofCheckerExtra>::Leaky g_idn_spoof_checker_extra =

	276 LAZY_INSTANCE_INITIALIZER;

	277

	278 IDNSpoofChecker::IDNSpoofChecker() {

	279 UErrorCode status = U_ZERO_ERROR;

	280 checker_ = uspoof_open(&status);

	281 if (U_FAILURE(status)) {

	282 checker_ = nullptr;

	283 LOG(ERROR) << "IDN spoof checker failed to open with error, "

	284 << u_errorName(status)

	285 << " ; all IDN will be shown in punycode.";

	286 return;

265 }	287 }

	288

	289 // Use 'highly restrictive' restriction level to limit the script mixing

	290 // to Latin + Han + {Hiragana + Katakana, Bopomofo, Hangul}.

	291 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection

	292 // The default is highly restrictive so that it's not set explicitly.
	Ryan Sleevi 2015/09/08 21:45:58 I have trouble making sense of this sentence. It s I have trouble making sense of this sentence. It seems to suggest that the API call is unnecessary (because it's the default), but then you're setting it, so I'm not sure how to make sense of it. jungshik at Google 2016/03/11 20:05:37 Sorry for the confusion. (yeah, it's confusing.). Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi wrote: > I have trouble making sense of this sentence. It seems to suggest that the API > call is unnecessary (because it's the default), but then you're setting it, so > I'm not sure how to make sense of it. Sorry for the confusion. (yeah, it's confusing.). Anyway, I switched to moderately restrictive and the comment was rewritten accordingly.
	293 // TODO(jshin): Firefox uses 'moderately restrictive' by default. Review

	294 // using that, instead.
	Ryan Sleevi 2015/09/08 21:45:58 Is there a bug open for this review? Just some sor Is there a bug open for this review? Just some sort of tracking bug for Chromium :) jungshik at Google 2016/03/11 20:05:37 By switching to moderately restrictive, no more ne Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi (Slow to 2-21) wrote: > Is there a bug open for this review? Just some sort of tracking bug for Chromium > :) By switching to moderately restrictive, no more need for TODO.
	295 uspoof_setRestrictionLevel(checker_, USPOOF_HIGHLY_RESTRICTIVE);

	296

	297 // The recommended set and inclusion set come from

	298 // http://unicode.org/reports/tr39/ and

	299 // http://www.unicode.org/Public/security/latest/xidmodifications.txt

	300 // The list can undergo some changes as a new version of Unicode is

	301 // released and we update our copy of ICU.
	Ryan Sleevi 2015/09/08 21:45:58 Comment nit: Avoid pronouns in comments ( https:// Comment nit: Avoid pronouns in comments ( https://groups.google.com/a/chromium.org/forum/#!topic/chromium-dev/NH-S6KCkr2M ) // The recommended set and inclusion set come from ... // This list may change over time, and will be updated whenever the // included version of ICU is updated. (This avoids ambiguity for things like Chromium linking against system ICU, or non-Chromium distributions, etc - the "who is 'we'" question) jungshik at Google 2016/03/11 20:05:37 Done. Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi wrote: > Comment nit: Avoid pronouns in comments ( > https://groups.google.com/a/chromium.org/forum/#!topic/chromium-dev/NH-S6KCkr2M > ) > > // The recommended set and inclusion set come from ... > // This list may change over time, and will be updated whenever the > // included version of ICU is updated. > > (This avoids ambiguity for things like Chromium linking against system ICU, or > non-Chromium distributions, etc - the "who is 'we'" question) Done.
	302 const icu::UnicodeSet* recommended_set =

	303 uspoof_getRecommendedUnicodeSet(&status);

	304 icu::UnicodeSet allowed_set;

	305 allowed_set.addAll(*recommended_set);

	306 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(&status);

	307 allowed_set.addAll(*inclusion_set);
	Ryan Sleevi 2015/09/08 21:45:58 So you explain where the lists come from, but don' So you explain where the lists come from, but don't quite explain the policy here (as in, why is it OK to add these to the allowed sets). I suspect this is something that might be best in an .md or explainer. jungshik at Google 2016/03/11 20:05:37 Made the comment more self-explanatory. A separate Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi (Slow to 2-21) wrote: > So you explain where the lists come from, but don't quite explain the policy > here (as in, why is it OK to add these to the allowed sets). > > I suspect this is something that might be best in an .md or explainer. Made the comment more self-explanatory. A separate document will have more details.
	308

	309 // From UAX 31 Table 6:

	310 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts

	311 const icu::UnicodeSet aspirational_scripts(

	312 UNICODE_STRING_SIMPLE(

	313 "[[:sc=Cans:][:sc=Plrd:][:sc=Mong:][:sc=Tfng:][:sc=Yiii:]]"),

	314 status);

	315 allowed_set.addAll(aspirational_scripts);

	316

	317 // Add 'Black Heart Suit' and 'Circled White Star'.

	318 // TODO(jshin): How about other heart-like characters and Emoji (e.g.

	319 // U+1F600) ?
	Ryan Sleevi 2015/09/08 21:45:58 Document and explain :P Document and explain :P jungshik at Google 2016/03/11 20:05:37 Yeah... they're rather arbitrary (they come from . Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi wrote: > Document and explain :P Yeah... they're rather arbitrary (they come from ...) . I just nixed them for now. I need to give more thought to what to do with Emoji and other symbols. (it'd be great if we can just pass them all (most of them) and have an entirely different layer of defense against spoofing such as safe-browsing that can do a lot more off-line processing with a lot more data).
	320 allowed_set.add(0x2665u);

	321 allowed_set.add(0x272au);

	322

	323 // Remove the following three characters listed in Mozilla's blacklist (

	324 // http://kb.mozillazine.org/Network.IDN.blacklist_chars ) but

	325 // not yet excluded from \|allowed_set\| up to this point:

	326 // Combining Long Solidus Overlay, Hebrew Punctuation Gershayim, and

	327 // Hyphenation Point
	Ryan Sleevi 2015/09/08 21:45:58 comment nit: "." at the end of Hyphenation Point ( comment nit: "." at the end of Hyphenation Point (ending the sentence) comment nit: This reads a bit weird, the "not yet excluded from \|allowed_set\|". Up until this point, there's no exclusions from allowed_set, so it's kinda grammatically weird. A suggested reword might be something like // The following three characters are included in (recommended? inclusion) set, // but are blacklisted as part of Mozilla's IDN blacklist (link here). // Explicitly remove them, which does nothing if they're removed from // (recommended, inclusion?) set in a future ICU update. jungshik at Google 2016/03/11 20:05:37 I reviewed those 3 characters again and they can b Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi (Slow to 2-21) wrote: > comment nit: "." at the end of Hyphenation Point (ending the sentence) > > comment nit: This reads a bit weird, the "not yet excluded from \|allowed_set\|". > Up until this point, there's no exclusions from allowed_set, so it's kinda > grammatically weird. > > A suggested reword might be something like > > // The following three characters are included in (recommended? inclusion) set, > // but are blacklisted as part of Mozilla's IDN blacklist (link here). > // Explicitly remove them, which does nothing if they're removed from > // (recommended, inclusion?) set in a future ICU update. I reviewed those 3 characters again and they can be safely allowed except for U+0338 which is a bit problematic when rendered with a broken font/text rendering engine because it can be mistaken for a forward slash. (with the correct rendering, it should not). I'm dropping only that character.
	328 allowed_set.remove(0x338u); // Combining Long Solidus Overlay

	329 allowed_set.remove(0x5f4u); // Hebrew Punctuation Gershayim

	330 allowed_set.remove(0x2027u); // Hyphenation Point

	331

	332 // TODO(jshin): Decide what to do with '+' and U+0020. For now, leave

	333 // them out as Mozilla does.
	Ryan Sleevi 2015/09/08 21:45:58 Bug #? :) Bug #? :) jungshik at Google 2016/03/11 20:05:37 Not necessary. They'd better be kept out. (per bot Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi (Slow to 2-21) wrote: > Bug #? :) Not necessary. They'd better be kept out. (per both uts46 and idna2008) Removed the comment.
	334 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, &status);

	335

	336 int32_t checks = uspoof_getChecks(checker_, &status);

	337 // Do not allow mixed numbering systems (e.g. ASCII digits and

	338 // Devanagari digits), invisible characters, or multiple occurrences

	339 // of the same non-spacing mark.

	340 checks \|= USPOOF_MIXED_NUMBERS \| USPOOF_AUX_INFO;

	341

	342 // USPOOF_INVISIBLE should be on by this point without being

	343 // explicitly turned on.

	344 DCHECK(checks & USPOOF_INVISIBLE);

	345

	346 // Disable whole-script-confusable check because even 'pax' (Latin)

	347 // and b<u-umlaut>cher cannot pass the test because Cyrillic/Greek have

	348 // confusable characters for all letters in them.
	Ryan Sleevi 2015/09/08 21:45:59 comment nit: Grammatically, this is weird because comment nit: Grammatically, this is weird because of the double "because" // Disable whole-script-confusable checks. // While this check would be desirable, simple strings such as // 'pax' (Latin) and 'b<u-umlaut>cher' (Latin?) are whole script confusable // with other scripts (such as Cyrillic/Greek). [Did the above capture the concerns?] I'm also not sure the "check against a list of well known good domain names" - is this talking about whitelisted TLDs? whitelisted strings? Something else? jungshik at Google 2016/03/11 20:05:37 Rewrote the comment with more details on a possibl Show quoted text On 2015/09/08 21:45:59, Ryan Sleevi (Slow to 2-21) wrote: > comment nit: Grammatically, this is weird because of the double "because" > > // Disable whole-script-confusable checks. > // While this check would be desirable, simple strings such as > // 'pax' (Latin) and 'b<u-umlaut>cher' (Latin?) are whole script confusable > // with other scripts (such as Cyrillic/Greek). > > [Did the above capture the concerns?] > > I'm also not sure the "check against a list of well known good domain names" - > is this talking about whitelisted TLDs? whitelisted strings? Something else? Rewrote the comment with more details on a possible alternative.
	349 // TODO(jshin): Disabling this check has a downside. One way to alleviate

	350 // it is to check against a list of well-known good domain names.

	351 checks ^= USPOOF_WHOLE_SCRIPT_CONFUSABLE;

	352

	353 uspoof_setChecks(checker_, checks, &status);

	354 DCHECK(U_SUCCESS(status));

266 }	355 }

267	356

268 bool IsIDNComponentInSingleScript(const base::char16* str, int str_len) {	357 inline bool IDNSpoofChecker::Check(base::StringPiece16 label) {

269 UScriptCode first_script = USCRIPT_INVALID_CODE;	358 UErrorCode status = U_ZERO_ERROR;

270 bool is_first = true;	359 int32_t results = uspoof_check(checker_, label.data(),

	360 base::checked_cast<int32_t>(label.size()),

	361 NULL, &status);

	362 // If uspoof_check fails, consider all IDN unsafe to be conservative.

	363 if (U_FAILURE(status) \|\| results & USPOOF_ALL_CHECKS)

	364 return false;

271	365

272 int i = 0;	366 // If there's no script mixing, the input is regarded as safe

273 while (i < str_len) {	367 // without any extra check.

274 unsigned code_point;	368 if (results == USPOOF_ASCII \|\| results == USPOOF_SINGLE_SCRIPT_RESTRICTIVE)

275 U16_NEXT(str, i, str_len, code_point);	369 return true;

276	370

277 UErrorCode err = U_ZERO_ERROR;	371 return g_idn_spoof_checker_extra.Get().Check(label);

278 UScriptCode cur_script = uscript_getScript(code_point, &err);

279 if (err != U_ZERO_ERROR)

280 return false; // Report mixed on error.

281 cur_script = NormalizeScript(cur_script);

282

283 // TODO(brettw) We may have to check for USCRIPT_INHERENT as well.

284 if (is_first && cur_script != USCRIPT_COMMON) {

285 first_script = cur_script;

286 is_first = false;

287 } else {

288 if (cur_script != USCRIPT_COMMON && cur_script != first_script)

289 return false;

290 }

291 }

292 return true;

293 }	372 }

294	373

295 // Check if the script of a language can be 'safely' mixed with	374 IDNSpoofCheckerExtra::IDNSpoofCheckerExtra() {

296 // Latin letters in the ASCII range.	375 UErrorCode status = U_ZERO_ERROR;

297 bool IsCompatibleWithASCIILetters(const std::string& lang) {	376 non_ascii_latin_ = icu::UnicodeSet(

298 // For now, just list Chinese, Japanese and Korean (positive list).	377 UNICODE_STRING_SIMPLE("[[:sc=Latn:] - [a-zA-Z]]"), status);

299 // An alternative is negative-listing (languages using Greek and	378

300 // Cyrillic letters), but it can be more dangerous.	379 dangerous_pattern_ = icu::RegexPattern::compile(

301 return !lang.substr(0, 2).compare("zh") \|\| !lang.substr(0, 2).compare("ja") \|\|	380 UNICODE_STRING_SIMPLE(

302 !lang.substr(0, 2).compare("ko");	381 // Lone (out-of-context) katakana no, so, zo, or n

	382 // They can be mistaken for a slash. Only allow them

	383 // when enclosed by Katakana, Hiragana and Han.

	384 "[^\\p{Katakana}\\p{Hiragana}\\p{Han}]"

	385 "[\\u30ce\\u30f3\\u30bd\\u30be]"

	386 "[^\\p{Katakana}\\p{Hiragana}\\p{Han}]"
	Ryan Sleevi 2015/09/08 21:45:58 As written, doesn't this mean that a lone katakana As written, doesn't this mean that a lone katakana no, so, zo, or n (without any context) will be allowed, since it'll fail the regex before/after? Or that "\\u0061\\u30ce" would be treated as safe (I'm probably butchering things here) jungshik at Google 2016/03/11 20:05:37 Thank you for catching it. Rewrote the regex. Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi (Slow to 2-21) wrote: > As written, doesn't this mean that a lone katakana no, so, zo, or n (without any > context) will be allowed, since it'll fail the regex before/after? > > Or that > > "\\u0061\\u30ce" would be treated as safe (I'm probably butchering things here) Thank you for catching it. Rewrote the regex.
	387 // Repeating Japanese accent characters. USPOOF_INVISIBLE

	388 // only checks for a repeated occurrence of the same combining

	389 // mark, but we block a sequence of similarly looking
	Ryan Sleevi 2015/09/08 21:45:58 same comments about "we" same comments about "we" jungshik at Google 2016/03/11 20:05:37 Done. Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi (Slow to 2-21) wrote: > same comments about "we" Done.
	390 // Japanese combining marks as well.
	Ryan Sleevi 2015/09/08 21:45:58 So, overall comment wise, this is somewhat hard to So, overall comment wise, this is somewhat hard to read because the comments are written for the positive case, but the actual regex is the negative (dangerous patterns). It may help to ensure the comments match the negativity. // Disallow the katakana no, so, zo, or n, as they may be mistaken // for slashes, unless they're entirely enclosed by Katakana, Hiragana, // or Han scripts. // Disallow repeating Japanese accent characters. Because // USPOOF_INVISIBLE will only check for repeated occurrences of the // same combining mark, it's necessary to block any sequence of // similar looking Japanese combining marks as well. jungshik at Google 2016/03/11 20:05:37 Done. Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi (Slow to 2-21) wrote: > So, overall comment wise, this is somewhat hard to read because the comments are > written for the positive case, but the actual regex is the negative (dangerous > patterns). > > It may help to ensure the comments match the negativity. > > // Disallow the katakana no, so, zo, or n, as they may be mistaken > // for slashes, unless they're entirely enclosed by Katakana, Hiragana, > // or Han scripts. > > // Disallow repeating Japanese accent characters. Because > // USPOOF_INVISIBLE will only check for repeated occurrences of the > // same combining mark, it's necessary to block any sequence of > // similar looking Japanese combining marks as well. Done.
	391 "\|[\\u3099-\\u309c][\\u3099-\\u309c]"),

	392 0, status);

	393 DCHECK(U_SUCCESS(status));

303 }	394 }

304	395

305 typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap;	396 inline bool IDNSpoofCheckerExtra::Check(base::StringPiece16 label) {

	397 // This is called only if script mixing is detected.
	Ryan Sleevi 2015/09/08 21:45:59 Same comment about documenting 'how it's used' and Same comment about documenting 'how it's used' and such. This comment seems superfluous, but if you're trying to document a precondition, do it as such // \|label\| is a string that contains multiple, mixed scripts. jungshik at Google 2016/03/11 20:05:37 Done. Show quoted text On 2015/09/08 21:45:59, Ryan Sleevi wrote: > Same comment about documenting 'how it's used' and such. This comment seems > superfluous, but if you're trying to document a precondition, do it as such > > > // \|label\| is a string that contains multiple, mixed scripts. Done.
	398 // Limit Latin letters that can be mixed with other scripts to

	399 // ASCII-Latin instead of any Latin.
	Ryan Sleevi 2015/09/08 21:45:58 comment nit: s/ / / comment nit: s/ / / jungshik at Google 2016/03/11 20:05:37 Done. Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi wrote: > comment nit: s/ / / Done.
	400 icu::UnicodeString label_string(FALSE, label.data(),

	401 base::checked_cast<int32_t>(label.size()));

	402 if (non_ascii_latin_.containsSome(label_string))

	403 return false;

306	404

307 class LangToExemplarSet {	405 UErrorCode status = U_ZERO_ERROR;

308 public:	406 scoped_ptr<icu::RegexMatcher> dangerous_pattern_matcher(

309 static LangToExemplarSet* GetInstance() {	407 dangerous_pattern_->matcher(label_string, status));

310 return Singleton<LangToExemplarSet>::get();	408 DCHECK(U_SUCCESS(status));

311 }	409 return !dangerous_pattern_matcher->find();

312	410

313 private:	411 // TODO(jshin): Check spoofing attempt against a list of 'good' domains
	Ryan Sleevi 2015/09/08 21:45:58 BUG #? Is this domains (labels?) or TLDs? BUG #? Is this domains (labels?) or TLDs? jungshik at Google 2016/03/11 20:05:37 What I had in mind is now documented in another pa Show quoted text On 2015/09/08 21:45:58, Ryan Sleevi (Slow to 2-21) wrote: > BUG #? Is this domains (labels?) or TLDs? What I had in mind is now documented in another part of the code as a possible alternative with more details.
314 LangToExemplarSetMap map;

315 LangToExemplarSet() {}

316 ~LangToExemplarSet() {

317 STLDeleteContainerPairSecondPointers(map.begin(), map.end());

318 }

319

320 friend class Singleton<LangToExemplarSet>;

321 friend struct DefaultSingletonTraits<LangToExemplarSet>;

322 friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**);

323 friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*);

324

325 DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet);

326 };

327

328 bool GetExemplarSetForLang(const std::string& lang,

329 icu::UnicodeSet** lang_set) {

330 const LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;

331 LangToExemplarSetMap::const_iterator pos = map.find(lang);

332 if (pos != map.end()) {

333 *lang_set = pos->second;

334 return true;

335 }

336 return false;

337 }

338

339 void SetExemplarSetForLang(const std::string& lang, icu::UnicodeSet* lang_set) {

340 LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;

341 map.insert(std::make_pair(lang, lang_set));

342 }

343

344 static base::LazyInstance<base::Lock>::Leaky g_lang_set_lock =

345 LAZY_INSTANCE_INITIALIZER;

346

347 // Returns true if all the characters in component_characters are used by

348 // the language \|lang\|.

349 bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters,

350 const std::string& lang) {

351 CR_DEFINE_STATIC_LOCAL(const icu::UnicodeSet, kASCIILetters, ('a', 'z'));

352 icu::UnicodeSet* lang_set = nullptr;

353 // We're called from both the UI thread and the history thread.

354 {

355 base::AutoLock lock(g_lang_set_lock.Get());

356 if (!GetExemplarSetForLang(lang, &lang_set)) {

357 UErrorCode status = U_ZERO_ERROR;

358 ULocaleData* uld = ulocdata_open(lang.c_str(), &status);

359 // TODO(jungshik) Turn this check on when the ICU data file is

360 // rebuilt with the minimal subset of locale data for languages

361 // to which Chrome is not localized but which we offer in the list

362 // of languages selectable for Accept-Languages. With the rebuilt ICU

363 // data, ulocdata_open never should fall back to the default locale.

364 // (issue 2078)

365 // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING);

366 if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) {

367 lang_set = reinterpret_cast<icu::UnicodeSet*>(ulocdata_getExemplarSet(

368 uld, nullptr, 0, ULOCDATA_ES_STANDARD, &status));

369 // On success, if \|lang\| is compatible with ASCII Latin letters, add

370 // them.

371 if (lang_set && IsCompatibleWithASCIILetters(lang))

372 lang_set->addAll(kASCIILetters);

373 }

374

375 if (!lang_set)

376 lang_set = new icu::UnicodeSet(1, 0);

377

378 lang_set->freeze();

379 SetExemplarSetForLang(lang, lang_set);

380 ulocdata_close(uld);

381 }

382 }

383 return !lang_set->isEmpty() && lang_set->containsAll(component_characters);

384 }	412 }

385	413

386 // Returns true if the given Unicode host component is safe to display to the	414 // Returns true if the given Unicode host component is safe to display to the

387 // user.	415 // user.

388 bool IsIDNComponentSafe(const base::char16* str,	416 bool IsIDNComponentSafe(base::StringPiece16 label) {

389 int str_len,	417 return g_idn_spoof_checker.Get().Check(label);

390 const std::string& languages) {

391 // Most common cases (non-IDN) do not reach here so that we don't

392 // need a fast return path.

393 // TODO(jungshik) : Check if there's any character inappropriate

394 // (although allowed) for domain names.

395 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and

396 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt

397 // For now, we borrow the list from Mozilla and tweaked it slightly.

398 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because

399 // they're gonna be canonicalized to U+0020 and full stop before

400 // reaching here.)

401 // The original list is available at

402 // http://kb.mozillazine.org/Network.IDN.blacklist_chars and

403 // at

404 // http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703

405

406 UErrorCode status = U_ZERO_ERROR;

407 #ifdef U_WCHAR_IS_UTF16

408 icu::UnicodeSet dangerous_characters(

409 icu::UnicodeString(

410 L"[[\\ \u00ad\u00bc\u00bd\u01c3\u0337\u0338"

411 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"

412 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"

413 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"

414 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"

415 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"

416 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"

417 L"[\ufffa-\ufffd]\U0001f50f\U0001f510\U0001f512\U0001f513]"),

418 status);

419 DCHECK(U_SUCCESS(status));

420 icu::RegexMatcher dangerous_patterns(

421 icu::UnicodeString(

422 // Lone katakana no, so, or n

423 L"[^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]"

424 // Repeating Japanese accent characters

425 L"\|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c]"),

426 0, status);

427 #else

428 icu::UnicodeSet dangerous_characters(

429 icu::UnicodeString(

430 "[[\\u0020\\u00ad\\u00bc\\u00bd\\u01c3\\u0337\\u0338"

431 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"

432 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"

433 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"

434 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"

435 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe"

436 "14"

437 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\uff"

438 "f9]"

439 "[\\ufffa-\\ufffd]\\U0001f50f\\U0001f510\\U0001f512\\U0001f513]",

440 -1, US_INV),

441 status);

442 DCHECK(U_SUCCESS(status));

443 icu::RegexMatcher dangerous_patterns(

444 icu::UnicodeString(

445 // Lone katakana no, so, or n

446 "[^\\p{Katakana}][\\u30ce\\u30f3\\u30bd][^\\p{Katakana}]"

447 // Repeating Japanese accent characters

448 "\|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c]"),

449 0, status);

450 #endif

451 DCHECK(U_SUCCESS(status));

452 icu::UnicodeSet component_characters;

453 icu::UnicodeString component_string(str, str_len);

454 component_characters.addAll(component_string);

455 if (dangerous_characters.containsSome(component_characters))

456 return false;

457

458 DCHECK(U_SUCCESS(status));

459 dangerous_patterns.reset(component_string);

460 if (dangerous_patterns.find())

461 return false;

462

463 // If the language list is empty, the result is completely determined

464 // by whether a component is a single script or not. This will block

465 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are

466 // allowed with \|languages\| (while it blocks Chinese + Latin letters with

467 // an accent as should be the case), but we want to err on the safe side

468 // when \|languages\| is empty.

469 if (languages.empty())

470 return IsIDNComponentInSingleScript(str, str_len);

471

472 // \|common_characters\| is made up of ASCII numbers, hyphen, plus and

473 // underscore that are used across scripts and allowed in domain names.

474 // (sync'd with characters allowed in url_canon_host with square

475 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.

476 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),

477 status);

478 DCHECK(U_SUCCESS(status));

479 // Subtract common characters because they're always allowed so that

480 // we just have to check if a language-specific set contains

481 // the remainder.

482 component_characters.removeAll(common_characters);

483

484 base::StringTokenizer t(languages, ",");

485 while (t.GetNext()) {

486 if (IsComponentCoveredByLang(component_characters, t.token()))

487 return true;

488 }

489 return false;

490 }	418 }

491	419

492 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to	420 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to

493 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().	421 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().

494 //	422 //

495 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with	423 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with

496 // the backward compatibility in mind. What it does:	424 // the backward compatibility in mind. What it does:

497 //	425 //

498 // 1. Use the up-to-date Unicode data.	426 // 1. Use the up-to-date Unicode data.

499 // 2. Define a case folding/mapping with the up-to-date Unicode data as	427 // 2. Define a case folding/mapping with the up-to-date Unicode data as

(...skipping 15 matching lines...) Expand all Loading...
515 // TODO(jungshik): Change options as different parties (browsers,	443 // TODO(jungshik): Change options as different parties (browsers,

516 // registrars, search engines) converge toward a consensus.	444 // registrars, search engines) converge toward a consensus.

517 value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);	445 value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);

518 if (U_FAILURE(err))	446 if (U_FAILURE(err))

519 value = NULL;	447 value = NULL;

520 }	448 }

521	449

522 UIDNA* value;	450 UIDNA* value;

523 };	451 };

524	452

525 static base::LazyInstance<UIDNAWrapper>::Leaky g_uidna =	453 base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;

526 LAZY_INSTANCE_INITIALIZER;

527	454

528 // Converts one component of a host (between dots) to IDN if safe. The result	455 // Converts one component (label) of a host (between dots) to Unicode if safe.

529 // will be APPENDED to the given output string and will be the same as the input	456 // The result will be APPENDED to the given output string and will be the

530 // if it is not IDN or the IDN is unsafe to display. Returns whether any	457 // same as the input if it is not Punycode or the IDN is unsafe to display.

531 // conversion was performed.	458 // Returns whether any conversion was performed.

532 bool IDNToUnicodeOneComponent(const base::char16* comp,	459 bool IDNToUnicodeOneComponent(const base::char16* comp,

533 size_t comp_len,	460 size_t comp_len,

534 const std::string& languages,

535 base::string16* out) {	461 base::string16* out) {

536 DCHECK(out);	462 DCHECK(out);

537 if (comp_len == 0)	463 if (comp_len == 0)

538 return false;	464 return false;

539	465

540 // Only transform if the input can be an IDN component.	466 // Only transform if the input can be an IDN component.

541 static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};	467 static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};

542 if ((comp_len > arraysize(kIdnPrefix)) &&	468 if ((comp_len > arraysize(kIdnPrefix)) &&

543 !memcmp(comp, kIdnPrefix, sizeof(kIdnPrefix))) {	469 !memcmp(comp, kIdnPrefix, sizeof(kIdnPrefix))) {

544 UIDNA* uidna = g_uidna.Get().value;	470 UIDNA* uidna = g_uidna.Get().value;

545 DCHECK(uidna != NULL);	471 DCHECK(uidna != NULL);

546 size_t original_length = out->length();	472 size_t original_length = out->length();

547 int output_length = 64;	473 int32_t output_length = 64;

548 UIDNAInfo info = UIDNA_INFO_INITIALIZER;	474 UIDNAInfo info = UIDNA_INFO_INITIALIZER;

549 UErrorCode status;	475 UErrorCode status;

550 do {	476 do {

551 out->resize(original_length + output_length);	477 out->resize(original_length + output_length);

552 status = U_ZERO_ERROR;	478 status = U_ZERO_ERROR;

553 // This returns the actual length required. If this is more than 64	479 // This returns the actual length required. If this is more than 64

554 // code units, \|status\| will be U_BUFFER_OVERFLOW_ERROR and we'll try	480 // code units, \|status\| will be U_BUFFER_OVERFLOW_ERROR and we'll try

555 // the conversion again, but with a sufficiently large buffer.	481 // the conversion again, but with a sufficiently large buffer.

556 output_length = uidna_labelToUnicode(	482 output_length = uidna_labelToUnicode(

557 uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length],	483 uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length],

558 output_length, &info, &status);	484 output_length, &info, &status);

559 } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0));	485 } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0));

560	486

561 if (U_SUCCESS(status) && info.errors == 0) {	487 if (U_SUCCESS(status) && info.errors == 0) {

562 // Converted successfully. Ensure that the converted component	488 // Converted successfully. Ensure that the converted component

563 // can be safely displayed to the user.	489 // can be safely displayed to the user.

564 out->resize(original_length + output_length);	490 out->resize(original_length + output_length);

565 if (IsIDNComponentSafe(out->data() + original_length, output_length,	491 if (IsIDNComponentSafe(

566 languages))	492 base::StringPiece16(out->data() + original_length,

	493 base::checked_cast<size_t>(output_length))))

567 return true;	494 return true;

568 }	495 }

569	496

570 // Something went wrong. Revert to original string.	497 // Something went wrong. Revert to original string.

571 out->resize(original_length);	498 out->resize(original_length);

572 }	499 }

573	500

574 // We get here with no IDN or on error, in which case we just append the	501 // We get here with no IDN or on error, in which case we just append the

575 // literal input.	502 // literal input.

576 out->append(comp, comp_len);	503 out->append(comp, comp_len);

(...skipping 14 matching lines...) Expand all Loading...
591 const std::string& languages,	518 const std::string& languages,

592 FormatUrlTypes format_types,	519 FormatUrlTypes format_types,

593 net::UnescapeRule::Type unescape_rules,	520 net::UnescapeRule::Type unescape_rules,

594 url::Parsed* new_parsed,	521 url::Parsed* new_parsed,

595 size_t* prefix_end,	522 size_t* prefix_end,

596 size_t* offset_for_adjustment) {	523 size_t* offset_for_adjustment) {

597 std::vector<size_t> offsets;	524 std::vector<size_t> offsets;

598 if (offset_for_adjustment)	525 if (offset_for_adjustment)

599 offsets.push_back(*offset_for_adjustment);	526 offsets.push_back(*offset_for_adjustment);

600 base::string16 result =	527 base::string16 result =

601 FormatUrlWithOffsets(url, languages, format_types, unescape_rules,	528 FormatUrlWithOffsets(url, std::string(), format_types, unescape_rules,

602 new_parsed, prefix_end, &offsets);	529 new_parsed, prefix_end, &offsets);

603 if (offset_for_adjustment)	530 if (offset_for_adjustment)

604 *offset_for_adjustment = offsets[0];	531 *offset_for_adjustment = offsets[0];

605 return result;	532 return result;

606 }	533 }

607	534

608 base::string16 FormatUrlWithOffsets(	535 base::string16 FormatUrlWithOffsets(

609 const GURL& url,	536 const GURL& url,

610 const std::string& languages,	537 const std::string& languages,

611 FormatUrlTypes format_types,	538 FormatUrlTypes format_types,

612 net::UnescapeRule::Type unescape_rules,	539 net::UnescapeRule::Type unescape_rules,

613 url::Parsed* new_parsed,	540 url::Parsed* new_parsed,

614 size_t* prefix_end,	541 size_t* prefix_end,

615 std::vector<size_t>* offsets_for_adjustment) {	542 std::vector<size_t>* offsets_for_adjustment) {

616 base::OffsetAdjuster::Adjustments adjustments;	543 base::OffsetAdjuster::Adjustments adjustments;

617 const base::string16& format_url_return_value =	544 const base::string16& format_url_return_value =

618 FormatUrlWithAdjustments(url, languages, format_types, unescape_rules,	545 FormatUrlWithAdjustments(url, std::string(), format_types, unescape_rules,

619 new_parsed, prefix_end, &adjustments);	546 new_parsed, prefix_end, &adjustments);

620 base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);	547 base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);

621 if (offsets_for_adjustment) {	548 if (offsets_for_adjustment) {

622 std::for_each(	549 std::for_each(

623 offsets_for_adjustment->begin(), offsets_for_adjustment->end(),	550 offsets_for_adjustment->begin(), offsets_for_adjustment->end(),

624 base::LimitOffset<std::string>(format_url_return_value.length()));	551 base::LimitOffset<std::string>(format_url_return_value.length()));

625 }	552 }

626 return format_url_return_value;	553 return format_url_return_value;

627 }	554 }

628	555

(...skipping 14 matching lines...) Expand all Loading...
643 *new_parsed = url::Parsed();	570 *new_parsed = url::Parsed();

644	571

645 // Special handling for view-source:. Don't use content::kViewSourceScheme	572 // Special handling for view-source:. Don't use content::kViewSourceScheme

646 // because this library shouldn't depend on chrome.	573 // because this library shouldn't depend on chrome.

647 const char kViewSource[] = "view-source";	574 const char kViewSource[] = "view-source";

648 // Reject "view-source:view-source:..." to avoid deep recursion.	575 // Reject "view-source:view-source:..." to avoid deep recursion.

649 const char kViewSourceTwice[] = "view-source:view-source:";	576 const char kViewSourceTwice[] = "view-source:view-source:";

650 if (url.SchemeIs(kViewSource) &&	577 if (url.SchemeIs(kViewSource) &&

651 !base::StartsWith(url.possibly_invalid_spec(), kViewSourceTwice,	578 !base::StartsWith(url.possibly_invalid_spec(), kViewSourceTwice,

652 base::CompareCase::INSENSITIVE_ASCII)) {	579 base::CompareCase::INSENSITIVE_ASCII)) {

653 return FormatViewSourceUrl(url, languages, format_types, unescape_rules,	580 return FormatViewSourceUrl(url, format_types, unescape_rules,

654 new_parsed, prefix_end, adjustments);	581 new_parsed, prefix_end, adjustments);

655 }	582 }

656	583

657 // We handle both valid and invalid URLs (this will give us the spec	584 // We handle both valid and invalid URLs (this will give us the spec

658 // regardless of validity).	585 // regardless of validity).

659 const std::string& spec = url.possibly_invalid_spec();	586 const std::string& spec = url.possibly_invalid_spec();

660 const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();	587 const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();

661	588

662 // Scheme & separators. These are ASCII.	589 // Scheme & separators. These are ASCII.

663 base::string16 url_string;	590 base::string16 url_string;

(...skipping 49 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
713 AppendFormattedComponent(spec, parsed.password,	640 AppendFormattedComponent(spec, parsed.password,

714 NonHostComponentTransform(unescape_rules),	641 NonHostComponentTransform(unescape_rules),

715 &url_string, &new_parsed->password, adjustments);	642 &url_string, &new_parsed->password, adjustments);

716 if (parsed.username.is_valid() \|\| parsed.password.is_valid())	643 if (parsed.username.is_valid() \|\| parsed.password.is_valid())

717 url_string.push_back('@');	644 url_string.push_back('@');

718 }	645 }

719 if (prefix_end)	646 if (prefix_end)

720 *prefix_end = static_cast<size_t>(url_string.length());	647 *prefix_end = static_cast<size_t>(url_string.length());

721	648

722 // Host.	649 // Host.

723 AppendFormattedComponent(spec, parsed.host, HostComponentTransform(languages),	650 AppendFormattedComponent(spec, parsed.host, HostComponentTransform(),

724 &url_string, &new_parsed->host, adjustments);	651 &url_string, &new_parsed->host, adjustments);

725	652

726 // Port.	653 // Port.

727 if (parsed.port.is_nonempty()) {	654 if (parsed.port.is_nonempty()) {

728 url_string.push_back(':');	655 url_string.push_back(':');

729 new_parsed->port.begin = url_string.length();	656 new_parsed->port.begin = url_string.length();

730 url_string.insert(url_string.end(), spec.begin() + parsed.port.begin,	657 url_string.insert(url_string.end(), spec.begin() + parsed.port.begin,

731 spec.begin() + parsed.port.end());	658 spec.begin() + parsed.port.end());

732 new_parsed->port.len = url_string.length() - new_parsed->port.begin;	659 new_parsed->port.len = url_string.length() - new_parsed->port.begin;

733 } else {	660 } else {

(...skipping 55 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
789 // the hostname.	716 // the hostname.

790 return url.IsStandard() && !url.SchemeIsFile() && !url.SchemeIsFileSystem() &&	717 return url.IsStandard() && !url.SchemeIsFile() && !url.SchemeIsFileSystem() &&

791 !url.has_query() && !url.has_ref() && url.path() == "/";	718 !url.has_query() && !url.has_ref() && url.path() == "/";

792 }	719 }

793	720

794 void AppendFormattedHost(const GURL& url,	721 void AppendFormattedHost(const GURL& url,

795 const std::string& languages,	722 const std::string& languages,

796 base::string16* output) {	723 base::string16* output) {

797 AppendFormattedComponent(	724 AppendFormattedComponent(

798 url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host,	725 url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host,

799 HostComponentTransform(languages), output, NULL, NULL);	726 HostComponentTransform(), output, NULL, NULL);

800 }	727 }

801	728

802 base::string16 IDNToUnicode(const std::string& host,	729 base::string16 IDNToUnicode(const std::string& host,

803 const std::string& languages) {	730 const std::string& languages) {

804 return IDNToUnicodeWithAdjustments(host, languages, NULL);	731 return IDNToUnicodeWithAdjustments(host, NULL);

805 }	732 }

806	733

807 } // url_formatter	734 } // url_formatter

OLD	NEW

« no previous file with comments | « components/url_formatter/url_formatter.h ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »