components/url_formatter/url_formatter.cc - Issue 1258813002: Implement a new IDN display policy

Side by Side Diff: components/url_formatter/url_formatter.cc

Issue 1258813002: Implement a new IDN display policy (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: drop U+2027 and add tests for U+2027 and U+05F4 Created 4 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« components/omnibox/browser/history_url_provider_unittest.cc ('K') | « components/url_formatter/url_formatter.h ('k') | components/url_formatter/url_formatter_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved.	1 // Copyright 2015 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/url_formatter/url_formatter.h"	5 #include "components/url_formatter/url_formatter.h"

6	6

7 #include <algorithm>	7 #include <algorithm>

8 #include <map>

9 #include <utility>	8 #include <utility>

10	9

11 #include "base/lazy_instance.h"	10 #include "base/lazy_instance.h"

12 #include "base/logging.h"	11 #include "base/logging.h"

13 #include "base/macros.h"	12 #include "base/macros.h"

14 #include "base/memory/singleton.h"	13 #include "base/memory/scoped_ptr.h"

15 #include "base/stl_util.h"	14 #include "base/numerics/safe_conversions.h"

16 #include "base/strings/string_tokenizer.h"	15 #include "base/strings/string_piece.h"

17 #include "base/strings/string_util.h"	16 #include "base/strings/string_util.h"

18 #include "base/strings/utf_offset_string_conversions.h"	17 #include "base/strings/utf_offset_string_conversions.h"

19 #include "base/strings/utf_string_conversions.h"	18 #include "base/strings/utf_string_conversions.h"

20 #include "base/synchronization/lock.h"	19 #include "base/synchronization/lock.h"

21 #include "third_party/icu/source/common/unicode/uidna.h"	20 #include "third_party/icu/source/common/unicode/uidna.h"

22 #include "third_party/icu/source/common/unicode/uniset.h"	21 #include "third_party/icu/source/common/unicode/uniset.h"

23 #include "third_party/icu/source/common/unicode/uscript.h"	22 #include "third_party/icu/source/common/unicode/uscript.h"

24 #include "third_party/icu/source/i18n/unicode/regex.h"	23 #include "third_party/icu/source/i18n/unicode/regex.h"

25 #include "third_party/icu/source/i18n/unicode/ulocdata.h"	24 #include "third_party/icu/source/i18n/unicode/uspoof.h"

26 #include "url/gurl.h"	25 #include "url/gurl.h"

27 #include "url/third_party/mozilla/url_parse.h"	26 #include "url/third_party/mozilla/url_parse.h"

28	27

29 namespace url_formatter {	28 namespace url_formatter {

30	29

31 namespace {	30 namespace {

32	31

33 base::string16 IDNToUnicodeWithAdjustments(	32 base::string16 IDNToUnicodeWithAdjustments(

34 const std::string& host,	33 const std::string& host,

35 const std::string& languages,

36 base::OffsetAdjuster::Adjustments* adjustments);	34 base::OffsetAdjuster::Adjustments* adjustments);

37 bool IDNToUnicodeOneComponent(const base::char16* comp,	35 bool IDNToUnicodeOneComponent(const base::char16* comp,

38 size_t comp_len,	36 size_t comp_len,

39 const std::string& languages,

40 base::string16* out);	37 base::string16* out);

41	38

42 class AppendComponentTransform {	39 class AppendComponentTransform {

43 public:	40 public:

44 AppendComponentTransform() {}	41 AppendComponentTransform() {}

45 virtual ~AppendComponentTransform() {}	42 virtual ~AppendComponentTransform() {}

46	43

47 virtual base::string16 Execute(	44 virtual base::string16 Execute(

48 const std::string& component_text,	45 const std::string& component_text,

49 base::OffsetAdjuster::Adjustments* adjustments) const = 0;	46 base::OffsetAdjuster::Adjustments* adjustments) const = 0;

50	47

51 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an	48 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an

52 // accessible copy constructor in order to call AppendFormattedComponent()	49 // accessible copy constructor in order to call AppendFormattedComponent()

53 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).	50 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).

54 };	51 };

55	52

56 class HostComponentTransform : public AppendComponentTransform {	53 class HostComponentTransform : public AppendComponentTransform {

57 public:	54 public:

58 explicit HostComponentTransform(const std::string& languages)	55 HostComponentTransform() {}

59 : languages_(languages) {}

60	56

61 private:	57 private:

62 base::string16 Execute(	58 base::string16 Execute(

63 const std::string& component_text,	59 const std::string& component_text,

64 base::OffsetAdjuster::Adjustments* adjustments) const override {	60 base::OffsetAdjuster::Adjustments* adjustments) const override {

65 return IDNToUnicodeWithAdjustments(component_text, languages_, adjustments);	61 return IDNToUnicodeWithAdjustments(component_text, adjustments);

66 }	62 }

67

68 const std::string& languages_;

69 };	63 };

70	64

71 class NonHostComponentTransform : public AppendComponentTransform {	65 class NonHostComponentTransform : public AppendComponentTransform {

72 public:	66 public:

73 explicit NonHostComponentTransform(net::UnescapeRule::Type unescape_rules)	67 explicit NonHostComponentTransform(net::UnescapeRule::Type unescape_rules)

74 : unescape_rules_(unescape_rules) {}	68 : unescape_rules_(unescape_rules) {}

75	69

76 private:	70 private:

77 base::string16 Execute(	71 base::string16 Execute(

78 const std::string& component_text,	72 const std::string& component_text,

(...skipping 71 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
150 AdjustComponent(delta, &(parsed->host));	144 AdjustComponent(delta, &(parsed->host));

151 AdjustComponent(delta, &(parsed->port));	145 AdjustComponent(delta, &(parsed->port));

152 AdjustComponent(delta, &(parsed->path));	146 AdjustComponent(delta, &(parsed->path));

153 AdjustComponent(delta, &(parsed->query));	147 AdjustComponent(delta, &(parsed->query));

154 AdjustComponent(delta, &(parsed->ref));	148 AdjustComponent(delta, &(parsed->ref));

155 }	149 }

156	150

157 // Helper for FormatUrlWithOffsets().	151 // Helper for FormatUrlWithOffsets().

158 base::string16 FormatViewSourceUrl(	152 base::string16 FormatViewSourceUrl(

159 const GURL& url,	153 const GURL& url,

160 const std::string& languages,

161 FormatUrlTypes format_types,	154 FormatUrlTypes format_types,

162 net::UnescapeRule::Type unescape_rules,	155 net::UnescapeRule::Type unescape_rules,

163 url::Parsed* new_parsed,	156 url::Parsed* new_parsed,

164 size_t* prefix_end,	157 size_t* prefix_end,

165 base::OffsetAdjuster::Adjustments* adjustments) {	158 base::OffsetAdjuster::Adjustments* adjustments) {

166 DCHECK(new_parsed);	159 DCHECK(new_parsed);

167 const char kViewSource[] = "view-source:";	160 const char kViewSource[] = "view-source:";

168 const size_t kViewSourceLength = arraysize(kViewSource) - 1;	161 const size_t kViewSourceLength = arraysize(kViewSource) - 1;

169	162

170 // Format the underlying URL and record adjustments.	163 // Format the underlying URL and record adjustments.

171 const std::string& url_str(url.possibly_invalid_spec());	164 const std::string& url_str(url.possibly_invalid_spec());

172 adjustments->clear();	165 adjustments->clear();

173 base::string16 result(	166 base::string16 result(

174 base::ASCIIToUTF16(kViewSource) +	167 base::ASCIIToUTF16(kViewSource) +

175 FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)),	168 FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)),

176 languages, format_types, unescape_rules,	169 std::string(), format_types, unescape_rules,

177 new_parsed, prefix_end, adjustments));	170 new_parsed, prefix_end, adjustments));

178 // Revise \|adjustments\| by shifting to the offsets to prefix that the above	171 // Revise \|adjustments\| by shifting to the offsets to prefix that the above

179 // call to FormatUrl didn't get to see.	172 // call to FormatUrl didn't get to see.

180 for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin();	173 for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin();

181 it != adjustments->end(); ++it)	174 it != adjustments->end(); ++it)

182 it->original_offset += kViewSourceLength;	175 it->original_offset += kViewSourceLength;

183	176

184 // Adjust positions of the parsed components.	177 // Adjust positions of the parsed components.

185 if (new_parsed->scheme.is_nonempty()) {	178 if (new_parsed->scheme.is_nonempty()) {

186 // Assume "view-source:real-scheme" as a scheme.	179 // Assume "view-source:real-scheme" as a scheme.

187 new_parsed->scheme.len += kViewSourceLength;	180 new_parsed->scheme.len += kViewSourceLength;

188 } else {	181 } else {

189 new_parsed->scheme.begin = 0;	182 new_parsed->scheme.begin = 0;

190 new_parsed->scheme.len = kViewSourceLength - 1;	183 new_parsed->scheme.len = kViewSourceLength - 1;

191 }	184 }

192 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);	185 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);

193	186

194 if (prefix_end)	187 if (prefix_end)

195 *prefix_end += kViewSourceLength;	188 *prefix_end += kViewSourceLength;

196	189

197 return result;	190 return result;

198 }	191 }

199	192

200 // TODO(brettw) bug 734373: check the scripts for each host component and	193 // TODO(brettw): We may want to skip this step in the case of file URLs to

201 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for	194 // allow unicode UNC hostnames regardless of encodings.

202 // scripts that the user has installed. For now, just put the entire

203 // path through IDN. Maybe this feature can be implemented in ICU itself?

204 //

205 // We may want to skip this step in the case of file URLs to allow unicode

206 // UNC hostnames regardless of encodings.

207 base::string16 IDNToUnicodeWithAdjustments(	195 base::string16 IDNToUnicodeWithAdjustments(

208 const std::string& host,	196 const std::string& host,

209 const std::string& languages,

210 base::OffsetAdjuster::Adjustments* adjustments) {	197 base::OffsetAdjuster::Adjustments* adjustments) {

211 if (adjustments)	198 if (adjustments)

212 adjustments->clear();	199 adjustments->clear();

213 // Convert the ASCII input to a base::string16 for ICU.	200 // Convert the ASCII input to a base::string16 for ICU.

214 base::string16 input16;	201 base::string16 input16;

215 input16.reserve(host.length());	202 input16.reserve(host.length());

216 input16.insert(input16.end(), host.begin(), host.end());	203 input16.insert(input16.end(), host.begin(), host.end());

217	204

218 // Do each component of the host separately, since we enforce script matching	205 // Do each component of the host separately, since we enforce script matching

219 // on a per-component basis.	206 // on a per-component basis.

220 base::string16 out16;	207 base::string16 out16;

221 for (size_t component_start = 0, component_end;	208 for (size_t component_start = 0, component_end;

222 component_start < input16.length();	209 component_start < input16.length();

223 component_start = component_end + 1) {	210 component_start = component_end + 1) {

224 // Find the end of the component.	211 // Find the end of the component.

225 component_end = input16.find('.', component_start);	212 component_end = input16.find('.', component_start);

226 if (component_end == base::string16::npos)	213 if (component_end == base::string16::npos)

227 component_end = input16.length(); // For getting the last component.	214 component_end = input16.length(); // For getting the last component.

228 size_t component_length = component_end - component_start;	215 size_t component_length = component_end - component_start;

229 size_t new_component_start = out16.length();	216 size_t new_component_start = out16.length();

230 bool converted_idn = false;	217 bool converted_idn = false;

231 if (component_end > component_start) {	218 if (component_end > component_start) {

232 // Add the substring that we just found.	219 // Add the substring that we just found.

233 converted_idn =	220 converted_idn =

234 IDNToUnicodeOneComponent(input16.data() + component_start,	221 IDNToUnicodeOneComponent(input16.data() + component_start,

235 component_length, languages, &out16);	222 component_length, &out16);

236 }	223 }

237 size_t new_component_length = out16.length() - new_component_start;	224 size_t new_component_length = out16.length() - new_component_start;

238	225

239 if (converted_idn && adjustments) {	226 if (converted_idn && adjustments) {

240 adjustments->push_back(base::OffsetAdjuster::Adjustment(	227 adjustments->push_back(base::OffsetAdjuster::Adjustment(

241 component_start, component_length, new_component_length));	228 component_start, component_length, new_component_length));

242 }	229 }

243	230

244 // Need to add the dot we just found (if we found one).	231 // Need to add the dot we just found (if we found one).

245 if (component_end < input16.length())	232 if (component_end < input16.length())

246 out16.push_back('.');	233 out16.push_back('.');

247 }	234 }

248 return out16;	235 return out16;

249 }	236 }

250	237

251 // Does some simple normalization of scripts so we can allow certain scripts	238 // A helper class for IDN Spoof checking, used to ensure that no IDN input is

252 // to exist together.	239 // spoofable per Chromium's standard of spoofability. For a more thorough

253 // TODO(brettw) bug 880223: we should allow some other languages to be	240 // explanation of how spoof checking works in Chromium, see

254 // oombined such as Chinese and Latin. We will probably need a more	241 // http://dev.chromium.org/developers/design-documents/idn-in-google-chrome .

255 // complicated system of language pairs to have more fine-grained control.	242 class IDNSpoofChecker {

256 UScriptCode NormalizeScript(UScriptCode code) {	243 public:

257 switch (code) {	244 IDNSpoofChecker();

258 case USCRIPT_KATAKANA:	245

259 case USCRIPT_HIRAGANA:	246 // Returns true if \|label\| is safe to display as Unicode. In the event of

260 case USCRIPT_KATAKANA_OR_HIRAGANA:	247 // library failure, all IDN inputs will be treated as unsafe.

261 case USCRIPT_HANGUL: // This one is arguable.	248 bool Check(base::StringPiece16 label);

262 return USCRIPT_HAN;	249

263 default:	250 private:

264 return code;	251 void SetAllowedUnicodeSet(UErrorCode* status);

	252

	253 USpoofChecker* checker_;

	254 icu::UnicodeSet deviation_characters_;

	255 icu::UnicodeSet latin_letters_;

	256 icu::UnicodeSet non_ascii_latin_letters_;

	257

	258 DISALLOW_COPY_AND_ASSIGN(IDNSpoofChecker);

	259 };

	260

	261 base::LazyInstance<IDNSpoofChecker>::Leaky g_idn_spoof_checker =

	262 LAZY_INSTANCE_INITIALIZER;

	263 base::LazyInstance<base::Lock>::Leaky g_dangerous_pattern_lock =

	264 LAZY_INSTANCE_INITIALIZER;
	Ryan Sleevi 2016/03/16 20:30:49 DESIGN: Do we run the risk of lock contention here DESIGN: Do we run the risk of lock contention here? If so, we could always consider moving g_dangerous_pattern (which I understand can only check one pattern at a time) into a thread-local storage bucket. That makes a tradeoff of memory vs CPU, but it seems like it would amortize as a win, since there will only be a few threads that invoke this routine (ideally, only the IO thread, but ChromeCast & chromoting et all make that weird) jungshik at Google 2016/03/17 07:43:26 Agreed and switched to TLS. Show quoted text On 2016/03/16 20:30:49, Ryan Sleevi wrote: > DESIGN: Do we run the risk of lock contention here? If so, we could always > consider moving g_dangerous_pattern (which I understand can only check one > pattern at a time) into a thread-local storage bucket. That makes a tradeoff of > memory vs CPU, but it seems like it would amortize as a win, since there will > only be a few threads that invoke this routine (ideally, only the IO thread, but > ChromeCast & chromoting et all make that weird) Agreed and switched to TLS.
	265 icu::RegexMatcher* g_dangerous_pattern = nullptr;

	266

	267 IDNSpoofChecker::IDNSpoofChecker() {

	268 UErrorCode status = U_ZERO_ERROR;

	269 checker_ = uspoof_open(&status);

	270 if (U_FAILURE(status)) {

	271 checker_ = nullptr;

	272 return;

265 }	273 }

266 }	274

267	275 // At this point, USpoofChecker has all the checks enabled except

268 bool IsIDNComponentInSingleScript(const base::char16* str, int str_len) {	276 // for USPOOF_CHAR_LIMIT (USPOOF_{RESTRICTION_LEVEL, INVISIBLE,

269 UScriptCode first_script = USCRIPT_INVALID_CODE;	277 // MIXED_SCRIPT_CONFUSABLE, WHOLE_SCRIPT_CONFUSABLE, MIXED_NUMBERS, ANY_CASE})

270 bool is_first = true;	278 // This default configuration is adjusted below as necessary.

271	279

272 int i = 0;	280 // Set the restriction level to moderate. It allows mixing Latin with another

273 while (i < str_len) {	281 // script (+ COMMON and INHERITED). Except for Chinese(Han + Bopomofo),

274 unsigned code_point;	282 // Japanese(Hiragana + Katakana + Han), and Korean(Hangul + Han), only one

275 U16_NEXT(str, i, str_len, code_point);	283 // script other than Common and Inherited can be mixed with Latin. Cyrillic

276	284 // and Greek are not allowed to mix with Latin.

277 UErrorCode err = U_ZERO_ERROR;	285 // See http://www.unicode.org/reports/tr39/#Restriction_Level_Detection

278 UScriptCode cur_script = uscript_getScript(code_point, &err);	286 uspoof_setRestrictionLevel(checker_, USPOOF_MODERATELY_RESTRICTIVE);

279 if (err != U_ZERO_ERROR)	287

280 return false; // Report mixed on error.	288 // Restrict allowed characters in IDN labels and turn on USPOOF_CHAR_LIMIT.

281 cur_script = NormalizeScript(cur_script);	289 SetAllowedUnicodeSet(&status);

282	290

283 // TODO(brettw) We may have to check for USCRIPT_INHERENT as well.	291 // Enable the return of auxiliary (non-error) information.

284 if (is_first && cur_script != USCRIPT_COMMON) {	292 int32_t checks = uspoof_getChecks(checker_, &status) \| USPOOF_AUX_INFO;

285 first_script = cur_script;	293

286 is_first = false;	294 // Disable WHOLE_SCRIPT_CONFUSABLE check. The check has a marginal value when

287 } else {	295 // used against a single string as opposed to comparing a pair of strings. In

288 if (cur_script != USCRIPT_COMMON && cur_script != first_script)	296 // addition, it would also flag a number of common labels including the IDN

289 return false;	297 // TLD for Russian.

290 }	298 // A possible alternative would be to turn on the check and block a label

	299 // only under the following conditions, but it'd better be done on the

	300 // server-side (e.g. SafeBrowsing):

	301 // 1. The label is whole-script confusable.

	302 // 2. And the skeleton of the label matches the skeleton of one of top

	303 // domain labels. See http://unicode.org/reports/tr39/#Confusable_Detection

	304 // for the definition of skeleton.

	305 // 3. And the label is different from the matched top domain label in #2.

	306 checks &= ~USPOOF_WHOLE_SCRIPT_CONFUSABLE;

	307

	308 uspoof_setChecks(checker_, checks, &status);

	309

	310 // Four characters handled differently by IDNA 2003 and IDNA 2008. UTS46

	311 // transitional processing treats them as IDNA 2003 does; maps U+00DF and
	Ryan Sleevi 2016/03/16 20:30:49 s/treat/treats/, since transitional processing is s/treat/treats/, since transitional processing is singular jungshik at Google 2016/03/17 07:43:26 Done. Show quoted text On 2016/03/16 20:30:49, Ryan Sleevi wrote: > s/treat/treats/, since transitional processing is singular Done.
	312 // U+03C2 and drops U+200[CD].

	313 deviation_characters_ =

	314 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[\\u00df\\u03c2\\u200c\\u200d]"),

	315 status);

	316 deviation_characters_.freeze();

	317

	318 latin_letters_ =

	319 icu::UnicodeSet(UNICODE_STRING_SIMPLE("[:Latin:]"), status);

	320 latin_letters_.freeze();

	321

	322 // Latin letters outside ASCII. 'Script_Extensions=Latin' is not necessary

	323 // because additional characters pulled in with scx=Latn are not included in

	324 // the allowed set.

	325 non_ascii_latin_letters_ = icu::UnicodeSet(

	326 UNICODE_STRING_SIMPLE("[[:Latin:] - [a-zA-Z]]"), status);

	327 non_ascii_latin_letters_.freeze();

	328

	329 DCHECK(U_SUCCESS(status));

	330 }

	331

	332 bool IDNSpoofChecker::Check(base::StringPiece16 label) {

	333 UErrorCode status = U_ZERO_ERROR;

	334 int32_t result = uspoof_check(checker_, label.data(),

	335 base::checked_cast<int32_t>(label.size()),

	336 NULL, &status);

	337 // If uspoof_check fails or any of the checks is flagged, treat any IDN as
	Ryan Sleevi 2016/03/16 20:30:49 comment nit: "any of check is flagged" doesn't rea comment nit: "any of check is flagged" doesn't read right. Is this meant to be "if any checks are flagged" ? That might also read weird, so how does // If uspoof_check fails (due to library failure), or if any of the checks fail, treat the IDN as unsafe. Is that the same comment? jungshik at Google 2016/03/17 07:43:26 Done. Show quoted text On 2016/03/16 20:30:49, Ryan Sleevi wrote: > comment nit: "any of check is flagged" doesn't read right. Is this meant to be > "if any checks are flagged" ? That might also read weird, so how does > > // If uspoof_check fails (due to library failure), or if any of the checks fail, > treat the IDN as unsafe. > > Is that the same comment? Done.
	338 // unsafe.

	339 if (U_FAILURE(status) \|\| (result & USPOOF_ALL_CHECKS))

	340 return false;

	341

	342 icu::UnicodeString label_string(FALSE, label.data(),

	343 base::checked_cast<int32_t>(label.size()));

	344

	345 // A punycode label with 'xn--' prefix is not subject to the URL

	346 // canonicalization and is stored as it is in GURL. If it encodes a deviation

	347 // character (UTS 46; e.g. U+00DF/sharp-s), it should be still shown in

	348 // punycode instead of Unicode. Without this check, xn--fu-hia for

	349 // 'fu<sharp-s>' would be shown as 'fu<sharp-s>' while 'fu<sharp-s>' typed
	Ryan Sleevi 2016/03/16 20:30:49 s/shown in/shown as/ ? s/shown in/shown as/ ? Peter Kasting 2016/03/17 06:01:24 Yes, that was a comment I made last time that didn Show quoted text On 2016/03/16 20:30:49, Ryan Sleevi wrote: > s/shown in/shown as/ ? Yes, that was a comment I made last time that didn't actually get addressed. jungshik at Google 2016/03/17 07:43:26 Done. Show quoted text On 2016/03/16 20:30:49, Ryan Sleevi wrote: > s/shown in/shown as/ ? Done.
	350 // or copied and pasted as Unicode would be canonicalized to 'fuss'. This
	Ryan Sleevi 2016/03/16 20:30:48 I have trouble understanding why this is (that is, I have trouble understanding why this is (that is, canonicalized to 'fuss'), and am trying to understand if this is documenting the behaviour of the layers below it (//net and GURL) or the consumption at the layers above it (omnibox). Could you explain a little more via CL comment, and then we can see if any changes to the code comment are possible/worth it? jungshik at Google 2016/03/17 07:43:26 A hostname typed (or copy-n-pasted) by a user or a Show quoted text On 2016/03/16 20:30:48, Ryan Sleevi wrote: > I have trouble understanding why this is (that is, canonicalized to 'fuss'), and > am trying to understand if this is documenting the behaviour of the layers below > it (//net and GURL) or the consumption at the layers above it (omnibox). Could > you explain a little more via CL comment, and then we can see if any changes to > the code comment are possible/worth it? A hostname typed (or copy-n-pasted) by a user or a hostname found in a web page is canonicalized by GURL before being stored for later use. a. A punycode hostname with a deviation character is stored as it is in GURL (because it's in ASCII) b. For display purposes, it is converted to Unicode to 'fu<sharp-s>'. So far it's all right. When the url in the omnibox (with <sharp-s>) is copied, it's canonicalized by GURL (again) to 'ss' before being copied. This confuses the user. See http://crbug.com/595263
	351 // additional check is necessary because "UTS 46 section 4 Processing step 4"

	352 // applies validity criteria for non-transitional processing to any punycode

	353 // labels regardless of whether we choose transitional or non-transitional.

	354 if (deviation_characters_.containsSome(label_string))

	355 return false;

	356

	357 // If there's no script mixing, the input is regarded as safe without any

	358 // extra check.

	359 result &= USPOOF_RESTRICTION_LEVEL_MASK;

	360 if (result == USPOOF_ASCII \|\| result == USPOOF_SINGLE_SCRIPT_RESTRICTIVE)

	361 return true;

	362

	363 // When check is passed at 'highly restrictive' level, \|label\| is

	364 // made up of one of the following script sets optionally mixed with Latin.

	365 // - Chinese: Han, Bopomofo, Common

	366 // - Japanese: Han, Hiragana, Katakana, Common

	367 // - Korean: Hangul, Han, Common

	368 // Treat this case as a 'logical' single script unless Latin is mixed.

	369 if (result == USPOOF_HIGHLY_RESTRICTIVE &&

	370 latin_letters_.containsNone(label_string))

	371 return true;

	372

	373 // Additional checks for \|label\| with multiple scripts, one of which is Latin.

	374 // Disallow non-ASCII Latin letters to mix with a non-Latin script.

	375 if (non_ascii_latin_letters_.containsSome(label_string))

	376 return false;

	377

	378 base::AutoLock lock(g_dangerous_pattern_lock.Get());

	379 if (g_dangerous_pattern == nullptr) {

	380 // Disallow the katakana no, so, zo, or n, as they may be mistaken for

	381 // slashes when they're surrounded by non-Japanese scripts (i.e. scripts

	382 // other than Katakana, Hiragana or Han). If {no, so, zo, n} next to a

	383 // non-Japanese script on either side is disallowed, legitimate cases like

	384 // '{vitamin in Katakana}b6' are blocked. Note that trying to block those

	385 // characters when used alone as a label is futile because those cases

	386 // would not reach here.

	387 g_dangerous_pattern = new icu::RegexMatcher(

	388 icu::UnicodeString(

	389 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]"

	390 "[\\u30ce\\u30f3\\u30bd\\u30be]"

	391 "[^\\p{scx=kana}\\p{scx=hira}\\p{scx=hani}]", -1, US_INV),

	392 0, status);

291 }	393 }

292 return true;	394 g_dangerous_pattern->reset(label_string);

293 }	395 return !g_dangerous_pattern->find();

294	396 }

295 // Check if the script of a language can be 'safely' mixed with	397

296 // Latin letters in the ASCII range.	398 void IDNSpoofChecker::SetAllowedUnicodeSet(UErrorCode* status) {

297 bool IsCompatibleWithASCIILetters(const std::string& lang) {	399 if (U_FAILURE(*status))

298 // For now, just list Chinese, Japanese and Korean (positive list).	400 return;

299 // An alternative is negative-listing (languages using Greek and	401

300 // Cyrillic letters), but it can be more dangerous.	402 // The recommended set is a set of characters for identifiers in a

301 return !lang.substr(0, 2).compare("zh") \|\| !lang.substr(0, 2).compare("ja") \|\|	403 // security-sensitive environment taken from UTR 39

302 !lang.substr(0, 2).compare("ko");	404 // (http://unicode.org/reports/tr39/) and

303 }	405 // http://www.unicode.org/Public/security/latest/xidmodifications.txt .

304	406 // The inclusion set comes from "Candidate Characters for Inclusion

305 typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap;	407 // in identifiers" of UTR 31 (http://www.unicode.org/reports/tr31). The list

306	408 // may change over time and will be updated whenever the version of ICU

307 class LangToExemplarSet {	409 // used in Chromium is updated.

308 public:	410 const icu::UnicodeSet* recommended_set =

309 static LangToExemplarSet* GetInstance() {	411 uspoof_getRecommendedUnicodeSet(status);

310 return base::Singleton<LangToExemplarSet>::get();	412 icu::UnicodeSet allowed_set;

311 }	413 allowed_set.addAll(*recommended_set);

312	414 const icu::UnicodeSet* inclusion_set = uspoof_getInclusionUnicodeSet(status);

313 private:	415 allowed_set.addAll(*inclusion_set);

314 LangToExemplarSetMap map;	416

315 LangToExemplarSet() {}	417 // From UTR 31 Table 6:

316 ~LangToExemplarSet() {	418 // http://www.unicode.org/reports/tr31/#Aspirational_Use_Scripts and
	Ryan Sleevi 2016/03/16 20:30:49 the "and" appears to be an incomplete comment? the "and" appears to be an incomplete comment? jungshik at Google 2016/03/17 07:43:26 Acknowledged. Show quoted text On 2016/03/16 20:30:49, Ryan Sleevi wrote: > the "and" appears to be an incomplete comment? Acknowledged.
317 STLDeleteContainerPairSecondPointers(map.begin(), map.end());	419 // We cannot add all the characters of aspirational scripts because some

318 }	420 // characters are excluded. Instead, use characters listed with Status/Type
	Ryan Sleevi 2016/03/16 20:30:49 comment nit: Because some characters of the aspira comment nit: Because some characters of the aspirational scripts are excluded [from what?], it's not possible to add the aspirational scripts themselves. The goal is trying to avoid the "we", and also trying to get clarification as to what they're "excluded" from. Overall, this comment is "Not all characters in the script are included because some are excluded", but that's missing a few subjects (included in what, excluded from what) Peter Kasting 2016/03/17 06:01:24 (FWIW I don't mind "we" as avoiding it sometimes r Show quoted text On 2016/03/16 20:30:49, Ryan Sleevi wrote: > comment nit: Because some characters of the aspirational scripts are excluded > [from what?], it's not possible to add the aspirational scripts themselves. > > The goal is trying to avoid the "we", and also trying to get clarification as to > what they're "excluded" from. Overall, this comment is "Not all characters in > the script are included because some are excluded", but that's missing a few > subjects (included in what, excluded from what) (FWIW I don't mind "we" as avoiding it sometimes results in passive voice or confused constructions, but I do agree in this case that I don't know what the comment is trying to say, probably because I don't know what it means for a "character to be excluded".) jungshik at Google 2016/03/17 07:43:26 Rewrote the paragraph. Show quoted text On 2016/03/16 20:30:49, Ryan Sleevi wrote: > comment nit: Because some characters of the aspirational scripts are excluded > [from what?], it's not possible to add the aspirational scripts themselves. > > The goal is trying to avoid the "we", and also trying to get clarification as to > what they're "excluded" from. Overall, this comment is "Not all characters in > the script are included because some are excluded", but that's missing a few > subjects (included in what, excluded from what) Rewrote the paragraph. 
jungshik at Google 2016/03/17 07:43:26 Yeah, a lot of passive voice sentences were added Show quoted text On 2016/03/17 06:01:24, Peter Kasting wrote: > On 2016/03/16 20:30:49, Ryan Sleevi wrote: > > comment nit: Because some characters of the aspirational scripts are excluded > > [from what?], it's not possible to add the aspirational scripts themselves. > > > > The goal is trying to avoid the "we", and also trying to get clarification as > to > > what they're "excluded" from. Overall, this comment is "Not all characters in > > the script are included because some are excluded", but that's missing a few > > subjects (included in what, excluded from what) > > (FWIW I don't mind "we" as avoiding it sometimes results in passive voice or > confused constructions, but I do agree in this case that I don't know what the > comment is trying to say, probably because I don't know what it means for a > "character to be excluded".) Yeah, a lot of passive voice sentences were added :-). Anyway, I rewrote the paragraph.
319	421 // = Aspirational at

320 friend class base::Singleton<LangToExemplarSet>;	422 // http://www.unicode.org/Public/security/latest/xidmodifications.txt .

321 friend struct base::DefaultSingletonTraits<LangToExemplarSet>;	423 // The list has to be updated when a new version of Unicode is released. The

322 friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**);	424 // current version is 8.0.0.
	Ryan Sleevi 2016/03/16 20:30:48 Is there a way to guard this with a compile-time flag, such as indicating what version of ICU is being used? In this way, any time an ICU uprev happens, the person performing the uprev re-evaluates this code and updates the compile-time guard as appropriate. jungshik at Google 2016/03/17 07:43:26 I'm adding a compile-time guard based on the ICU major version number.
323 friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*);	425 const icu::UnicodeSet aspirational_scripts(

324	426 icu::UnicodeString(

325 DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet);	427 // Unified Canadian Syllabics

326 };	428 "[\\u1401-\\u166C\\u166F-\\u167F"

327	429 // Mongolian

328 bool GetExemplarSetForLang(const std::string& lang,	430 "\\u1810-\\u1819\\u1820-\\u1877\\u1880-\\u18AA"

329 icu::UnicodeSet** lang_set) {	431 // Unified Canadian Syllabics

330 const LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;	432 "\\u18B0-\\u18F5"

331 LangToExemplarSetMap::const_iterator pos = map.find(lang);	433 // Tifinagh

332 if (pos != map.end()) {	434 "\\u2D30-\\u2D67\\u2D7F"

333 *lang_set = pos->second;	435 // Yi

334 return true;	436 "\\uA000-\\uA48C"

335 }	437 // Miao

336 return false;	438 "\\U00016F00-\\U00016F44\\U00016F50-\\U00016F7F"

337 }	439 "\\U00016F8F-\\U00016F9F]",

338	440 -1, US_INV),

339 void SetExemplarSetForLang(const std::string& lang, icu::UnicodeSet* lang_set) {	441 *status);

340 LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;	442 allowed_set.addAll(aspirational_scripts);

341 map.insert(std::make_pair(lang, lang_set));	443

342 }	444 // U+0338 is included in the recommended set while U+05F4 and U+2027 are in
	Ryan Sleevi 2016/03/16 20:30:49 s/set while/set, while/ jungshik at Google 2016/03/17 07:43:26 Done.
343	445 // the inclusion set, but are blacklisted as a part of Mozilla's IDN blacklist

344 static base::LazyInstance<base::Lock>::Leaky g_lang_set_lock =	446 // (http://kb.mozillazine.org/Network.IDN.blacklist_chars). U+0338 and U+2027

345 LAZY_INSTANCE_INITIALIZER;	447 // are dropped because U+0338 can look like a slash when rendered with a

346	448 // broken font and U+2027 can be confused with U+30FB (Katakana Middle Dot).
	Ryan Sleevi 2016/03/16 20:30:48 s/font and/font, and/ Peter Kasting 2016/03/17 06:01:24 Hmm, I don't think that's actually better. (It would be if "because" was replaced with a semicolon, though.) jungshik at Google 2016/03/17 07:43:25 Replaced 'because' with a semicolon :-) jungshik at Google 2016/03/17 07:43:26 Done.
347 // Returns true if all the characters in component_characters are used by	449 // U+05F4 (Hebrew Punctuation Gershayim) can look like a double quotation

348 // the language \|lang\|.	450 // mark, but using it in Hebrew should be safe. When used with a non-Hebrew

349 bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters,	451 // script, it'd be filtered by other checks in place.

350 const std::string& lang) {	452 allowed_set.remove(0x338u); // Combining Long Solidus Overlay

351 CR_DEFINE_STATIC_LOCAL(const icu::UnicodeSet, kASCIILetters, ('a', 'z'));	453 allowed_set.remove(0x2027u); // Hyphenation Point

352 icu::UnicodeSet* lang_set = nullptr;	454

353 // We're called from both the UI thread and the history thread.	455 uspoof_setAllowedUnicodeSet(checker_, &allowed_set, status);

354 {

355 base::AutoLock lock(g_lang_set_lock.Get());

356 if (!GetExemplarSetForLang(lang, &lang_set)) {

357 UErrorCode status = U_ZERO_ERROR;

358 ULocaleData* uld = ulocdata_open(lang.c_str(), &status);

359 // TODO(jungshik) Turn this check on when the ICU data file is

360 // rebuilt with the minimal subset of locale data for languages

361 // to which Chrome is not localized but which we offer in the list

362 // of languages selectable for Accept-Languages. With the rebuilt ICU

363 // data, ulocdata_open never should fall back to the default locale.

364 // (issue 2078)

365 // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING);

366 if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) {

367 lang_set = reinterpret_cast<icu::UnicodeSet*>(ulocdata_getExemplarSet(

368 uld, nullptr, 0, ULOCDATA_ES_STANDARD, &status));

369 // On success, if \|lang\| is compatible with ASCII Latin letters, add

370 // them.

371 if (lang_set && IsCompatibleWithASCIILetters(lang))

372 lang_set->addAll(kASCIILetters);

373 }

374

375 if (!lang_set)

376 lang_set = new icu::UnicodeSet(1, 0);

377

378 lang_set->freeze();

379 SetExemplarSetForLang(lang, lang_set);

380 ulocdata_close(uld);

381 }

382 }

383 return !lang_set->isEmpty() && lang_set->containsAll(component_characters);

384 }	456 }

385	457

386 // Returns true if the given Unicode host component is safe to display to the	458 // Returns true if the given Unicode host component is safe to display to the

387 // user.	459 // user. Note that this function does not deal with pure ASCII domain labels at

388 bool IsIDNComponentSafe(const base::char16* str,	460 // all even though it's possible to make up look-alike labels with ASCII

389 int str_len,	461 // characters alone.

390 const std::string& languages) {	462 bool IsIDNComponentSafe(base::StringPiece16 label) {

391 // Most common cases (non-IDN) do not reach here so that we don't	463 return g_idn_spoof_checker.Get().Check(label);

392 // need a fast return path.

393 // TODO(jungshik) : Check if there's any character inappropriate

394 // (although allowed) for domain names.

395 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and

396 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt

397 // For now, we borrow the list from Mozilla and tweaked it slightly.

398 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because

399 // they're gonna be canonicalized to U+0020 and full stop before

400 // reaching here.)

401 // The original list is available at

402 // http://kb.mozillazine.org/Network.IDN.blacklist_chars and

403 // at

404 // http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703

405

406 UErrorCode status = U_ZERO_ERROR;

407 #ifdef U_WCHAR_IS_UTF16

408 icu::UnicodeSet dangerous_characters(

409 icu::UnicodeString(

410 L"[[\\ \u00ad\u00bc\u00bd\u01c3\u0337\u0338"

411 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"

412 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"

413 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"

414 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"

415 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"

416 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"

417 L"[\ufffa-\ufffd]\U0001f50f\U0001f510\U0001f512\U0001f513]"),

418 status);

419 DCHECK(U_SUCCESS(status));

420 icu::RegexMatcher dangerous_patterns(

421 icu::UnicodeString(

422 // Lone katakana no, so, or n

423 L"[^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]"

424 // Repeating Japanese accent characters

425 L"\|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c]"),

426 0, status);

427 #else

428 icu::UnicodeSet dangerous_characters(

429 icu::UnicodeString(

430 "[[\\u0020\\u00ad\\u00bc\\u00bd\\u01c3\\u0337\\u0338"

431 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"

432 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"

433 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"

434 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"

435 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe"

436 "14"

437 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\uff"

438 "f9]"

439 "[\\ufffa-\\ufffd]\\U0001f50f\\U0001f510\\U0001f512\\U0001f513]",

440 -1, US_INV),

441 status);

442 DCHECK(U_SUCCESS(status));

443 icu::RegexMatcher dangerous_patterns(

444 icu::UnicodeString(

445 // Lone katakana no, so, or n

446 "[^\\p{Katakana}][\\u30ce\\u30f3\\u30bd][^\\p{Katakana}]"

447 // Repeating Japanese accent characters

448 "\|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c]"),

449 0, status);

450 #endif

451 DCHECK(U_SUCCESS(status));

452 icu::UnicodeSet component_characters;

453 icu::UnicodeString component_string(str, str_len);

454 component_characters.addAll(component_string);

455 if (dangerous_characters.containsSome(component_characters))

456 return false;

457

458 DCHECK(U_SUCCESS(status));

459 dangerous_patterns.reset(component_string);

460 if (dangerous_patterns.find())

461 return false;

462

463 // If the language list is empty, the result is completely determined

464 // by whether a component is a single script or not. This will block

465 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are

466 // allowed with \|languages\| (while it blocks Chinese + Latin letters with

467 // an accent as should be the case), but we want to err on the safe side

468 // when \|languages\| is empty.

469 if (languages.empty())

470 return IsIDNComponentInSingleScript(str, str_len);

471

472 // \|common_characters\| is made up of ASCII numbers, hyphen, plus and

473 // underscore that are used across scripts and allowed in domain names.

474 // (sync'd with characters allowed in url_canon_host with square

475 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.

476 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),

477 status);

478 DCHECK(U_SUCCESS(status));

479 // Subtract common characters because they're always allowed so that

480 // we just have to check if a language-specific set contains

481 // the remainder.

482 component_characters.removeAll(common_characters);

483

484 base::StringTokenizer t(languages, ",");

485 while (t.GetNext()) {

486 if (IsComponentCoveredByLang(component_characters, t.token()))

487 return true;

488 }

489 return false;

490 }	464 }

491	465

492 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to	466 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to

493 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().	467 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().

494 //	468 //

495 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with	469 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with the

496 // the backward compatibility in mind. What it does:	470 // backward compatibility in mind. What it does:

497 //	471 //

498 // 1. Use the up-to-date Unicode data.	472 // 1. Use the up-to-date Unicode data.

499 // 2. Define a case folding/mapping with the up-to-date Unicode data as	473 // 2. Define a case folding/mapping with the up-to-date Unicode data as in

500 // in IDNA 2003.	474 // IDNA 2003.

501 // 3. Use transitional mechanism for 4 deviation characters (sharp-s,	475 // 3. Use transitional mechanism for 4 deviation characters (sharp-s,

502 // final sigma, ZWJ and ZWNJ) for now.	476 // final sigma, ZWJ and ZWNJ) for now.

503 // 4. Continue to allow symbols and punctuations.	477 // 4. Continue to allow symbols and punctuations.

504 // 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDi rules.	478 // 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDi rules.

505 // 6. Do not apply STD3 rules	479 // 6. Do not apply STD3 rules

506 // 7. Do not allow unassigned code points.	480 // 7. Do not allow unassigned code points.

507 //	481 //

508 // It also closely matches what IE 10 does except for the BiDi check (	482 // It also closely matches what IE 10 does except for the BiDi check (

509 // http://goo.gl/3XBhqw ).	483 // http://goo.gl/3XBhqw ).

510 // See http://unicode.org/reports/tr46/ and references therein	484 // See http://unicode.org/reports/tr46/ and references therein for more

511 // for more details.	485 // details.

512 struct UIDNAWrapper {	486 struct UIDNAWrapper {

513 UIDNAWrapper() {	487 UIDNAWrapper() {

514 UErrorCode err = U_ZERO_ERROR;	488 UErrorCode err = U_ZERO_ERROR;

515 // TODO(jungshik): Change options as different parties (browsers,	489 // TODO(jungshik): Change options as different parties (browsers,

516 // registrars, search engines) converge toward a consensus.	490 // registrars, search engines) converge toward a consensus.

517 value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);	491 value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);

518 if (U_FAILURE(err))	492 if (U_FAILURE(err))

519 value = NULL;	493 value = NULL;

520 }	494 }

521	495

522 UIDNA* value;	496 UIDNA* value;

523 };	497 };

524	498

525 static base::LazyInstance<UIDNAWrapper>::Leaky g_uidna =	499 base::LazyInstance<UIDNAWrapper>::Leaky g_uidna = LAZY_INSTANCE_INITIALIZER;

526 LAZY_INSTANCE_INITIALIZER;

527	500

528 // Converts one component of a host (between dots) to IDN if safe. The result	501 // Converts one component (label) of a host (between dots) to Unicode if safe.

529 // will be APPENDED to the given output string and will be the same as the input	502 // The result will be APPENDED to the given output string and will be the

530 // if it is not IDN or the IDN is unsafe to display. Returns whether any	503 // same as the input if it is not IDN in ACE/punycode or the IDN is unsafe to

531 // conversion was performed.	504 // display.

	505 // Returns whether any conversion was performed.

532 bool IDNToUnicodeOneComponent(const base::char16* comp,	506 bool IDNToUnicodeOneComponent(const base::char16* comp,

533 size_t comp_len,	507 size_t comp_len,

534 const std::string& languages,

535 base::string16* out) {	508 base::string16* out) {

536 DCHECK(out);	509 DCHECK(out);

537 if (comp_len == 0)	510 if (comp_len == 0)

538 return false;	511 return false;

539	512

540 // Only transform if the input can be an IDN component.	513 // Only transform if the input can be an IDN component.

541 static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};	514 static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};

542 if ((comp_len > arraysize(kIdnPrefix)) &&	515 if ((comp_len > arraysize(kIdnPrefix)) &&

543 !memcmp(comp, kIdnPrefix, sizeof(kIdnPrefix))) {	516 !memcmp(comp, kIdnPrefix, sizeof(kIdnPrefix))) {

544 UIDNA* uidna = g_uidna.Get().value;	517 UIDNA* uidna = g_uidna.Get().value;

545 DCHECK(uidna != NULL);	518 DCHECK(uidna != NULL);

546 size_t original_length = out->length();	519 size_t original_length = out->length();

547 int output_length = 64;	520 int32_t output_length = 64;

548 UIDNAInfo info = UIDNA_INFO_INITIALIZER;	521 UIDNAInfo info = UIDNA_INFO_INITIALIZER;

549 UErrorCode status;	522 UErrorCode status;

550 do {	523 do {

551 out->resize(original_length + output_length);	524 out->resize(original_length + output_length);

552 status = U_ZERO_ERROR;	525 status = U_ZERO_ERROR;

553 // This returns the actual length required. If this is more than 64	526 // This returns the actual length required. If this is more than 64

554 // code units, \|status\| will be U_BUFFER_OVERFLOW_ERROR and we'll try	527 // code units, \|status\| will be U_BUFFER_OVERFLOW_ERROR and we'll try

555 // the conversion again, but with a sufficiently large buffer.	528 // the conversion again, but with a sufficiently large buffer.

556 output_length = uidna_labelToUnicode(	529 output_length = uidna_labelToUnicode(

557 uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length],	530 uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length],

558 output_length, &info, &status);	531 output_length, &info, &status);

559 } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0));	532 } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0));

560	533

561 if (U_SUCCESS(status) && info.errors == 0) {	534 if (U_SUCCESS(status) && info.errors == 0) {

562 // Converted successfully. Ensure that the converted component	535 // Converted successfully. Ensure that the converted component

563 // can be safely displayed to the user.	536 // can be safely displayed to the user.

564 out->resize(original_length + output_length);	537 out->resize(original_length + output_length);

565 if (IsIDNComponentSafe(out->data() + original_length, output_length,	538 if (IsIDNComponentSafe(

566 languages))	539 base::StringPiece16(out->data() + original_length,

	540 base::checked_cast<size_t>(output_length))))

567 return true;	541 return true;

568 }	542 }

569	543

570 // Something went wrong. Revert to original string.	544 // Something went wrong. Revert to original string.

571 out->resize(original_length);	545 out->resize(original_length);

572 }	546 }

573	547

574 // We get here with no IDN or on error, in which case we just append the	548 // We get here with no IDN or on error, in which case we just append the

575 // literal input.	549 // literal input.

576 out->append(comp, comp_len);	550 out->append(comp, comp_len);

(...skipping 14 matching lines...) Expand all Loading...
591 const std::string& languages,	565 const std::string& languages,

592 FormatUrlTypes format_types,	566 FormatUrlTypes format_types,

593 net::UnescapeRule::Type unescape_rules,	567 net::UnescapeRule::Type unescape_rules,

594 url::Parsed* new_parsed,	568 url::Parsed* new_parsed,

595 size_t* prefix_end,	569 size_t* prefix_end,

596 size_t* offset_for_adjustment) {	570 size_t* offset_for_adjustment) {

597 std::vector<size_t> offsets;	571 std::vector<size_t> offsets;

598 if (offset_for_adjustment)	572 if (offset_for_adjustment)

599 offsets.push_back(*offset_for_adjustment);	573 offsets.push_back(*offset_for_adjustment);

600 base::string16 result =	574 base::string16 result =

601 FormatUrlWithOffsets(url, languages, format_types, unescape_rules,	575 FormatUrlWithOffsets(url, std::string(), format_types, unescape_rules,

602 new_parsed, prefix_end, &offsets);	576 new_parsed, prefix_end, &offsets);

603 if (offset_for_adjustment)	577 if (offset_for_adjustment)

604 *offset_for_adjustment = offsets[0];	578 *offset_for_adjustment = offsets[0];

605 return result;	579 return result;

606 }	580 }

607	581

608 base::string16 FormatUrlWithOffsets(	582 base::string16 FormatUrlWithOffsets(

609 const GURL& url,	583 const GURL& url,

610 const std::string& languages,	584 const std::string& languages,

611 FormatUrlTypes format_types,	585 FormatUrlTypes format_types,

612 net::UnescapeRule::Type unescape_rules,	586 net::UnescapeRule::Type unescape_rules,

613 url::Parsed* new_parsed,	587 url::Parsed* new_parsed,

614 size_t* prefix_end,	588 size_t* prefix_end,

615 std::vector<size_t>* offsets_for_adjustment) {	589 std::vector<size_t>* offsets_for_adjustment) {

616 base::OffsetAdjuster::Adjustments adjustments;	590 base::OffsetAdjuster::Adjustments adjustments;

617 const base::string16& format_url_return_value =	591 const base::string16& format_url_return_value =

618 FormatUrlWithAdjustments(url, languages, format_types, unescape_rules,	592 FormatUrlWithAdjustments(url, std::string(), format_types, unescape_rules,

619 new_parsed, prefix_end, &adjustments);	593 new_parsed, prefix_end, &adjustments);

620 base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);	594 base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);

621 if (offsets_for_adjustment) {	595 if (offsets_for_adjustment) {

622 std::for_each(	596 std::for_each(

623 offsets_for_adjustment->begin(), offsets_for_adjustment->end(),	597 offsets_for_adjustment->begin(), offsets_for_adjustment->end(),

624 base::LimitOffset<std::string>(format_url_return_value.length()));	598 base::LimitOffset<std::string>(format_url_return_value.length()));

625 }	599 }

626 return format_url_return_value;	600 return format_url_return_value;

627 }	601 }

628	602

(...skipping 14 matching lines...) Expand all Loading...
643 *new_parsed = url::Parsed();	617 *new_parsed = url::Parsed();

644	618

645 // Special handling for view-source:. Don't use content::kViewSourceScheme	619 // Special handling for view-source:. Don't use content::kViewSourceScheme

646 // because this library shouldn't depend on chrome.	620 // because this library shouldn't depend on chrome.

647 const char kViewSource[] = "view-source";	621 const char kViewSource[] = "view-source";

648 // Reject "view-source:view-source:..." to avoid deep recursion.	622 // Reject "view-source:view-source:..." to avoid deep recursion.

649 const char kViewSourceTwice[] = "view-source:view-source:";	623 const char kViewSourceTwice[] = "view-source:view-source:";

650 if (url.SchemeIs(kViewSource) &&	624 if (url.SchemeIs(kViewSource) &&

651 !base::StartsWith(url.possibly_invalid_spec(), kViewSourceTwice,	625 !base::StartsWith(url.possibly_invalid_spec(), kViewSourceTwice,

652 base::CompareCase::INSENSITIVE_ASCII)) {	626 base::CompareCase::INSENSITIVE_ASCII)) {

653 return FormatViewSourceUrl(url, languages, format_types, unescape_rules,	627 return FormatViewSourceUrl(url, format_types, unescape_rules,

654 new_parsed, prefix_end, adjustments);	628 new_parsed, prefix_end, adjustments);

655 }	629 }

656	630

657 // We handle both valid and invalid URLs (this will give us the spec	631 // We handle both valid and invalid URLs (this will give us the spec

658 // regardless of validity).	632 // regardless of validity).

659 const std::string& spec = url.possibly_invalid_spec();	633 const std::string& spec = url.possibly_invalid_spec();

660 const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();	634 const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();

661	635

662 // Scheme & separators. These are ASCII.	636 // Scheme & separators. These are ASCII.

663 base::string16 url_string;	637 base::string16 url_string;

(...skipping 49 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
713 AppendFormattedComponent(spec, parsed.password,	687 AppendFormattedComponent(spec, parsed.password,

714 NonHostComponentTransform(unescape_rules),	688 NonHostComponentTransform(unescape_rules),

715 &url_string, &new_parsed->password, adjustments);	689 &url_string, &new_parsed->password, adjustments);

716 if (parsed.username.is_valid() \|\| parsed.password.is_valid())	690 if (parsed.username.is_valid() \|\| parsed.password.is_valid())

717 url_string.push_back('@');	691 url_string.push_back('@');

718 }	692 }

719 if (prefix_end)	693 if (prefix_end)

720 *prefix_end = static_cast<size_t>(url_string.length());	694 *prefix_end = static_cast<size_t>(url_string.length());

721	695

722 // Host.	696 // Host.

723 AppendFormattedComponent(spec, parsed.host, HostComponentTransform(languages),	697 AppendFormattedComponent(spec, parsed.host, HostComponentTransform(),

724 &url_string, &new_parsed->host, adjustments);	698 &url_string, &new_parsed->host, adjustments);

725	699

726 // Port.	700 // Port.

727 if (parsed.port.is_nonempty()) {	701 if (parsed.port.is_nonempty()) {

728 url_string.push_back(':');	702 url_string.push_back(':');

729 new_parsed->port.begin = url_string.length();	703 new_parsed->port.begin = url_string.length();

730 url_string.insert(url_string.end(), spec.begin() + parsed.port.begin,	704 url_string.insert(url_string.end(), spec.begin() + parsed.port.begin,

731 spec.begin() + parsed.port.end());	705 spec.begin() + parsed.port.end());

732 new_parsed->port.len = url_string.length() - new_parsed->port.begin;	706 new_parsed->port.len = url_string.length() - new_parsed->port.begin;

733 } else {	707 } else {

(...skipping 55 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
789 // the hostname.	763 // the hostname.

790 return url.IsStandard() && !url.SchemeIsFile() && !url.SchemeIsFileSystem() &&	764 return url.IsStandard() && !url.SchemeIsFile() && !url.SchemeIsFileSystem() &&

791 !url.has_query() && !url.has_ref() && url.path() == "/";	765 !url.has_query() && !url.has_ref() && url.path() == "/";

792 }	766 }

793	767

794 void AppendFormattedHost(const GURL& url,	768 void AppendFormattedHost(const GURL& url,

795 const std::string& languages,	769 const std::string& languages,

796 base::string16* output) {	770 base::string16* output) {

797 AppendFormattedComponent(	771 AppendFormattedComponent(

798 url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host,	772 url.possibly_invalid_spec(), url.parsed_for_possibly_invalid_spec().host,

799 HostComponentTransform(languages), output, NULL, NULL);	773 HostComponentTransform(), output, NULL, NULL);

800 }	774 }

801	775

802 base::string16 IDNToUnicode(const std::string& host,	776 base::string16 IDNToUnicode(const std::string& host,

803 const std::string& languages) {	777 const std::string& languages) {

804 return IDNToUnicodeWithAdjustments(host, languages, NULL);	778 return IDNToUnicodeWithAdjustments(host, NULL);

805 }	779 }

806	780

807 base::string16 StripWWW(const base::string16& text) {	781 base::string16 StripWWW(const base::string16& text) {

808 const base::string16 www(base::ASCIIToUTF16("www."));	782 const base::string16 www(base::ASCIIToUTF16("www."));

809 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)	783 return base::StartsWith(text, www, base::CompareCase::SENSITIVE)

810 ? text.substr(www.length()) : text;	784 ? text.substr(www.length()) : text;

811 }	785 }

812	786

813 base::string16 StripWWWFromHost(const GURL& url) {	787 base::string16 StripWWWFromHost(const GURL& url) {

814 DCHECK(url.is_valid());	788 DCHECK(url.is_valid());

815 return StripWWW(base::ASCIIToUTF16(url.host_piece()));	789 return StripWWW(base::ASCIIToUTF16(url.host_piece()));

816 }	790 }

817	791

818 } // namespace url_formatter	792 } // namespace url_formatter

OLD	NEW