net/base/net_util_icu.cc - Issue 1260033005: Revert of Move net::FormatUrl and friends outside of //net and into //components

Side by Side Diff: net/base/net_util_icu.cc

Issue 1260033005: Revert of Move net::FormatUrl and friends outside of //net and into //components (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "net/base/net_util.h"	5 #include "net/base/net_util.h"

6	6

	7 #include <map>

	8 #include <vector>

	9

7 #include "base/i18n/time_formatting.h"	10 #include "base/i18n/time_formatting.h"

8 #include "base/json/string_escape.h"	11 #include "base/json/string_escape.h"

	12 #include "base/lazy_instance.h"

	13 #include "base/logging.h"

	14 #include "base/memory/singleton.h"

	15 #include "base/stl_util.h"

	16 #include "base/strings/string_tokenizer.h"

9 #include "base/strings/string_util.h"	17 #include "base/strings/string_util.h"

	18 #include "base/strings/utf_offset_string_conversions.h"

10 #include "base/strings/utf_string_conversions.h"	19 #include "base/strings/utf_string_conversions.h"

11 #include "net/base/escape.h"	20 #include "base/time/time.h"

	21 #include "url/gurl.h"

	22 #include "third_party/icu/source/common/unicode/uidna.h"

	23 #include "third_party/icu/source/common/unicode/uniset.h"

	24 #include "third_party/icu/source/common/unicode/uscript.h"

	25 #include "third_party/icu/source/common/unicode/uset.h"

	26 #include "third_party/icu/source/i18n/unicode/datefmt.h"

	27 #include "third_party/icu/source/i18n/unicode/regex.h"

	28 #include "third_party/icu/source/i18n/unicode/ulocdata.h"

	29

	30 using base::Time;

12	31

13 namespace net {	32 namespace net {

14	33

	34 namespace {

	35

	36 typedef std::vector<size_t> Offsets;

	37

	38 // Does some simple normalization of scripts so we can allow certain scripts

	39 // to exist together.

	40 // TODO(brettw) bug 880223: we should allow some other languages to be

	41 // oombined such as Chinese and Latin. We will probably need a more

	42 // complicated system of language pairs to have more fine-grained control.

	43 UScriptCode NormalizeScript(UScriptCode code) {

	44 switch (code) {

	45 case USCRIPT_KATAKANA:

	46 case USCRIPT_HIRAGANA:

	47 case USCRIPT_KATAKANA_OR_HIRAGANA:

	48 case USCRIPT_HANGUL: // This one is arguable.

	49 return USCRIPT_HAN;

	50 default:

	51 return code;

	52 }

	53 }

	54

	55 bool IsIDNComponentInSingleScript(const base::char16* str, int str_len) {

	56 UScriptCode first_script = USCRIPT_INVALID_CODE;

	57 bool is_first = true;

	58

	59 int i = 0;

	60 while (i < str_len) {

	61 unsigned code_point;

	62 U16_NEXT(str, i, str_len, code_point);

	63

	64 UErrorCode err = U_ZERO_ERROR;

	65 UScriptCode cur_script = uscript_getScript(code_point, &err);

	66 if (err != U_ZERO_ERROR)

	67 return false; // Report mixed on error.

	68 cur_script = NormalizeScript(cur_script);

	69

	70 // TODO(brettw) We may have to check for USCRIPT_INHERENT as well.

	71 if (is_first && cur_script != USCRIPT_COMMON) {

	72 first_script = cur_script;

	73 is_first = false;

	74 } else {

	75 if (cur_script != USCRIPT_COMMON && cur_script != first_script)

	76 return false;

	77 }

	78 }

	79 return true;

	80 }

	81

	// Check if the script of a language can be 'safely' mixed with
	// Latin letters in the ASCII range. Only the first two characters of |lang|
	// are examined, and the match is case-sensitive.
	bool IsCompatibleWithASCIILetters(const std::string& lang) {
	  // For now, just list Chinese, Japanese and Korean (positive list).
	  // An alternative is negative-listing (languages using Greek and
	  // Cyrillic letters), but it can be more dangerous.
	  static const char* const kCompatiblePrefixes[] = {"zh", "ja", "ko"};
	  for (const char* prefix : kCompatiblePrefixes) {
	    // Compares at most the first two characters of |lang| with |prefix|.
	    if (lang.compare(0, 2, prefix) == 0)
	      return true;
	  }
	  return false;
	}

	92

	93 typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap;

	94

	95 class LangToExemplarSet {

	96 public:

	97 static LangToExemplarSet* GetInstance() {

	98 return Singleton<LangToExemplarSet>::get();

	99 }

	100

	101 private:

	102 LangToExemplarSetMap map;

	103 LangToExemplarSet() { }

	104 ~LangToExemplarSet() {

	105 STLDeleteContainerPairSecondPointers(map.begin(), map.end());

	106 }

	107

	108 friend class Singleton<LangToExemplarSet>;

	109 friend struct DefaultSingletonTraits<LangToExemplarSet>;

	110 friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**);

	111 friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*);

	112

	113 DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet);

	114 };

	115

	116 bool GetExemplarSetForLang(const std::string& lang,

	117 icu::UnicodeSet** lang_set) {

	118 const LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;

	119 LangToExemplarSetMap::const_iterator pos = map.find(lang);

	120 if (pos != map.end()) {

	121 *lang_set = pos->second;

	122 return true;

	123 }

	124 return false;

	125 }

	126

	127 void SetExemplarSetForLang(const std::string& lang,

	128 icu::UnicodeSet* lang_set) {

	129 LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;

	130 map.insert(std::make_pair(lang, lang_set));

	131 }

	132

	133 static base::LazyInstance<base::Lock>::Leaky

	134 g_lang_set_lock = LAZY_INSTANCE_INITIALIZER;

	135

	136 // Returns true if all the characters in component_characters are used by

	137 // the language \|lang\|.

	138 bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters,

	139 const std::string& lang) {

	140 CR_DEFINE_STATIC_LOCAL(

	141 const icu::UnicodeSet, kASCIILetters, ('a', 'z'));

	142 icu::UnicodeSet* lang_set = nullptr;

	143 // We're called from both the UI thread and the history thread.

	144 {

	145 base::AutoLock lock(g_lang_set_lock.Get());

	146 if (!GetExemplarSetForLang(lang, &lang_set)) {

	147 UErrorCode status = U_ZERO_ERROR;

	148 ULocaleData* uld = ulocdata_open(lang.c_str(), &status);

	149 // TODO(jungshik) Turn this check on when the ICU data file is

	150 // rebuilt with the minimal subset of locale data for languages

	151 // to which Chrome is not localized but which we offer in the list

	152 // of languages selectable for Accept-Languages. With the rebuilt ICU

	153 // data, ulocdata_open never should fall back to the default locale.

	154 // (issue 2078)

	155 // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING);

	156 if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) {

	157 lang_set = reinterpret_cast<icu::UnicodeSet*>(ulocdata_getExemplarSet(

	158 uld, nullptr, 0, ULOCDATA_ES_STANDARD, &status));

	159 // On success, if \|lang\| is compatible with ASCII Latin letters, add

	160 // them.

	161 if (lang_set && IsCompatibleWithASCIILetters(lang))

	162 lang_set->addAll(kASCIILetters);

	163 }

	164

	165 if (!lang_set)

	166 lang_set = new icu::UnicodeSet(1, 0);

	167

	168 lang_set->freeze();

	169 SetExemplarSetForLang(lang, lang_set);

	170 ulocdata_close(uld);

	171 }

	172 }

	173 return !lang_set->isEmpty() && lang_set->containsAll(component_characters);

	174 }

	175

	176 // Returns true if the given Unicode host component is safe to display to the

	177 // user.

	178 bool IsIDNComponentSafe(const base::char16* str,

	179 int str_len,

	180 const std::string& languages) {

	181 // Most common cases (non-IDN) do not reach here so that we don't

	182 // need a fast return path.

	183 // TODO(jungshik) : Check if there's any character inappropriate

	184 // (although allowed) for domain names.

	185 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and

	186 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt

	187 // For now, we borrow the list from Mozilla and tweaked it slightly.

	188 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because

	189 // they're gonna be canonicalized to U+0020 and full stop before

	190 // reaching here.)

	191 // The original list is available at

	192 // http://kb.mozillazine.org/Network.IDN.blacklist_chars and

	193 // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js# 703

	194

	195 UErrorCode status = U_ZERO_ERROR;

	196 #ifdef U_WCHAR_IS_UTF16

	197 icu::UnicodeSet dangerous_characters(

	198 icu::UnicodeString(

	199 L"[[\\ \u00ad\u00bc\u00bd\u01c3\u0337\u0338"

	200 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"

	201 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"

	202 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"

	203 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"

	204 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"

	205 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"

	206 L"[\ufffa-\ufffd]\U0001f50f\U0001f510\U0001f512\U0001f513]"),

	207 status);

	208 DCHECK(U_SUCCESS(status));

	209 icu::RegexMatcher dangerous_patterns(icu::UnicodeString(

	210 // Lone katakana no, so, or n

	211 L"[^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]"

	212 // Repeating Japanese accent characters

	213 L"\|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c]"),

	214 0, status);

	215 #else

	216 icu::UnicodeSet dangerous_characters(icu::UnicodeString(

	217 "[[\\u0020\\u00ad\\u00bc\\u00bd\\u01c3\\u0337\\u0338"

	218 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"

	219 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"

	220 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"

	221 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"

	222 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"

	223 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"

	224 "[\\ufffa-\\ufffd]\\U0001f50f\\U0001f510\\U0001f512\\U0001f513]", -1,

	225 US_INV), status);

	226 DCHECK(U_SUCCESS(status));

	227 icu::RegexMatcher dangerous_patterns(icu::UnicodeString(

	228 // Lone katakana no, so, or n

	229 "[^\\p{Katakana}][\\u30ce\\u30f3\\u30bd][^\\p{Katakana}]"

	230 // Repeating Japanese accent characters

	231 "\|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c]"),

	232 0, status);

	233 #endif

	234 DCHECK(U_SUCCESS(status));

	235 icu::UnicodeSet component_characters;

	236 icu::UnicodeString component_string(str, str_len);

	237 component_characters.addAll(component_string);

	238 if (dangerous_characters.containsSome(component_characters))

	239 return false;

	240

	241 DCHECK(U_SUCCESS(status));

	242 dangerous_patterns.reset(component_string);

	243 if (dangerous_patterns.find())

	244 return false;

	245

	246 // If the language list is empty, the result is completely determined

	247 // by whether a component is a single script or not. This will block

	248 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are

	249 // allowed with \|languages\| (while it blocks Chinese + Latin letters with

	250 // an accent as should be the case), but we want to err on the safe side

	251 // when \|languages\| is empty.

	252 if (languages.empty())

	253 return IsIDNComponentInSingleScript(str, str_len);

	254

	255 // \|common_characters\| is made up of ASCII numbers, hyphen, plus and

	256 // underscore that are used across scripts and allowed in domain names.

	257 // (sync'd with characters allowed in url_canon_host with square

	258 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.

	259 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),

	260 status);

	261 DCHECK(U_SUCCESS(status));

	262 // Subtract common characters because they're always allowed so that

	263 // we just have to check if a language-specific set contains

	264 // the remainder.

	265 component_characters.removeAll(common_characters);

	266

	267 base::StringTokenizer t(languages, ",");

	268 while (t.GetNext()) {

	269 if (IsComponentCoveredByLang(component_characters, t.token()))

	270 return true;

	271 }

	272 return false;

	273 }

	274

	275 // A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to

	276 // a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().

	277 //

	278 // We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with

	279 // the backward compatibility in mind. What it does:

	280 //

	281 // 1. Use the up-to-date Unicode data.

	282 // 2. Define a case folding/mapping with the up-to-date Unicode data as

	283 // in IDNA 2003.

	284 // 3. Use transitional mechanism for 4 deviation characters (sharp-s,

	285 // final sigma, ZWJ and ZWNJ) for now.

	286 // 4. Continue to allow symbols and punctuations.

	287 // 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules.

	288 // 6. Do not apply STD3 rules

	289 // 7. Do not allow unassigned code points.

	290 //

	291 // It also closely matches what IE 10 does except for the BiDi check (

	292 // http://goo.gl/3XBhqw ).

	293 // See http://http://unicode.org/reports/tr46/ and references therein

	294 // for more details.

	295 struct UIDNAWrapper {

	296 UIDNAWrapper() {

	297 UErrorCode err = U_ZERO_ERROR;

	298 // TODO(jungshik): Change options as different parties (browsers,

	299 // registrars, search engines) converge toward a consensus.

	300 value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err);

	301 if (U_FAILURE(err))

	302 value = NULL;

	303 }

	304

	305 UIDNA* value;

	306 };

	307

	308 static base::LazyInstance<UIDNAWrapper>::Leaky

	309 g_uidna = LAZY_INSTANCE_INITIALIZER;

	310

	311 // Converts one component of a host (between dots) to IDN if safe. The result

	312 // will be APPENDED to the given output string and will be the same as the input

	313 // if it is not IDN or the IDN is unsafe to display. Returns whether any

	314 // conversion was performed.

	315 bool IDNToUnicodeOneComponent(const base::char16* comp,

	316 size_t comp_len,

	317 const std::string& languages,

	318 base::string16* out) {

	319 DCHECK(out);

	320 if (comp_len == 0)

	321 return false;

	322

	323 // Only transform if the input can be an IDN component.

	324 static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};

	325 if ((comp_len > arraysize(kIdnPrefix)) &&

	326 !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(base::char16))) {

	327 UIDNA* uidna = g_uidna.Get().value;

	328 DCHECK(uidna != NULL);

	329 size_t original_length = out->length();

	330 int output_length = 64;

	331 UIDNAInfo info = UIDNA_INFO_INITIALIZER;

	332 UErrorCode status;

	333 do {

	334 out->resize(original_length + output_length);

	335 status = U_ZERO_ERROR;

	336 // This returns the actual length required. If this is more than 64

	337 // code units, \|status\| will be U_BUFFER_OVERFLOW_ERROR and we'll try

	338 // the conversion again, but with a sufficiently large buffer.

	339 output_length = uidna_labelToUnicode(

	340 uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length],

	341 output_length, &info, &status);

	342 } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0));

	343

	344 if (U_SUCCESS(status) && info.errors == 0) {

	345 // Converted successfully. Ensure that the converted component

	346 // can be safely displayed to the user.

	347 out->resize(original_length + output_length);

	348 if (IsIDNComponentSafe(out->data() + original_length, output_length,

	349 languages))

	350 return true;

	351 }

	352

	353 // Something went wrong. Revert to original string.

	354 out->resize(original_length);

	355 }

	356

	357 // We get here with no IDN or on error, in which case we just append the

	358 // literal input.

	359 out->append(comp, comp_len);

	360 return false;

	361 }

	362

	363 // TODO(brettw) bug 734373: check the scripts for each host component and

	364 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for

	365 // scripts that the user has installed. For now, just put the entire

	366 // path through IDN. Maybe this feature can be implemented in ICU itself?

	367 //

	368 // We may want to skip this step in the case of file URLs to allow unicode

	369 // UNC hostnames regardless of encodings.

	370 base::string16 IDNToUnicodeWithAdjustments(

	371 const std::string& host,

	372 const std::string& languages,

	373 base::OffsetAdjuster::Adjustments* adjustments) {

	374 if (adjustments)

	375 adjustments->clear();

	376 // Convert the ASCII input to a base::string16 for ICU.

	377 base::string16 input16;

	378 input16.reserve(host.length());

	379 input16.insert(input16.end(), host.begin(), host.end());

	380

	381 // Do each component of the host separately, since we enforce script matching

	382 // on a per-component basis.

	383 base::string16 out16;

	384 {

	385 for (size_t component_start = 0, component_end;

	386 component_start < input16.length();

	387 component_start = component_end + 1) {

	388 // Find the end of the component.

	389 component_end = input16.find('.', component_start);

	390 if (component_end == base::string16::npos)

	391 component_end = input16.length(); // For getting the last component.

	392 size_t component_length = component_end - component_start;

	393 size_t new_component_start = out16.length();

	394 bool converted_idn = false;

	395 if (component_end > component_start) {

	396 // Add the substring that we just found.

	397 converted_idn = IDNToUnicodeOneComponent(

	398 input16.data() + component_start, component_length, languages,

	399 &out16);

	400 }

	401 size_t new_component_length = out16.length() - new_component_start;

	402

	403 if (converted_idn && adjustments) {

	404 adjustments->push_back(base::OffsetAdjuster::Adjustment(

	405 component_start, component_length, new_component_length));

	406 }

	407

	408 // Need to add the dot we just found (if we found one).

	409 if (component_end < input16.length())

	410 out16.push_back('.');

	411 }

	412 }

	413 return out16;

	414 }

	415

	416 // If \|component\| is valid, its begin is incremented by \|delta\|.

	417 void AdjustComponent(int delta, url::Component* component) {

	418 if (!component->is_valid())

	419 return;

	420

	421 DCHECK(delta >= 0 \|\| component->begin >= -delta);

	422 component->begin += delta;

	423 }

	424

	425 // Adjusts all the components of \|parsed\| by \|delta\|, except for the scheme.

	426 void AdjustAllComponentsButScheme(int delta, url::Parsed* parsed) {

	427 AdjustComponent(delta, &(parsed->username));

	428 AdjustComponent(delta, &(parsed->password));

	429 AdjustComponent(delta, &(parsed->host));

	430 AdjustComponent(delta, &(parsed->port));

	431 AdjustComponent(delta, &(parsed->path));

	432 AdjustComponent(delta, &(parsed->query));

	433 AdjustComponent(delta, &(parsed->ref));

	434 }

	435

	436 // Helper for FormatUrlWithOffsets().

	437 base::string16 FormatViewSourceUrl(

	438 const GURL& url,

	439 const std::string& languages,

	440 FormatUrlTypes format_types,

	441 UnescapeRule::Type unescape_rules,

	442 url::Parsed* new_parsed,

	443 size_t* prefix_end,

	444 base::OffsetAdjuster::Adjustments* adjustments) {

	445 DCHECK(new_parsed);

	446 const char kViewSource[] = "view-source:";

	447 const size_t kViewSourceLength = arraysize(kViewSource) - 1;

	448

	449 // Format the underlying URL and record adjustments.

	450 const std::string& url_str(url.possibly_invalid_spec());

	451 adjustments->clear();

	452 base::string16 result(base::ASCIIToUTF16(kViewSource) +

	453 FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)),

	454 languages, format_types, unescape_rules,

	455 new_parsed, prefix_end, adjustments));

	456 // Revise \|adjustments\| by shifting to the offsets to prefix that the above

	457 // call to FormatUrl didn't get to see.

	458 for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin();

	459 it != adjustments->end(); ++it)

	460 it->original_offset += kViewSourceLength;

	461

	462 // Adjust positions of the parsed components.

	463 if (new_parsed->scheme.is_nonempty()) {

	464 // Assume "view-source:real-scheme" as a scheme.

	465 new_parsed->scheme.len += kViewSourceLength;

	466 } else {

	467 new_parsed->scheme.begin = 0;

	468 new_parsed->scheme.len = kViewSourceLength - 1;

	469 }

	470 AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);

	471

	472 if (prefix_end)

	473 *prefix_end += kViewSourceLength;

	474

	475 return result;

	476 }

	477

	478 class AppendComponentTransform {

	479 public:

	480 AppendComponentTransform() {}

	481 virtual ~AppendComponentTransform() {}

	482

	483 virtual base::string16 Execute(

	484 const std::string& component_text,

	485 base::OffsetAdjuster::Adjustments* adjustments) const = 0;

	486

	487 // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an

	488 // accessible copy constructor in order to call AppendFormattedComponent()

	489 // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).

	490 };

	491

	492 class HostComponentTransform : public AppendComponentTransform {

	493 public:

	494 explicit HostComponentTransform(const std::string& languages)

	495 : languages_(languages) {

	496 }

	497

	498 private:

	499 base::string16 Execute(

	500 const std::string& component_text,

	501 base::OffsetAdjuster::Adjustments* adjustments) const override {

	502 return IDNToUnicodeWithAdjustments(component_text, languages_,

	503 adjustments);

	504 }

	505

	506 const std::string& languages_;

	507 };

	508

	509 class NonHostComponentTransform : public AppendComponentTransform {

	510 public:

	511 explicit NonHostComponentTransform(UnescapeRule::Type unescape_rules)

	512 : unescape_rules_(unescape_rules) {

	513 }

	514

	515 private:

	516 base::string16 Execute(

	517 const std::string& component_text,

	518 base::OffsetAdjuster::Adjustments* adjustments) const override {

	519 return (unescape_rules_ == UnescapeRule::NONE) ?

	520 base::UTF8ToUTF16WithAdjustments(component_text, adjustments) :

	521 UnescapeAndDecodeUTF8URLComponentWithAdjustments(component_text,

	522 unescape_rules_, adjustments);

	523 }

	524

	525 const UnescapeRule::Type unescape_rules_;

	526 };

	527

	528 // Transforms the portion of \|spec\| covered by \|original_component\| according to

	529 // \|transform\|. Appends the result to \|output\|. If \|output_component\| is

	530 // non-NULL, its start and length are set to the transformed component's new

	531 // start and length. If \|adjustments\| is non-NULL, appends adjustments (if

	532 // any) that reflect the transformation the original component underwent to

	533 // become the transformed value appended to \|output\|.

	534 void AppendFormattedComponent(const std::string& spec,

	535 const url::Component& original_component,

	536 const AppendComponentTransform& transform,

	537 base::string16* output,

	538 url::Component* output_component,

	539 base::OffsetAdjuster::Adjustments* adjustments) {

	540 DCHECK(output);

	541 if (original_component.is_nonempty()) {

	542 size_t original_component_begin =

	543 static_cast<size_t>(original_component.begin);

	544 size_t output_component_begin = output->length();

	545 std::string component_str(spec, original_component_begin,

	546 static_cast<size_t>(original_component.len));

	547

	548 // Transform \|component_str\| and modify \|adjustments\| appropriately.

	549 base::OffsetAdjuster::Adjustments component_transform_adjustments;

	550 output->append(

	551 transform.Execute(component_str, &component_transform_adjustments));

	552

	553 // Shift all the adjustments made for this component so the offsets are

	554 // valid for the original string and add them to \|adjustments\|.

	555 for (base::OffsetAdjuster::Adjustments::iterator comp_iter =

	556 component_transform_adjustments.begin();

	557 comp_iter != component_transform_adjustments.end(); ++comp_iter)

	558 comp_iter->original_offset += original_component_begin;

	559 if (adjustments) {

	560 adjustments->insert(adjustments->end(),

	561 component_transform_adjustments.begin(),

	562 component_transform_adjustments.end());

	563 }

	564

	565 // Set positions of the parsed component.

	566 if (output_component) {

	567 output_component->begin = static_cast<int>(output_component_begin);

	568 output_component->len =

	569 static_cast<int>(output->length() - output_component_begin);

	570 }

	571 } else if (output_component) {

	572 output_component->reset();

	573 }

	574 }

	575

	576 } // namespace

	577

	578 const FormatUrlType kFormatUrlOmitNothing = 0;

	579 const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0;

	580 const FormatUrlType kFormatUrlOmitHTTP = 1 << 1;

	581 const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2;

	582 const FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword \|

	583 kFormatUrlOmitHTTP \| kFormatUrlOmitTrailingSlashOnBareHostname;

	584

	585 base::string16 IDNToUnicode(const std::string& host,

	586 const std::string& languages) {

	587 return IDNToUnicodeWithAdjustments(host, languages, NULL);

	588 }

	589

15 std::string GetDirectoryListingEntry(const base::string16& name,	590 std::string GetDirectoryListingEntry(const base::string16& name,

16 const std::string& raw_bytes,	591 const std::string& raw_bytes,

17 bool is_dir,	592 bool is_dir,

18 int64_t size,	593 int64_t size,

19 base::Time modified) {	594 Time modified) {

20 std::string result;	595 std::string result;

21 result.append("<script>addRow(");	596 result.append("<script>addRow(");

22 base::EscapeJSONString(name, true, &result);	597 base::EscapeJSONString(name, true, &result);

23 result.append(",");	598 result.append(",");

24 if (raw_bytes.empty()) {	599 if (raw_bytes.empty()) {

25 base::EscapeJSONString(EscapePath(base::UTF16ToUTF8(name)), true, &result);	600 base::EscapeJSONString(EscapePath(base::UTF16ToUTF8(name)), true, &result);

26 } else {	601 } else {

27 base::EscapeJSONString(EscapePath(raw_bytes), true, &result);	602 base::EscapeJSONString(EscapePath(raw_bytes), true, &result);

28 }	603 }

29

30 if (is_dir) {	604 if (is_dir) {

31 result.append(",1,");	605 result.append(",1,");

32 } else {	606 } else {

33 result.append(",0,");	607 result.append(",0,");

34 }	608 }

35	609

36 // Negative size means unknown or not applicable (e.g. directory).	610 // Negative size means unknown or not applicable (e.g. directory).

37 base::string16 size_string;	611 base::string16 size_string;

38 if (size >= 0)	612 if (size >= 0)

39 size_string = base::FormatBytesUnlocalized(size);	613 size_string = base::FormatBytesUnlocalized(size);

40 base::EscapeJSONString(size_string, true, &result);	614 base::EscapeJSONString(size_string, true, &result);

41	615

42 result.append(",");	616 result.append(",");

43	617

44 base::string16 modified_str;	618 base::string16 modified_str;

45 // \|modified\| can be NULL in FTP listings.	619 // \|modified\| can be NULL in FTP listings.

46 if (!modified.is_null())	620 if (!modified.is_null()) {

47 modified_str = base::TimeFormatShortDateAndTime(modified);	621 modified_str = base::TimeFormatShortDateAndTime(modified);

	622 }

48 base::EscapeJSONString(modified_str, true, &result);	623 base::EscapeJSONString(modified_str, true, &result);

49	624

50 result.append(");</script>\n");	625 result.append(");</script>\n");

51	626

52 return result;	627 return result;

53 }	628 }

54	629

	630 void AppendFormattedHost(const GURL& url,

	631 const std::string& languages,

	632 base::string16* output) {

	633 AppendFormattedComponent(url.possibly_invalid_spec(),

	634 url.parsed_for_possibly_invalid_spec().host,

	635 HostComponentTransform(languages), output, NULL, NULL);

	636 }

	637

	638 base::string16 FormatUrlWithOffsets(

	639 const GURL& url,

	640 const std::string& languages,

	641 FormatUrlTypes format_types,

	642 UnescapeRule::Type unescape_rules,

	643 url::Parsed* new_parsed,

	644 size_t* prefix_end,

	645 std::vector<size_t>* offsets_for_adjustment) {

	646 base::OffsetAdjuster::Adjustments adjustments;

	647 const base::string16& format_url_return_value =

	648 FormatUrlWithAdjustments(url, languages, format_types, unescape_rules,

	649 new_parsed, prefix_end, &adjustments);

	650 base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);

	651 if (offsets_for_adjustment) {

	652 std::for_each(

	653 offsets_for_adjustment->begin(),

	654 offsets_for_adjustment->end(),

	655 base::LimitOffset<std::string>(format_url_return_value.length()));

	656 }

	657 return format_url_return_value;

	658 }

	659

	660 base::string16 FormatUrlWithAdjustments(

	661 const GURL& url,

	662 const std::string& languages,

	663 FormatUrlTypes format_types,

	664 UnescapeRule::Type unescape_rules,

	665 url::Parsed* new_parsed,

	666 size_t* prefix_end,

	667 base::OffsetAdjuster::Adjustments* adjustments) {

	668 DCHECK(adjustments != NULL);

	669 adjustments->clear();

	670 url::Parsed parsed_temp;

	671 if (!new_parsed)

	672 new_parsed = &parsed_temp;

	673 else

	674 *new_parsed = url::Parsed();

	675

	676 // Special handling for view-source:. Don't use content::kViewSourceScheme

	677 // because this library shouldn't depend on chrome.

	678 const char kViewSource[] = "view-source";

	679 // Reject "view-source:view-source:..." to avoid deep recursion.

	680 const char kViewSourceTwice[] = "view-source:view-source:";

	681 if (url.SchemeIs(kViewSource) &&

	682 !base::StartsWith(url.possibly_invalid_spec(), kViewSourceTwice,

	683 base::CompareCase::INSENSITIVE_ASCII)) {

	684 return FormatViewSourceUrl(url, languages, format_types,

	685 unescape_rules, new_parsed, prefix_end,

	686 adjustments);

	687 }

	688

	689 // We handle both valid and invalid URLs (this will give us the spec

	690 // regardless of validity).

	691 const std::string& spec = url.possibly_invalid_spec();

	692 const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();

	693

	694 // Scheme & separators. These are ASCII.

	695 base::string16 url_string;

	696 url_string.insert(

	697 url_string.end(), spec.begin(),

	698 spec.begin() + parsed.CountCharactersBefore(url::Parsed::USERNAME, true));

	699 const char kHTTP[] = "http://";

	700 const char kFTP[] = "ftp.";

	701 // url_fixer::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com. This

	702 // means that if we trim "http://" off a URL whose host starts with "ftp." and

	703 // the user inputs this into any field subject to fixup (which is basically

	704 // all input fields), the meaning would be changed. (In fact, often the

	705 // formatted URL is directly pre-filled into an input field.) For this reason

	706 // we avoid stripping "http://" in this case.

	707 bool omit_http =

	708 (format_types & kFormatUrlOmitHTTP) &&

	709 base::EqualsASCII(url_string, kHTTP) &&

	710 !base::StartsWith(url.host(), kFTP, base::CompareCase::SENSITIVE);

	711 new_parsed->scheme = parsed.scheme;

	712

	713 // Username & password.

	714 if ((format_types & kFormatUrlOmitUsernamePassword) != 0) {

	715 // Remove the username and password fields. We don't want to display those

	716 // to the user since they can be used for attacks,

	717 // e.g. "http://google.com:search@evil.ru/"

	718 new_parsed->username.reset();

	719 new_parsed->password.reset();

	720 // Update the adjustments based on removed username and/or password.

	721 if (parsed.username.is_nonempty() \|\| parsed.password.is_nonempty()) {

	722 if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {

	723 // The seeming off-by-two is to account for the ':' after the username

	724 // and '@' after the password.

	725 adjustments->push_back(base::OffsetAdjuster::Adjustment(

	726 static_cast<size_t>(parsed.username.begin),

	727 static_cast<size_t>(parsed.username.len + parsed.password.len + 2),

	728 0));

	729 } else {

	730 const url::Component* nonempty_component =

	731 parsed.username.is_nonempty() ? &parsed.username : &parsed.password;

	732 // The seeming off-by-one is to account for the '@' after the

	733 // username/password.

	734 adjustments->push_back(base::OffsetAdjuster::Adjustment(

	735 static_cast<size_t>(nonempty_component->begin),

	736 static_cast<size_t>(nonempty_component->len + 1),

	737 0));

	738 }

	739 }

	740 } else {

	741 AppendFormattedComponent(spec, parsed.username,

	742 NonHostComponentTransform(unescape_rules),

	743 &url_string, &new_parsed->username, adjustments);

	744 if (parsed.password.is_valid())

	745 url_string.push_back(':');

	746 AppendFormattedComponent(spec, parsed.password,

	747 NonHostComponentTransform(unescape_rules),

	748 &url_string, &new_parsed->password, adjustments);

	749 if (parsed.username.is_valid() \|\| parsed.password.is_valid())

	750 url_string.push_back('@');

	751 }

	752 if (prefix_end)

	753 *prefix_end = static_cast<size_t>(url_string.length());

	754

	755 // Host.

	756 AppendFormattedComponent(spec, parsed.host, HostComponentTransform(languages),

	757 &url_string, &new_parsed->host, adjustments);

	758

	759 // Port.

	760 if (parsed.port.is_nonempty()) {

	761 url_string.push_back(':');

	762 new_parsed->port.begin = url_string.length();

	763 url_string.insert(url_string.end(),

	764 spec.begin() + parsed.port.begin,

	765 spec.begin() + parsed.port.end());

	766 new_parsed->port.len = url_string.length() - new_parsed->port.begin;

	767 } else {

	768 new_parsed->port.reset();

	769 }

	770

	771 // Path & query. Both get the same general unescape & convert treatment.

	772 if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) \|\|

	773 !CanStripTrailingSlash(url)) {

	774 AppendFormattedComponent(spec, parsed.path,

	775 NonHostComponentTransform(unescape_rules),

	776 &url_string, &new_parsed->path, adjustments);

	777 } else {

	778 if (parsed.path.len > 0) {

	779 adjustments->push_back(base::OffsetAdjuster::Adjustment(

	780 parsed.path.begin, parsed.path.len, 0));

	781 }

	782 }

	783 if (parsed.query.is_valid())

	784 url_string.push_back('?');

	785 AppendFormattedComponent(spec, parsed.query,

	786 NonHostComponentTransform(unescape_rules),

	787 &url_string, &new_parsed->query, adjustments);

	788

	789 // Ref. This is valid, unescaped UTF-8, so we can just convert.

	790 if (parsed.ref.is_valid())

	791 url_string.push_back('#');

	792 AppendFormattedComponent(spec, parsed.ref,

	793 NonHostComponentTransform(UnescapeRule::NONE),

	794 &url_string, &new_parsed->ref, adjustments);

	795

	796 // If we need to strip out http do it after the fact.

	797 if (omit_http &&

	798 base::StartsWith(url_string, base::ASCIIToUTF16(kHTTP),

	799 base::CompareCase::SENSITIVE)) {

	800 const size_t kHTTPSize = arraysize(kHTTP) - 1;

	801 url_string = url_string.substr(kHTTPSize);

	802 // Because offsets in the \|adjustments\| are already calculated with respect

	803 // to the string with the http:// prefix in it, those offsets remain correct

	804 // after stripping the prefix. The only thing necessary is to add an

	805 // adjustment to reflect the stripped prefix.

	806 adjustments->insert(adjustments->begin(),

	807 base::OffsetAdjuster::Adjustment(0, kHTTPSize, 0));

	808

	809 if (prefix_end)

	810 *prefix_end -= kHTTPSize;

	811

	812 // Adjust new_parsed.

	813 DCHECK(new_parsed->scheme.is_valid());

	814 int delta = -(new_parsed->scheme.len + 3); // +3 for ://.

	815 new_parsed->scheme.reset();

	816 AdjustAllComponentsButScheme(delta, new_parsed);

	817 }

	818

	819 return url_string;

	820 }

	821

	822 base::string16 FormatUrl(const GURL& url,

	823 const std::string& languages,

	824 FormatUrlTypes format_types,

	825 UnescapeRule::Type unescape_rules,

	826 url::Parsed* new_parsed,

	827 size_t* prefix_end,

	828 size_t* offset_for_adjustment) {

	829 Offsets offsets;

	830 if (offset_for_adjustment)

	831 offsets.push_back(*offset_for_adjustment);

	832 base::string16 result = FormatUrlWithOffsets(url, languages, format_types,

	833 unescape_rules, new_parsed, prefix_end, &offsets);

	834 if (offset_for_adjustment)

	835 *offset_for_adjustment = offsets[0];

	836 return result;

	837 }

	838

55 } // namespace net	839 } // namespace net

OLD	NEW

« no previous file with comments | « net/base/net_util.cc ('k') | net/base/net_util_icu_unittest.cc » ('j') | no next file with comments »