| OLD | NEW |
| 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include <algorithm> | 5 #include <algorithm> |
| 6 #include <map> | 6 #include <map> |
| 7 #include <unicode/ucnv.h> | 7 #include <unicode/ucnv.h> |
| 8 #include <unicode/uidna.h> | 8 #include <unicode/uidna.h> |
| 9 #include <unicode/ulocdata.h> | 9 #include <unicode/ulocdata.h> |
| 10 #include <unicode/uniset.h> | 10 #include <unicode/uniset.h> |
| (...skipping 476 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 487 // Latin letters in the ASCII range. | 487 // Latin letters in the ASCII range. |
bool IsCompatibleWithASCIILetters(const std::string& lang) {
  // For now, just list Chinese, Japanese and Korean (positive list).
  // An alternative is negative-listing (languages using Greek and
  // Cyrillic letters), but it can be more dangerous.
  static const char* const kCompatibleLanguages[] = { "zh", "ja", "ko" };

  // Only the primary language subtag is examined, and it has to be exactly
  // two letters long. A bare two-character prefix comparison would also
  // accept unrelated three-letter codes that merely start with the same
  // letters (e.g. "kok" or "zha"), so reject anything whose third character
  // is still an ASCII letter. Separators such as '-' or '_' (as in "zh-CN"
  // or "zh_TW") end the primary subtag and are accepted.
  if (lang.size() < 2)
    return false;
  if (lang.size() > 2) {
    const char next = lang[2];
    if ((next >= 'a' && next <= 'z') || (next >= 'A' && next <= 'Z'))
      return false;
  }

  // The comparison is case-sensitive and done in place (no temporaries).
  const size_t kNumLanguages =
      sizeof(kCompatibleLanguages) / sizeof(kCompatibleLanguages[0]);
  for (size_t i = 0; i < kNumLanguages; ++i) {
    if (lang.compare(0, 2, kCompatibleLanguages[i]) == 0)
      return true;
  }
  return false;
}
| 496 | 496 |
| 497 typedef std::map<std::string, UnicodeSet*> LangToExemplarSetMap; | 497 typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap; |
| 498 | 498 |
| 499 class LangToExemplarSet { | 499 class LangToExemplarSet { |
| 500 private: | 500 private: |
| 501 LangToExemplarSetMap map; | 501 LangToExemplarSetMap map; |
| 502 LangToExemplarSet() { } | 502 LangToExemplarSet() { } |
| 503 ~LangToExemplarSet() { | 503 ~LangToExemplarSet() { |
| 504 STLDeleteContainerPairSecondPointers(map.begin(), map.end()); | 504 STLDeleteContainerPairSecondPointers(map.begin(), map.end()); |
| 505 } | 505 } |
| 506 | 506 |
| 507 friend class Singleton<LangToExemplarSet>; | 507 friend class Singleton<LangToExemplarSet>; |
| 508 friend struct DefaultSingletonTraits<LangToExemplarSet>; | 508 friend struct DefaultSingletonTraits<LangToExemplarSet>; |
| 509 friend bool GetExemplarSetForLang(const std::string&, UnicodeSet**); | 509 friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**); |
| 510 friend void SetExemplarSetForLang(const std::string&, UnicodeSet*); | 510 friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*); |
| 511 | 511 |
| 512 DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet); | 512 DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet); |
| 513 }; | 513 }; |
| 514 | 514 |
| 515 bool GetExemplarSetForLang(const std::string& lang, UnicodeSet** lang_set) { | 515 bool GetExemplarSetForLang(const std::string& lang, |
| 516 icu::UnicodeSet** lang_set) { |
| 516 const LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map; | 517 const LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map; |
| 517 LangToExemplarSetMap::const_iterator pos = map.find(lang); | 518 LangToExemplarSetMap::const_iterator pos = map.find(lang); |
| 518 if (pos != map.end()) { | 519 if (pos != map.end()) { |
| 519 *lang_set = pos->second; | 520 *lang_set = pos->second; |
| 520 return true; | 521 return true; |
| 521 } | 522 } |
| 522 return false; | 523 return false; |
| 523 } | 524 } |
| 524 | 525 |
| 525 void SetExemplarSetForLang(const std::string& lang, UnicodeSet* lang_set) { | 526 void SetExemplarSetForLang(const std::string& lang, |
| 527 icu::UnicodeSet* lang_set) { |
| 526 LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map; | 528 LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map; |
| 527 map.insert(std::make_pair(lang, lang_set)); | 529 map.insert(std::make_pair(lang, lang_set)); |
| 528 } | 530 } |
| 529 | 531 |
| 530 static Lock lang_set_lock; | 532 static Lock lang_set_lock; |
| 531 | 533 |
| 532 // Returns true if all the characters in component_characters are used by | 534 // Returns true if all the characters in component_characters are used by |
| 533 // the language |lang|. | 535 // the language |lang|. |
| 534 bool IsComponentCoveredByLang(const UnicodeSet& component_characters, | 536 bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters, |
| 535 const std::string& lang) { | 537 const std::string& lang) { |
| 536 static const UnicodeSet kASCIILetters(0x61, 0x7a); // [a-z] | 538 static const icu::UnicodeSet kASCIILetters(0x61, 0x7a); // [a-z] |
| 537 UnicodeSet* lang_set; | 539 icu::UnicodeSet* lang_set; |
| 538 // We're called from both the UI thread and the history thread. | 540 // We're called from both the UI thread and the history thread. |
| 539 { | 541 { |
| 540 AutoLock lock(lang_set_lock); | 542 AutoLock lock(lang_set_lock); |
| 541 if (!GetExemplarSetForLang(lang, &lang_set)) { | 543 if (!GetExemplarSetForLang(lang, &lang_set)) { |
| 542 UErrorCode status = U_ZERO_ERROR; | 544 UErrorCode status = U_ZERO_ERROR; |
| 543 ULocaleData* uld = ulocdata_open(lang.c_str(), &status); | 545 ULocaleData* uld = ulocdata_open(lang.c_str(), &status); |
| 544 // TODO(jungshik) Turn this check on when the ICU data file is | 546 // TODO(jungshik) Turn this check on when the ICU data file is |
| 545 // rebuilt with the minimal subset of locale data for languages | 547 // rebuilt with the minimal subset of locale data for languages |
| 546 // to which Chrome is not localized but which we offer in the list | 548 // to which Chrome is not localized but which we offer in the list |
| 547 // of languages selectable for Accept-Languages. With the rebuilt ICU | 549 // of languages selectable for Accept-Languages. With the rebuilt ICU |
| 548 // data, ulocdata_open never should fall back to the default locale. | 550 // data, ulocdata_open never should fall back to the default locale. |
| 549 // (issue 2078) | 551 // (issue 2078) |
| 550 // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING); | 552 // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING); |
| 551 if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) { | 553 if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) { |
| 552 lang_set = reinterpret_cast<UnicodeSet *>( | 554 lang_set = reinterpret_cast<icu::UnicodeSet *>( |
| 553 ulocdata_getExemplarSet(uld, NULL, 0, | 555 ulocdata_getExemplarSet(uld, NULL, 0, |
| 554 ULOCDATA_ES_STANDARD, &status)); | 556 ULOCDATA_ES_STANDARD, &status)); |
| 555 // If |lang| is compatible with ASCII Latin letters, add them. | 557 // If |lang| is compatible with ASCII Latin letters, add them. |
| 556 if (IsCompatibleWithASCIILetters(lang)) | 558 if (IsCompatibleWithASCIILetters(lang)) |
| 557 lang_set->addAll(kASCIILetters); | 559 lang_set->addAll(kASCIILetters); |
| 558 } else { | 560 } else { |
| 559 lang_set = new UnicodeSet(1, 0); | 561 lang_set = new icu::UnicodeSet(1, 0); |
| 560 } | 562 } |
| 561 lang_set->freeze(); | 563 lang_set->freeze(); |
| 562 SetExemplarSetForLang(lang, lang_set); | 564 SetExemplarSetForLang(lang, lang_set); |
| 563 ulocdata_close(uld); | 565 ulocdata_close(uld); |
| 564 } | 566 } |
| 565 } | 567 } |
| 566 return !lang_set->isEmpty() && lang_set->containsAll(component_characters); | 568 return !lang_set->isEmpty() && lang_set->containsAll(component_characters); |
| 567 } | 569 } |
| 568 | 570 |
| 569 // Returns true if the given Unicode host component is safe to display to the | 571 // Returns true if the given Unicode host component is safe to display to the |
| (...skipping 21 matching lines...) Expand all Loading... |
| 591 L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338" | 593 L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338" |
| 592 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]" | 594 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]" |
| 593 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]" | 595 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]" |
| 594 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae" | 596 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae" |
| 595 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014" | 597 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014" |
| 596 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14" | 598 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14" |
| 597 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]" | 599 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]" |
| 598 L"[\ufffa-\ufffd]]"), status); | 600 L"[\ufffa-\ufffd]]"), status); |
| 599 #else | 601 #else |
| 600 UnicodeSet dangerous_characters(UnicodeString( | 602 UnicodeSet dangerous_characters(UnicodeString( |
| 601 "[[\\ \\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338" | 603 "[[\\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338" |
| 602 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]" | 604 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]" |
| 603 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]" | 605 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]" |
| 604 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae" | 606 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae" |
| 605 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014" | 607 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014" |
| 606 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14" | 608 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14" |
| 607 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]" | 609 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]" |
| 608 "[\\ufffa-\\ufffd]]", -1, US_INV), status); | 610 "[\\ufffa-\\ufffd]]", -1, US_INV), status); |
| 609 #endif | 611 #endif |
| 610 DCHECK(U_SUCCESS(status)); | 612 DCHECK(U_SUCCESS(status)); |
| 611 UnicodeSet component_characters; | 613 icu::UnicodeSet component_characters; |
| 612 component_characters.addAll(UnicodeString(str, str_len)); | 614 component_characters.addAll(icu::UnicodeString(str, str_len)); |
| 613 if (dangerous_characters.containsSome(component_characters)) | 615 if (dangerous_characters.containsSome(component_characters)) |
| 614 return false; | 616 return false; |
| 615 | 617 |
| 616 // If the language list is empty, the result is completely determined | 618 // If the language list is empty, the result is completely determined |
| 617 // by whether a component is a single script or not. This will block | 619 // by whether a component is a single script or not. This will block |
| 618 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are | 620 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are |
| 619 // allowed with |languages| (while it blocks Chinese + Latin letters with | 621 // allowed with |languages| (while it blocks Chinese + Latin letters with |
| 620 // an accent as should be the case), but we want to err on the safe side | 622 // an accent as should be the case), but we want to err on the safe side |
| 621 // when |languages| is empty. | 623 // when |languages| is empty. |
| 622 if (languages.empty()) | 624 if (languages.empty()) |
| 623 return IsIDNComponentInSingleScript(str, str_len); | 625 return IsIDNComponentInSingleScript(str, str_len); |
| 624 | 626 |
| 625 // |common_characters| is made up of ASCII numbers, hyphen, plus and | 627 // |common_characters| is made up of ASCII numbers, hyphen, plus and |
| 626 // underscore that are used across scripts and allowed in domain names. | 628 // underscore that are used across scripts and allowed in domain names. |
| 627 // (sync'd with characters allowed in url_canon_host with square | 629 // (sync'd with characters allowed in url_canon_host with square |
| 628 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. | 630 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. |
| 629 UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), | 631 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), |
| 630 status); | 632 status); |
| 631 DCHECK(U_SUCCESS(status)); | 633 DCHECK(U_SUCCESS(status)); |
| 632 // Subtract common characters because they're always allowed so that | 634 // Subtract common characters because they're always allowed so that |
| 633 // we just have to check if a language-specific set contains | 635 // we just have to check if a language-specific set contains |
| 634 // the remainder. | 636 // the remainder. |
| 635 component_characters.removeAll(common_characters); | 637 component_characters.removeAll(common_characters); |
| 636 | 638 |
| 637 std::string languages_list(WideToASCII(languages)); | 639 std::string languages_list(WideToASCII(languages)); |
| 638 StringTokenizer t(languages_list, ","); | 640 StringTokenizer t(languages_list, ","); |
| 639 while (t.GetNext()) { | 641 while (t.GetNext()) { |
| 640 if (IsComponentCoveredByLang(component_characters, t.token())) | 642 if (IsComponentCoveredByLang(component_characters, t.token())) |
| (...skipping 659 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1300 GURL SimplifyUrlForRequest(const GURL& url) { | 1302 GURL SimplifyUrlForRequest(const GURL& url) { |
| 1301 DCHECK(url.is_valid()); | 1303 DCHECK(url.is_valid()); |
| 1302 GURL::Replacements replacements; | 1304 GURL::Replacements replacements; |
| 1303 replacements.ClearUsername(); | 1305 replacements.ClearUsername(); |
| 1304 replacements.ClearPassword(); | 1306 replacements.ClearPassword(); |
| 1305 replacements.ClearRef(); | 1307 replacements.ClearRef(); |
| 1306 return url.ReplaceComponents(replacements); | 1308 return url.ReplaceComponents(replacements); |
| 1307 } | 1309 } |
| 1308 | 1310 |
| 1309 } // namespace net | 1311 } // namespace net |
| OLD | NEW |