| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2011 Google Inc. All rights reserved. | 2 * Copyright (C) 2011 Google Inc. All rights reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions are | 5 * modification, are permitted provided that the following conditions are |
| 6 * met: | 6 * met: |
| 7 * | 7 * |
| 8 * * Redistributions of source code must retain the above copyright | 8 * * Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
| 10 * * Redistributions in binary form must reproduce the above | 10 * * Redistributions in binary form must reproduce the above |
| (...skipping 18 matching lines...) Expand all Loading... |
| 29 */ | 29 */ |
| 30 | 30 |
| 31 #include "platform/text/LocaleToScriptMapping.h" | 31 #include "platform/text/LocaleToScriptMapping.h" |
| 32 | 32 |
| 33 #include "wtf/HashMap.h" | 33 #include "wtf/HashMap.h" |
| 34 #include "wtf/HashSet.h" | 34 #include "wtf/HashSet.h" |
| 35 #include "wtf/text/StringHash.h" | 35 #include "wtf/text/StringHash.h" |
| 36 | 36 |
| 37 namespace blink { | 37 namespace blink { |
| 38 | 38 |
| 39 struct SubtagScript { |
| 40 const char* subtag; |
| 41 UScriptCode script; |
| 42 }; |
| 43 |
| 44 using SubtagScriptMap = HashMap<String, UScriptCode, CaseFoldingHash>; |
| 45 |
| 46 static SubtagScriptMap createSubtagScriptMap(const SubtagScript list[], size_t size) |
| 47 { |
| 48 SubtagScriptMap map; |
| 49 for (size_t i = 0; i < size; ++i) |
| 50 map.set(list[i].subtag, list[i].script); |
| 51 return map; |
| 52 } |
| 53 |
| 39 UScriptCode scriptNameToCode(const String& scriptName) | 54 UScriptCode scriptNameToCode(const String& scriptName) |
| 40 { | 55 { |
| 41 struct ScriptNameCode { | |
| 42 const char* name; | |
| 43 UScriptCode code; | |
| 44 }; | |
| 45 | |
| 46 // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are | 56 // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are |
| 47 // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to | 57 // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to |
| 48 // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered | 58 // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered |
| 49 // using the same font setting. | 59 // using the same font setting. |
| 50 static const ScriptNameCode scriptNameCodeList[] = { | 60 static const SubtagScript scriptNameCodeList[] = { |
| 51 { "zyyy", USCRIPT_COMMON }, | 61 { "zyyy", USCRIPT_COMMON }, |
| 52 { "qaai", USCRIPT_INHERITED }, | 62 { "qaai", USCRIPT_INHERITED }, |
| 53 { "arab", USCRIPT_ARABIC }, | 63 { "arab", USCRIPT_ARABIC }, |
| 54 { "armn", USCRIPT_ARMENIAN }, | 64 { "armn", USCRIPT_ARMENIAN }, |
| 55 { "beng", USCRIPT_BENGALI }, | 65 { "beng", USCRIPT_BENGALI }, |
| 56 { "bopo", USCRIPT_BOPOMOFO }, | 66 { "bopo", USCRIPT_BOPOMOFO }, |
| 57 { "cher", USCRIPT_CHEROKEE }, | 67 { "cher", USCRIPT_CHEROKEE }, |
| 58 { "copt", USCRIPT_COPTIC }, | 68 { "copt", USCRIPT_COPTIC }, |
| 59 { "cyrl", USCRIPT_CYRILLIC }, | 69 { "cyrl", USCRIPT_CYRILLIC }, |
| 60 { "dsrt", USCRIPT_DESERET }, | 70 { "dsrt", USCRIPT_DESERET }, |
| (...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 148 { "syrn", USCRIPT_EASTERN_SYRIAC }, | 158 { "syrn", USCRIPT_EASTERN_SYRIAC }, |
| 149 { "teng", USCRIPT_TENGWAR }, | 159 { "teng", USCRIPT_TENGWAR }, |
| 150 { "vaii", USCRIPT_VAI }, | 160 { "vaii", USCRIPT_VAI }, |
| 151 { "visp", USCRIPT_VISIBLE_SPEECH }, | 161 { "visp", USCRIPT_VISIBLE_SPEECH }, |
| 152 { "xsux", USCRIPT_CUNEIFORM }, | 162 { "xsux", USCRIPT_CUNEIFORM }, |
| 153 { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA }, | 163 { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA }, |
| 154 { "kore", USCRIPT_HANGUL }, | 164 { "kore", USCRIPT_HANGUL }, |
| 155 { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES }, | 165 { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES }, |
| 156 { "zzzz", USCRIPT_UNKNOWN } | 166 { "zzzz", USCRIPT_UNKNOWN } |
| 157 }; | 167 }; |
| 168 DEFINE_STATIC_LOCAL(SubtagScriptMap, scriptNameCodeMap, |
| 169 (createSubtagScriptMap(scriptNameCodeList, WTF_ARRAY_LENGTH(scriptNameCodeList)))); |
| 158 | 170 |
| 159 typedef HashMap<String, UScriptCode> ScriptNameCodeMap; | 171 const auto& it = scriptNameCodeMap.find(scriptName); |
| 160 DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ()); | |
| 161 if (scriptNameCodeMap.isEmpty()) { | |
| 162 for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(scriptNameCodeList[0]); ++i) | |
| 163 scriptNameCodeMap.set(scriptNameCodeList[i].name, scriptNameCodeList[i].code); | |
| 164 } | |
| 165 | |
| 166 HashMap<String, UScriptCode>::iterator it = scriptNameCodeMap.find(scriptName.lower()); | |
| 167 if (it != scriptNameCodeMap.end()) | 172 if (it != scriptNameCodeMap.end()) |
| 168 return it->value; | 173 return it->value; |
| 169 return USCRIPT_INVALID_CODE; | 174 return USCRIPT_INVALID_CODE; |
| 170 } | 175 } |
| 171 | 176 |
| 172 UScriptCode localeToScriptCodeForFontSelection(const String& locale) | 177 UScriptCode localeToScriptCodeForFontSelection(const String& locale) |
| 173 { | 178 { |
| 174 struct LocaleScript { | 179 static const SubtagScript localeScriptList[] = { |
| 175 const char* locale; | |
| 176 UScriptCode script; | |
| 177 }; | |
| 178 | |
| 179 static const LocaleScript localeScriptList[] = { | |
| 180 { "aa", USCRIPT_LATIN }, | 180 { "aa", USCRIPT_LATIN }, |
| 181 { "ab", USCRIPT_CYRILLIC }, | 181 { "ab", USCRIPT_CYRILLIC }, |
| 182 { "ady", USCRIPT_CYRILLIC }, | 182 { "ady", USCRIPT_CYRILLIC }, |
| 183 { "aeb", USCRIPT_ARABIC }, | 183 { "aeb", USCRIPT_ARABIC }, |
| 184 { "af", USCRIPT_LATIN }, | 184 { "af", USCRIPT_LATIN }, |
| 185 { "ak", USCRIPT_LATIN }, | 185 { "ak", USCRIPT_LATIN }, |
| 186 { "am", USCRIPT_ETHIOPIC }, | 186 { "am", USCRIPT_ETHIOPIC }, |
| 187 { "ar", USCRIPT_ARABIC }, | 187 { "ar", USCRIPT_ARABIC }, |
| 188 { "arq", USCRIPT_ARABIC }, | 188 { "arq", USCRIPT_ARABIC }, |
| 189 { "ary", USCRIPT_ARABIC }, | 189 { "ary", USCRIPT_ARABIC }, |
| (...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 408 { "vi", USCRIPT_LATIN }, | 408 { "vi", USCRIPT_LATIN }, |
| 409 { "wal", USCRIPT_ETHIOPIC }, | 409 { "wal", USCRIPT_ETHIOPIC }, |
| 410 { "war", USCRIPT_LATIN }, | 410 { "war", USCRIPT_LATIN }, |
| 411 { "wo", USCRIPT_LATIN }, | 411 { "wo", USCRIPT_LATIN }, |
| 412 { "xh", USCRIPT_LATIN }, | 412 { "xh", USCRIPT_LATIN }, |
| 413 { "yap", USCRIPT_LATIN }, | 413 { "yap", USCRIPT_LATIN }, |
| 414 { "yo", USCRIPT_LATIN }, | 414 { "yo", USCRIPT_LATIN }, |
| 415 { "za", USCRIPT_LATIN }, | 415 { "za", USCRIPT_LATIN }, |
| 416 { "zdj", USCRIPT_ARABIC }, | 416 { "zdj", USCRIPT_ARABIC }, |
| 417 { "zh", USCRIPT_SIMPLIFIED_HAN }, | 417 { "zh", USCRIPT_SIMPLIFIED_HAN }, |
| 418 { "zh_hk", USCRIPT_TRADITIONAL_HAN }, | 418 { "zu", USCRIPT_LATIN }, |
| 419 { "zh_tw", USCRIPT_TRADITIONAL_HAN }, | 419 // Encompassed languages within the Chinese macrolanguage. |
| 420 { "zu", USCRIPT_LATIN } | 420 // http://www-01.sil.org/iso639-3/documentation.asp?id=zho |
| 421 // http://lists.w3.org/Archives/Public/public-i18n-cjk/2016JulSep/0022.html |
| 422 { "cdo", USCRIPT_SIMPLIFIED_HAN }, |
| 423 { "cjy", USCRIPT_SIMPLIFIED_HAN }, |
| 424 { "cmn", USCRIPT_SIMPLIFIED_HAN }, |
| 425 { "cpx", USCRIPT_SIMPLIFIED_HAN }, |
| 426 { "czh", USCRIPT_SIMPLIFIED_HAN }, |
| 427 { "czo", USCRIPT_SIMPLIFIED_HAN }, |
| 428 { "gan", USCRIPT_SIMPLIFIED_HAN }, |
| 429 { "hsn", USCRIPT_SIMPLIFIED_HAN }, |
| 430 { "mnp", USCRIPT_SIMPLIFIED_HAN }, |
| 431 { "wuu", USCRIPT_SIMPLIFIED_HAN }, |
| 432 { "hak", USCRIPT_TRADITIONAL_HAN }, |
| 433 { "lzh", USCRIPT_TRADITIONAL_HAN }, |
| 434 { "nan", USCRIPT_TRADITIONAL_HAN }, |
| 435 { "yue", USCRIPT_TRADITIONAL_HAN }, |
| 436 { "zh-cdo", USCRIPT_SIMPLIFIED_HAN }, |
| 437 { "zh-cjy", USCRIPT_SIMPLIFIED_HAN }, |
| 438 { "zh-cmn", USCRIPT_SIMPLIFIED_HAN }, |
| 439 { "zh-cpx", USCRIPT_SIMPLIFIED_HAN }, |
| 440 { "zh-czh", USCRIPT_SIMPLIFIED_HAN }, |
| 441 { "zh-czo", USCRIPT_SIMPLIFIED_HAN }, |
| 442 { "zh-gan", USCRIPT_SIMPLIFIED_HAN }, |
| 443 { "zh-hsn", USCRIPT_SIMPLIFIED_HAN }, |
| 444 { "zh-mnp", USCRIPT_SIMPLIFIED_HAN }, |
| 445 { "zh-wuu", USCRIPT_SIMPLIFIED_HAN }, |
| 446 { "zh-hak", USCRIPT_TRADITIONAL_HAN }, |
| 447 { "zh-lzh", USCRIPT_TRADITIONAL_HAN }, |
| 448 { "zh-nan", USCRIPT_TRADITIONAL_HAN }, |
| 449 { "zh-yue", USCRIPT_TRADITIONAL_HAN }, |
| 450 // Chinese with regions. Logically, regions should be handled |
| 451 // separately, but this works for the current purposes. |
| 452 { "zh-hk", USCRIPT_TRADITIONAL_HAN }, |
| 453 { "zh-mo", USCRIPT_TRADITIONAL_HAN }, |
| 454 { "zh-tw", USCRIPT_TRADITIONAL_HAN }, |
| 421 }; | 455 }; |
| 456 DEFINE_STATIC_LOCAL(SubtagScriptMap, localeScriptMap, |
| 457 (createSubtagScriptMap(localeScriptList, WTF_ARRAY_LENGTH(localeScriptList)))); |
| 422 | 458 |
| 423 typedef HashMap<String, UScriptCode> LocaleScriptMap; | 459 // BCP 47 uses '-' as the delimiter but ICU uses '_'. |
| 424 DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ()); | 460 // https://tools.ietf.org/html/bcp47 |
| 425 if (localeScriptMap.isEmpty()) { | 461 String canonicalLocale = locale; |
| 426 for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(localeScriptList[0]); ++i) | 462 canonicalLocale.replace('_', '-'); |
| 427 localeScriptMap.set(localeScriptList[i].locale, localeScriptList[i].script); | |
| 428 } | |
| 429 | |
| 430 String canonicalLocale = locale.lower().replace('-', '_'); | |
| 431 while (!canonicalLocale.isEmpty()) { | 463 while (!canonicalLocale.isEmpty()) { |
| 432 HashMap<String, UScriptCode>::iterator it = localeScriptMap.find(canonicalLocale); | 464 const auto& it = localeScriptMap.find(canonicalLocale); |
| 433 if (it != localeScriptMap.end()) | 465 if (it != localeScriptMap.end()) |
| 434 return it->value; | 466 return it->value; |
| 435 size_t pos = canonicalLocale.reverseFind('_'); | 467 size_t pos = canonicalLocale.reverseFind('-'); |
| 436 if (pos == kNotFound) | 468 if (pos == kNotFound) |
| 437 break; | 469 break; |
| 438 UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1)); | 470 // script = 4ALPHA |
| 439 if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) | 471 if (canonicalLocale.length() - (pos + 1) == 4) { |
| 440 return code; | 472 UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1)); |
| 473 if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) |
| 474 return code; |
| 475 } |
| 441 canonicalLocale = canonicalLocale.substring(0, pos); | 476 canonicalLocale = canonicalLocale.substring(0, pos); |
| 442 } | 477 } |
| 443 return USCRIPT_COMMON; | 478 return USCRIPT_COMMON; |
| 444 } | 479 } |
| 445 | 480 |
| 446 static bool isUnambiguousHanScript(UScriptCode script) | 481 static UScriptCode scriptCodeForHanFromRegion(const String& region) |
| 447 { | 482 { |
| 448 // localeToScriptCodeForFontSelection() does not return these values. | 483 static const SubtagScript regionScriptList[] = { |
| 449 ASSERT(script != USCRIPT_HIRAGANA && script != USCRIPT_KATAKANA); | |
| 450 return script == USCRIPT_KATAKANA_OR_HIRAGANA | |
| 451 || script == USCRIPT_SIMPLIFIED_HAN | |
| 452 || script == USCRIPT_TRADITIONAL_HAN | |
| 453 || script == USCRIPT_HANGUL; | |
| 454 } | |
| 455 | |
| 456 static UScriptCode scriptCodeForHanFromSubtag(const String& subtag) | |
| 457 { | |
| 458 struct SubtagScript { | |
| 459 const char* subtag; | |
| 460 UScriptCode script; | |
| 461 }; | |
| 462 | |
| 463 static const SubtagScript subtagScriptList[] = { | |
| 464 { "cn", USCRIPT_SIMPLIFIED_HAN }, | |
| 465 { "hans", USCRIPT_SIMPLIFIED_HAN }, | |
| 466 { "hant", USCRIPT_TRADITIONAL_HAN }, | |
| 467 { "hk", USCRIPT_TRADITIONAL_HAN }, | 484 { "hk", USCRIPT_TRADITIONAL_HAN }, |
| 468 { "jp", USCRIPT_KATAKANA_OR_HIRAGANA }, | 485 { "jp", USCRIPT_KATAKANA_OR_HIRAGANA }, |
| 469 { "kr", USCRIPT_HANGUL }, | 486 { "kr", USCRIPT_HANGUL }, |
| 487 { "mo", USCRIPT_TRADITIONAL_HAN }, |
| 470 { "tw", USCRIPT_TRADITIONAL_HAN }, | 488 { "tw", USCRIPT_TRADITIONAL_HAN }, |
| 471 }; | 489 }; |
| 490 DEFINE_STATIC_LOCAL(SubtagScriptMap, regionScriptMap, |
| 491 (createSubtagScriptMap(regionScriptList, WTF_ARRAY_LENGTH(regionScriptList)))); |
| 472 | 492 |
| 473 typedef HashMap<String, UScriptCode> SubtagScriptMap; | 493 const auto& it = regionScriptMap.find(region); |
| 474 DEFINE_STATIC_LOCAL(SubtagScriptMap, subtagScriptMap, ()); | 494 return it != regionScriptMap.end() ? it->value : USCRIPT_COMMON; |
| 475 if (subtagScriptMap.isEmpty()) { | |
| 476 for (size_t i = 0; i < WTF_ARRAY_LENGTH(subtagScriptList); ++i) | |
| 477 subtagScriptMap.set(subtagScriptList[i].subtag, subtagScriptList[i].script); | |
| 478 } | |
| 479 | |
| 480 const auto& it = subtagScriptMap.find(subtag.lower()); | |
| 481 return it != subtagScriptMap.end() ? it->value : USCRIPT_COMMON; | |
| 482 } | 495 } |
| 483 | 496 |
| 484 static UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimiter) | 497 UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimiter) |
| 485 { | 498 { |
| 486 // Some sites emit lang="en-JP" when English is set as the preferred | 499 // Some sites emit lang="en-JP" when English is set as the preferred |
| 487 // language. Use script/region subtags of the content locale to pick the | 500 // language. Use script/region subtags of the content locale to pick the |
| 488 // fallback font for unified Han ideographs. | 501 // fallback font for unified Han ideographs. |
| 489 for (size_t end = locale.find(delimiter); end != kNotFound; ) { | 502 for (size_t end = locale.find(delimiter); end != kNotFound; ) { |
| 490 size_t begin = end + 1; | 503 size_t begin = end + 1; |
| 491 end = locale.find(delimiter, begin); | 504 end = locale.find(delimiter, begin); |
| 492 UScriptCode script = scriptCodeForHanFromSubtag( | 505 size_t len = (end == kNotFound ? locale.length() : end) - begin; |
| 493 locale.substring(begin, | 506 UScriptCode script; |
| 494 end == kNotFound ? UINT_MAX : end - begin)); | 507 switch (len) { |
| 495 if (script != USCRIPT_COMMON) | 508 case 2: // region = 2ALPHA / 3DIGIT |
| 496 return script; | 509 script = scriptCodeForHanFromRegion(locale.substring(begin, len)); |
| 510 if (script != USCRIPT_COMMON) |
| 511 return script; |
| 512 break; |
| 513 case 4: // script = 4ALPHA |
| 514 script = scriptNameToCode(locale.substring(begin, len)); |
| 515 if (script != USCRIPT_INVALID_CODE) |
| 516 return script; |
| 517 } |
| 497 } | 518 } |
| 498 | 519 |
| 499 return USCRIPT_COMMON; | 520 return USCRIPT_COMMON; |
| 500 } | 521 } |
| 501 | 522 |
| 502 UScriptCode scriptCodeForHanFromLocale(UScriptCode script, const String& locale, char delimiter) | |
| 503 { | |
| 504 if (isUnambiguousHanScript(script)) | |
| 505 return script; | |
| 506 | |
| 507 // Identify the script for Han if the UScriptCode is ambiguous. | |
| 508 // Check subtags only, because the UScriptCode covers the language part. | |
| 509 return scriptCodeForHanFromSubtags(locale, delimiter); | |
| 510 } | |
| 511 | |
| 512 UScriptCode scriptCodeForHanFromLocale(const String& locale, char delimiter) | |
| 513 { | |
| 514 UScriptCode script = localeToScriptCodeForFontSelection(locale); | |
| 515 return scriptCodeForHanFromLocale(script, locale, delimiter); | |
| 516 } | |
| 517 | |
| 518 } // namespace blink | 523 } // namespace blink |
| OLD | NEW |