OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2011 Google Inc. All rights reserved. | 2 * Copyright (C) 2011 Google Inc. All rights reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions are | 5 * modification, are permitted provided that the following conditions are |
6 * met: | 6 * met: |
7 * | 7 * |
8 * * Redistributions of source code must retain the above copyright | 8 * * Redistributions of source code must retain the above copyright |
9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
10 * * Redistributions in binary form must reproduce the above | 10 * * Redistributions in binary form must reproduce the above |
(...skipping 18 matching lines...) Expand all Loading... |
29 */ | 29 */ |
30 | 30 |
31 #include "platform/text/LocaleToScriptMapping.h" | 31 #include "platform/text/LocaleToScriptMapping.h" |
32 | 32 |
33 #include "wtf/HashMap.h" | 33 #include "wtf/HashMap.h" |
34 #include "wtf/HashSet.h" | 34 #include "wtf/HashSet.h" |
35 #include "wtf/text/StringHash.h" | 35 #include "wtf/text/StringHash.h" |
36 | 36 |
37 namespace blink { | 37 namespace blink { |
38 | 38 |
| 39 struct SubtagScript { |
| 40 const char* subtag; |
| 41 UScriptCode script; |
| 42 }; |
| 43 |
| 44 using SubtagScriptMap = HashMap<String, UScriptCode, CaseFoldingHash>; |
| 45 |
| 46 static SubtagScriptMap createSubtagScriptMap(const SubtagScript list[], size_t s
ize) |
| 47 { |
| 48 SubtagScriptMap map; |
| 49 for (size_t i = 0; i < size; ++i) |
| 50 map.set(list[i].subtag, list[i].script); |
| 51 return map; |
| 52 } |
| 53 |
39 UScriptCode scriptNameToCode(const String& scriptName) | 54 UScriptCode scriptNameToCode(const String& scriptName) |
40 { | 55 { |
41 struct ScriptNameCode { | |
42 const char* name; | |
43 UScriptCode code; | |
44 }; | |
45 | |
46 // This generally maps an ISO 15924 script code to its UScriptCode, but cert
ain families of script codes are | 56 // This generally maps an ISO 15924 script code to its UScriptCode, but cert
ain families of script codes are |
47 // treated as a single script for assigning a per-script font in Settings. F
or example, "hira" is mapped to | 57 // treated as a single script for assigning a per-script font in Settings. F
or example, "hira" is mapped to |
48 // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want a
ll Japanese scripts to be rendered | 58 // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want a
ll Japanese scripts to be rendered |
49 // using the same font setting. | 59 // using the same font setting. |
50 static const ScriptNameCode scriptNameCodeList[] = { | 60 static const SubtagScript scriptNameCodeList[] = { |
51 { "zyyy", USCRIPT_COMMON }, | 61 { "zyyy", USCRIPT_COMMON }, |
52 { "qaai", USCRIPT_INHERITED }, | 62 { "qaai", USCRIPT_INHERITED }, |
53 { "arab", USCRIPT_ARABIC }, | 63 { "arab", USCRIPT_ARABIC }, |
54 { "armn", USCRIPT_ARMENIAN }, | 64 { "armn", USCRIPT_ARMENIAN }, |
55 { "beng", USCRIPT_BENGALI }, | 65 { "beng", USCRIPT_BENGALI }, |
56 { "bopo", USCRIPT_BOPOMOFO }, | 66 { "bopo", USCRIPT_BOPOMOFO }, |
57 { "cher", USCRIPT_CHEROKEE }, | 67 { "cher", USCRIPT_CHEROKEE }, |
58 { "copt", USCRIPT_COPTIC }, | 68 { "copt", USCRIPT_COPTIC }, |
59 { "cyrl", USCRIPT_CYRILLIC }, | 69 { "cyrl", USCRIPT_CYRILLIC }, |
60 { "dsrt", USCRIPT_DESERET }, | 70 { "dsrt", USCRIPT_DESERET }, |
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
148 { "syrn", USCRIPT_EASTERN_SYRIAC }, | 158 { "syrn", USCRIPT_EASTERN_SYRIAC }, |
149 { "teng", USCRIPT_TENGWAR }, | 159 { "teng", USCRIPT_TENGWAR }, |
150 { "vaii", USCRIPT_VAI }, | 160 { "vaii", USCRIPT_VAI }, |
151 { "visp", USCRIPT_VISIBLE_SPEECH }, | 161 { "visp", USCRIPT_VISIBLE_SPEECH }, |
152 { "xsux", USCRIPT_CUNEIFORM }, | 162 { "xsux", USCRIPT_CUNEIFORM }, |
153 { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA }, | 163 { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA }, |
154 { "kore", USCRIPT_HANGUL }, | 164 { "kore", USCRIPT_HANGUL }, |
155 { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES }, | 165 { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES }, |
156 { "zzzz", USCRIPT_UNKNOWN } | 166 { "zzzz", USCRIPT_UNKNOWN } |
157 }; | 167 }; |
| 168 DEFINE_STATIC_LOCAL(SubtagScriptMap, scriptNameCodeMap, |
| 169 (createSubtagScriptMap(scriptNameCodeList, WTF_ARRAY_LENGTH(scriptNameCo
deList)))); |
158 | 170 |
159 typedef HashMap<String, UScriptCode> ScriptNameCodeMap; | 171 const auto& it = scriptNameCodeMap.find(scriptName); |
160 DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ()); | |
161 if (scriptNameCodeMap.isEmpty()) { | |
162 for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(scriptNameCod
eList[0]); ++i) | |
163 scriptNameCodeMap.set(scriptNameCodeList[i].name, scriptNameCodeList
[i].code); | |
164 } | |
165 | |
166 HashMap<String, UScriptCode>::iterator it = scriptNameCodeMap.find(scriptNam
e.lower()); | |
167 if (it != scriptNameCodeMap.end()) | 172 if (it != scriptNameCodeMap.end()) |
168 return it->value; | 173 return it->value; |
169 return USCRIPT_INVALID_CODE; | 174 return USCRIPT_INVALID_CODE; |
170 } | 175 } |
171 | 176 |
172 UScriptCode localeToScriptCodeForFontSelection(const String& locale) | 177 UScriptCode localeToScriptCodeForFontSelection(const String& locale) |
173 { | 178 { |
174 struct LocaleScript { | 179 static const SubtagScript localeScriptList[] = { |
175 const char* locale; | |
176 UScriptCode script; | |
177 }; | |
178 | |
179 static const LocaleScript localeScriptList[] = { | |
180 { "aa", USCRIPT_LATIN }, | 180 { "aa", USCRIPT_LATIN }, |
181 { "ab", USCRIPT_CYRILLIC }, | 181 { "ab", USCRIPT_CYRILLIC }, |
182 { "ady", USCRIPT_CYRILLIC }, | 182 { "ady", USCRIPT_CYRILLIC }, |
183 { "aeb", USCRIPT_ARABIC }, | 183 { "aeb", USCRIPT_ARABIC }, |
184 { "af", USCRIPT_LATIN }, | 184 { "af", USCRIPT_LATIN }, |
185 { "ak", USCRIPT_LATIN }, | 185 { "ak", USCRIPT_LATIN }, |
186 { "am", USCRIPT_ETHIOPIC }, | 186 { "am", USCRIPT_ETHIOPIC }, |
187 { "ar", USCRIPT_ARABIC }, | 187 { "ar", USCRIPT_ARABIC }, |
188 { "arq", USCRIPT_ARABIC }, | 188 { "arq", USCRIPT_ARABIC }, |
189 { "ary", USCRIPT_ARABIC }, | 189 { "ary", USCRIPT_ARABIC }, |
(...skipping 218 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
408 { "vi", USCRIPT_LATIN }, | 408 { "vi", USCRIPT_LATIN }, |
409 { "wal", USCRIPT_ETHIOPIC }, | 409 { "wal", USCRIPT_ETHIOPIC }, |
410 { "war", USCRIPT_LATIN }, | 410 { "war", USCRIPT_LATIN }, |
411 { "wo", USCRIPT_LATIN }, | 411 { "wo", USCRIPT_LATIN }, |
412 { "xh", USCRIPT_LATIN }, | 412 { "xh", USCRIPT_LATIN }, |
413 { "yap", USCRIPT_LATIN }, | 413 { "yap", USCRIPT_LATIN }, |
414 { "yo", USCRIPT_LATIN }, | 414 { "yo", USCRIPT_LATIN }, |
415 { "za", USCRIPT_LATIN }, | 415 { "za", USCRIPT_LATIN }, |
416 { "zdj", USCRIPT_ARABIC }, | 416 { "zdj", USCRIPT_ARABIC }, |
417 { "zh", USCRIPT_SIMPLIFIED_HAN }, | 417 { "zh", USCRIPT_SIMPLIFIED_HAN }, |
418 { "zh_hk", USCRIPT_TRADITIONAL_HAN }, | 418 { "zu", USCRIPT_LATIN }, |
419 { "zh_tw", USCRIPT_TRADITIONAL_HAN }, | 419 // Encompassed languages within the Chinese macrolanguage. |
420 { "zu", USCRIPT_LATIN } | 420 // http://www-01.sil.org/iso639-3/documentation.asp?id=zho |
| 421 // http://lists.w3.org/Archives/Public/public-i18n-cjk/2016JulSep/0022.h
tml |
| 422 { "cdo", USCRIPT_SIMPLIFIED_HAN }, |
| 423 { "cjy", USCRIPT_SIMPLIFIED_HAN }, |
| 424 { "cmn", USCRIPT_SIMPLIFIED_HAN }, |
| 425 { "cpx", USCRIPT_SIMPLIFIED_HAN }, |
| 426 { "czh", USCRIPT_SIMPLIFIED_HAN }, |
| 427 { "czo", USCRIPT_SIMPLIFIED_HAN }, |
| 428 { "gan", USCRIPT_SIMPLIFIED_HAN }, |
| 429 { "hsn", USCRIPT_SIMPLIFIED_HAN }, |
| 430 { "mnp", USCRIPT_SIMPLIFIED_HAN }, |
| 431 { "wuu", USCRIPT_SIMPLIFIED_HAN }, |
| 432 { "hak", USCRIPT_TRADITIONAL_HAN }, |
| 433 { "lzh", USCRIPT_TRADITIONAL_HAN }, |
| 434 { "nan", USCRIPT_TRADITIONAL_HAN }, |
| 435 { "yue", USCRIPT_TRADITIONAL_HAN }, |
| 436 { "zh-cdo", USCRIPT_SIMPLIFIED_HAN }, |
| 437 { "zh-cjy", USCRIPT_SIMPLIFIED_HAN }, |
| 438 { "zh-cmn", USCRIPT_SIMPLIFIED_HAN }, |
| 439 { "zh-cpx", USCRIPT_SIMPLIFIED_HAN }, |
| 440 { "zh-czh", USCRIPT_SIMPLIFIED_HAN }, |
| 441 { "zh-czo", USCRIPT_SIMPLIFIED_HAN }, |
| 442 { "zh-gan", USCRIPT_SIMPLIFIED_HAN }, |
| 443 { "zh-hsn", USCRIPT_SIMPLIFIED_HAN }, |
| 444 { "zh-mnp", USCRIPT_SIMPLIFIED_HAN }, |
| 445 { "zh-wuu", USCRIPT_SIMPLIFIED_HAN }, |
| 446 { "zh-hak", USCRIPT_TRADITIONAL_HAN }, |
| 447 { "zh-lzh", USCRIPT_TRADITIONAL_HAN }, |
| 448 { "zh-nan", USCRIPT_TRADITIONAL_HAN }, |
| 449 { "zh-yue", USCRIPT_TRADITIONAL_HAN }, |
| 450 // Chinese with regions. Logically, regions should be handled |
| 451 // separately, but this works for the current purposes. |
| 452 { "zh-hk", USCRIPT_TRADITIONAL_HAN }, |
| 453 { "zh-mo", USCRIPT_TRADITIONAL_HAN }, |
| 454 { "zh-tw", USCRIPT_TRADITIONAL_HAN }, |
421 }; | 455 }; |
| 456 DEFINE_STATIC_LOCAL(SubtagScriptMap, localeScriptMap, |
| 457 (createSubtagScriptMap(localeScriptList, WTF_ARRAY_LENGTH(localeScriptLi
st)))); |
422 | 458 |
423 typedef HashMap<String, UScriptCode> LocaleScriptMap; | 459 // BCP 47 uses '-' as the delimiter but ICU uses '_'. |
424 DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ()); | 460 // https://tools.ietf.org/html/bcp47 |
425 if (localeScriptMap.isEmpty()) { | 461 String canonicalLocale = locale; |
426 for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(localeScriptLis
t[0]); ++i) | 462 canonicalLocale.replace('_', '-'); |
427 localeScriptMap.set(localeScriptList[i].locale, localeScriptList[i].
script); | |
428 } | |
429 | |
430 String canonicalLocale = locale.lower().replace('-', '_'); | |
431 while (!canonicalLocale.isEmpty()) { | 463 while (!canonicalLocale.isEmpty()) { |
432 HashMap<String, UScriptCode>::iterator it = localeScriptMap.find(canonic
alLocale); | 464 const auto& it = localeScriptMap.find(canonicalLocale); |
433 if (it != localeScriptMap.end()) | 465 if (it != localeScriptMap.end()) |
434 return it->value; | 466 return it->value; |
435 size_t pos = canonicalLocale.reverseFind('_'); | 467 size_t pos = canonicalLocale.reverseFind('-'); |
436 if (pos == kNotFound) | 468 if (pos == kNotFound) |
437 break; | 469 break; |
438 UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1)); | 470 // script = 4ALPHA |
439 if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) | 471 if (canonicalLocale.length() - (pos + 1) == 4) { |
440 return code; | 472 UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos +
1)); |
| 473 if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) |
| 474 return code; |
| 475 } |
441 canonicalLocale = canonicalLocale.substring(0, pos); | 476 canonicalLocale = canonicalLocale.substring(0, pos); |
442 } | 477 } |
443 return USCRIPT_COMMON; | 478 return USCRIPT_COMMON; |
444 } | 479 } |
445 | 480 |
446 static bool isUnambiguousHanScript(UScriptCode script) | 481 static UScriptCode scriptCodeForHanFromRegion(const String& region) |
447 { | 482 { |
448 // localeToScriptCodeForFontSelection() does not return these values. | 483 static const SubtagScript regionScriptList[] = { |
449 ASSERT(script != USCRIPT_HIRAGANA && script != USCRIPT_KATAKANA); | |
450 return script == USCRIPT_KATAKANA_OR_HIRAGANA | |
451 || script == USCRIPT_SIMPLIFIED_HAN | |
452 || script == USCRIPT_TRADITIONAL_HAN | |
453 || script == USCRIPT_HANGUL; | |
454 } | |
455 | |
456 static UScriptCode scriptCodeForHanFromSubtag(const String& subtag) | |
457 { | |
458 struct SubtagScript { | |
459 const char* subtag; | |
460 UScriptCode script; | |
461 }; | |
462 | |
463 static const SubtagScript subtagScriptList[] = { | |
464 { "cn", USCRIPT_SIMPLIFIED_HAN }, | |
465 { "hans", USCRIPT_SIMPLIFIED_HAN }, | |
466 { "hant", USCRIPT_TRADITIONAL_HAN }, | |
467 { "hk", USCRIPT_TRADITIONAL_HAN }, | 484 { "hk", USCRIPT_TRADITIONAL_HAN }, |
468 { "jp", USCRIPT_KATAKANA_OR_HIRAGANA }, | 485 { "jp", USCRIPT_KATAKANA_OR_HIRAGANA }, |
469 { "kr", USCRIPT_HANGUL }, | 486 { "kr", USCRIPT_HANGUL }, |
| 487 { "mo", USCRIPT_TRADITIONAL_HAN }, |
470 { "tw", USCRIPT_TRADITIONAL_HAN }, | 488 { "tw", USCRIPT_TRADITIONAL_HAN }, |
471 }; | 489 }; |
| 490 DEFINE_STATIC_LOCAL(SubtagScriptMap, regionScriptMap, |
| 491 (createSubtagScriptMap(regionScriptList, WTF_ARRAY_LENGTH(regionScriptLi
st)))); |
472 | 492 |
473 typedef HashMap<String, UScriptCode> SubtagScriptMap; | 493 const auto& it = regionScriptMap.find(region); |
474 DEFINE_STATIC_LOCAL(SubtagScriptMap, subtagScriptMap, ()); | 494 return it != regionScriptMap.end() ? it->value : USCRIPT_COMMON; |
475 if (subtagScriptMap.isEmpty()) { | |
476 for (size_t i = 0; i < WTF_ARRAY_LENGTH(subtagScriptList); ++i) | |
477 subtagScriptMap.set(subtagScriptList[i].subtag, subtagScriptList[i].
script); | |
478 } | |
479 | |
480 const auto& it = subtagScriptMap.find(subtag.lower()); | |
481 return it != subtagScriptMap.end() ? it->value : USCRIPT_COMMON; | |
482 } | 495 } |
483 | 496 |
484 static UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimi
ter) | 497 UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimiter) |
485 { | 498 { |
486 // Some sites emit lang="en-JP" when English is set as the preferred | 499 // Some sites emit lang="en-JP" when English is set as the preferred |
487 // language. Use script/region subtags of the content locale to pick the | 500 // language. Use script/region subtags of the content locale to pick the |
488 // fallback font for unified Han ideographs. | 501 // fallback font for unified Han ideographs. |
489 for (size_t end = locale.find(delimiter); end != kNotFound; ) { | 502 for (size_t end = locale.find(delimiter); end != kNotFound; ) { |
490 size_t begin = end + 1; | 503 size_t begin = end + 1; |
491 end = locale.find(delimiter, begin); | 504 end = locale.find(delimiter, begin); |
492 UScriptCode script = scriptCodeForHanFromSubtag( | 505 size_t len = (end == kNotFound ? locale.length() : end) - begin; |
493 locale.substring(begin, | 506 UScriptCode script; |
494 end == kNotFound ? UINT_MAX : end - begin)); | 507 switch (len) { |
495 if (script != USCRIPT_COMMON) | 508 case 2: // region = 2ALPHA / 3DIGIT |
496 return script; | 509 script = scriptCodeForHanFromRegion(locale.substring(begin, len)); |
| 510 if (script != USCRIPT_COMMON) |
| 511 return script; |
| 512 break; |
| 513 case 4: // script = 4ALPHA |
| 514 script = scriptNameToCode(locale.substring(begin, len)); |
| 515 if (script != USCRIPT_INVALID_CODE) |
| 516 return script; |
| 517 } |
497 } | 518 } |
498 | 519 |
499 return USCRIPT_COMMON; | 520 return USCRIPT_COMMON; |
500 } | 521 } |
501 | 522 |
502 UScriptCode scriptCodeForHanFromLocale(UScriptCode script, const String& locale,
char delimiter) | |
503 { | |
504 if (isUnambiguousHanScript(script)) | |
505 return script; | |
506 | |
507 // Identify the script for Han if the UScriptCode is ambiguous. | |
508 // Check subtags only, because the UScriptCode covers the language part. | |
509 return scriptCodeForHanFromSubtags(locale, delimiter); | |
510 } | |
511 | |
512 UScriptCode scriptCodeForHanFromLocale(const String& locale, char delimiter) | |
513 { | |
514 UScriptCode script = localeToScriptCodeForFontSelection(locale); | |
515 return scriptCodeForHanFromLocale(script, locale, delimiter); | |
516 } | |
517 | |
518 } // namespace blink | 523 } // namespace blink |
OLD | NEW |