Index: third_party/WebKit/Source/platform/text/LocaleToScriptMapping.cpp |
diff --git a/third_party/WebKit/Source/platform/text/LocaleToScriptMapping.cpp b/third_party/WebKit/Source/platform/text/LocaleToScriptMapping.cpp |
index ae5dc158db1defb434d9929923e815e265ae7e84..d06e737f3075b2f553ac4740a832b1bb0e651e6d 100644 |
--- a/third_party/WebKit/Source/platform/text/LocaleToScriptMapping.cpp |
+++ b/third_party/WebKit/Source/platform/text/LocaleToScriptMapping.cpp |
@@ -36,18 +36,28 @@ |
namespace blink { |
-UScriptCode scriptNameToCode(const String& scriptName) |
+struct SubtagScript { |
+ const char* subtag; |
+ UScriptCode script; |
+}; |
+ |
+using SubtagScriptMap = HashMap<String, UScriptCode, CaseFoldingHash>; |
+ |
+static SubtagScriptMap createSubtagScriptMap(const SubtagScript list[], size_t size) |
{ |
- struct ScriptNameCode { |
- const char* name; |
- UScriptCode code; |
- }; |
+ SubtagScriptMap map; |
+ for (size_t i = 0; i < size; ++i) |
+ map.set(list[i].subtag, list[i].script); |
+ return map; |
+} |
+UScriptCode scriptNameToCode(const String& scriptName) |
+{ |
// This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are |
// treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to |
// USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered |
// using the same font setting. |
- static const ScriptNameCode scriptNameCodeList[] = { |
+ static const SubtagScript scriptNameCodeList[] = { |
{ "zyyy", USCRIPT_COMMON }, |
{ "qaai", USCRIPT_INHERITED }, |
{ "arab", USCRIPT_ARABIC }, |
@@ -155,15 +165,10 @@ UScriptCode scriptNameToCode(const String& scriptName) |
{ "zxxx", USCRIPT_UNWRITTEN_LANGUAGES }, |
{ "zzzz", USCRIPT_UNKNOWN } |
}; |
+ DEFINE_STATIC_LOCAL(SubtagScriptMap, scriptNameCodeMap, |
+ (createSubtagScriptMap(scriptNameCodeList, WTF_ARRAY_LENGTH(scriptNameCodeList)))); |
- typedef HashMap<String, UScriptCode> ScriptNameCodeMap; |
- DEFINE_STATIC_LOCAL(ScriptNameCodeMap, scriptNameCodeMap, ()); |
- if (scriptNameCodeMap.isEmpty()) { |
- for (size_t i = 0; i < sizeof(scriptNameCodeList) / sizeof(scriptNameCodeList[0]); ++i) |
- scriptNameCodeMap.set(scriptNameCodeList[i].name, scriptNameCodeList[i].code); |
- } |
- |
- HashMap<String, UScriptCode>::iterator it = scriptNameCodeMap.find(scriptName.lower()); |
+ const auto& it = scriptNameCodeMap.find(scriptName); |
if (it != scriptNameCodeMap.end()) |
return it->value; |
return USCRIPT_INVALID_CODE; |
@@ -171,12 +176,7 @@ UScriptCode scriptNameToCode(const String& scriptName) |
UScriptCode localeToScriptCodeForFontSelection(const String& locale) |
{ |
- struct LocaleScript { |
- const char* locale; |
- UScriptCode script; |
- }; |
- |
- static const LocaleScript localeScriptList[] = { |
+ static const SubtagScript localeScriptList[] = { |
{ "aa", USCRIPT_LATIN }, |
{ "ab", USCRIPT_CYRILLIC }, |
{ "ady", USCRIPT_CYRILLIC }, |
@@ -415,73 +415,86 @@ UScriptCode localeToScriptCodeForFontSelection(const String& locale) |
{ "za", USCRIPT_LATIN }, |
{ "zdj", USCRIPT_ARABIC }, |
{ "zh", USCRIPT_SIMPLIFIED_HAN }, |
- { "zh_hk", USCRIPT_TRADITIONAL_HAN }, |
- { "zh_tw", USCRIPT_TRADITIONAL_HAN }, |
- { "zu", USCRIPT_LATIN } |
+ { "zu", USCRIPT_LATIN }, |
+ // Encompassed languages within the Chinese macrolanguage. |
+ // http://www-01.sil.org/iso639-3/documentation.asp?id=zho |
+ // http://lists.w3.org/Archives/Public/public-i18n-cjk/2016JulSep/0022.html |
+ { "cdo", USCRIPT_SIMPLIFIED_HAN }, |
+ { "cjy", USCRIPT_SIMPLIFIED_HAN }, |
+ { "cmn", USCRIPT_SIMPLIFIED_HAN }, |
+ { "cpx", USCRIPT_SIMPLIFIED_HAN }, |
+ { "czh", USCRIPT_SIMPLIFIED_HAN }, |
+ { "czo", USCRIPT_SIMPLIFIED_HAN }, |
+ { "gan", USCRIPT_SIMPLIFIED_HAN }, |
+ { "hsn", USCRIPT_SIMPLIFIED_HAN }, |
+ { "mnp", USCRIPT_SIMPLIFIED_HAN }, |
+ { "wuu", USCRIPT_SIMPLIFIED_HAN }, |
+ { "hak", USCRIPT_TRADITIONAL_HAN }, |
+ { "lzh", USCRIPT_TRADITIONAL_HAN }, |
+ { "nan", USCRIPT_TRADITIONAL_HAN }, |
+ { "yue", USCRIPT_TRADITIONAL_HAN }, |
+ { "zh-cdo", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-cjy", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-cmn", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-cpx", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-czh", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-czo", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-gan", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-hsn", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-mnp", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-wuu", USCRIPT_SIMPLIFIED_HAN }, |
+ { "zh-hak", USCRIPT_TRADITIONAL_HAN }, |
+ { "zh-lzh", USCRIPT_TRADITIONAL_HAN }, |
+ { "zh-nan", USCRIPT_TRADITIONAL_HAN }, |
+ { "zh-yue", USCRIPT_TRADITIONAL_HAN }, |
+ // Chinese with regions. Logically, regions should be handled |
+ // separately, but this works for the current purposes. |
+ { "zh-hk", USCRIPT_TRADITIONAL_HAN }, |
+ { "zh-mo", USCRIPT_TRADITIONAL_HAN }, |
+ { "zh-tw", USCRIPT_TRADITIONAL_HAN }, |
}; |
+ DEFINE_STATIC_LOCAL(SubtagScriptMap, localeScriptMap, |
+ (createSubtagScriptMap(localeScriptList, WTF_ARRAY_LENGTH(localeScriptList)))); |
- typedef HashMap<String, UScriptCode> LocaleScriptMap; |
- DEFINE_STATIC_LOCAL(LocaleScriptMap, localeScriptMap, ()); |
- if (localeScriptMap.isEmpty()) { |
- for (size_t i = 0; i < sizeof(localeScriptList) / sizeof(localeScriptList[0]); ++i) |
- localeScriptMap.set(localeScriptList[i].locale, localeScriptList[i].script); |
- } |
- |
- String canonicalLocale = locale.lower().replace('-', '_'); |
+ // BCP 47 uses '-' as the delimiter but ICU uses '_'. |
+ // https://tools.ietf.org/html/bcp47 |
+ String canonicalLocale = locale; |
+ canonicalLocale.replace('_', '-'); |
while (!canonicalLocale.isEmpty()) { |
- HashMap<String, UScriptCode>::iterator it = localeScriptMap.find(canonicalLocale); |
+ const auto& it = localeScriptMap.find(canonicalLocale); |
if (it != localeScriptMap.end()) |
return it->value; |
- size_t pos = canonicalLocale.reverseFind('_'); |
+ size_t pos = canonicalLocale.reverseFind('-'); |
if (pos == kNotFound) |
break; |
- UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1)); |
- if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) |
- return code; |
+ // script = 4ALPHA |
+ if (canonicalLocale.length() - (pos + 1) == 4) { |
+ UScriptCode code = scriptNameToCode(canonicalLocale.substring(pos + 1)); |
+ if (code != USCRIPT_INVALID_CODE && code != USCRIPT_UNKNOWN) |
+ return code; |
+ } |
canonicalLocale = canonicalLocale.substring(0, pos); |
} |
return USCRIPT_COMMON; |
} |
-static bool isUnambiguousHanScript(UScriptCode script) |
-{ |
- // localeToScriptCodeForFontSelection() does not return these values. |
- ASSERT(script != USCRIPT_HIRAGANA && script != USCRIPT_KATAKANA); |
- return script == USCRIPT_KATAKANA_OR_HIRAGANA |
- || script == USCRIPT_SIMPLIFIED_HAN |
- || script == USCRIPT_TRADITIONAL_HAN |
- || script == USCRIPT_HANGUL; |
-} |
- |
-static UScriptCode scriptCodeForHanFromSubtag(const String& subtag) |
+static UScriptCode scriptCodeForHanFromRegion(const String& region) |
{ |
- struct SubtagScript { |
- const char* subtag; |
- UScriptCode script; |
- }; |
- |
- static const SubtagScript subtagScriptList[] = { |
- { "cn", USCRIPT_SIMPLIFIED_HAN }, |
- { "hans", USCRIPT_SIMPLIFIED_HAN }, |
- { "hant", USCRIPT_TRADITIONAL_HAN }, |
+ static const SubtagScript regionScriptList[] = { |
{ "hk", USCRIPT_TRADITIONAL_HAN }, |
{ "jp", USCRIPT_KATAKANA_OR_HIRAGANA }, |
{ "kr", USCRIPT_HANGUL }, |
+ { "mo", USCRIPT_TRADITIONAL_HAN }, |
{ "tw", USCRIPT_TRADITIONAL_HAN }, |
}; |
+ DEFINE_STATIC_LOCAL(SubtagScriptMap, regionScriptMap, |
+ (createSubtagScriptMap(regionScriptList, WTF_ARRAY_LENGTH(regionScriptList)))); |
- typedef HashMap<String, UScriptCode> SubtagScriptMap; |
- DEFINE_STATIC_LOCAL(SubtagScriptMap, subtagScriptMap, ()); |
- if (subtagScriptMap.isEmpty()) { |
- for (size_t i = 0; i < WTF_ARRAY_LENGTH(subtagScriptList); ++i) |
- subtagScriptMap.set(subtagScriptList[i].subtag, subtagScriptList[i].script); |
- } |
- |
- const auto& it = subtagScriptMap.find(subtag.lower()); |
- return it != subtagScriptMap.end() ? it->value : USCRIPT_COMMON; |
+ const auto& it = regionScriptMap.find(region); |
+ return it != regionScriptMap.end() ? it->value : USCRIPT_COMMON; |
} |
-static UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimiter) |
+UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimiter) |
{ |
// Some sites emit lang="en-JP" when English is set as the preferred |
// language. Use script/region subtags of the content locale to pick the |
@@ -489,30 +502,22 @@ static UScriptCode scriptCodeForHanFromSubtags(const String& locale, char delimi |
for (size_t end = locale.find(delimiter); end != kNotFound; ) { |
size_t begin = end + 1; |
end = locale.find(delimiter, begin); |
- UScriptCode script = scriptCodeForHanFromSubtag( |
- locale.substring(begin, |
- end == kNotFound ? UINT_MAX : end - begin)); |
- if (script != USCRIPT_COMMON) |
- return script; |
+ size_t len = (end == kNotFound ? locale.length() : end) - begin; |
+ UScriptCode script; |
+ switch (len) { |
+ case 2: // region = 2ALPHA / 3DIGIT |
+ script = scriptCodeForHanFromRegion(locale.substring(begin, len)); |
+ if (script != USCRIPT_COMMON) |
+ return script; |
+ break; |
+ case 4: // script = 4ALPHA |
+ script = scriptNameToCode(locale.substring(begin, len)); |
+ if (script != USCRIPT_INVALID_CODE) |
+ return script; |
+ } |
} |
return USCRIPT_COMMON; |
} |
-UScriptCode scriptCodeForHanFromLocale(UScriptCode script, const String& locale, char delimiter) |
-{ |
- if (isUnambiguousHanScript(script)) |
- return script; |
- |
- // Identify the script for Han if the UScriptCode is ambiguous. |
- // Check subtags only, because the UScriptCode covers the language part. |
- return scriptCodeForHanFromSubtags(locale, delimiter); |
-} |
- |
-UScriptCode scriptCodeForHanFromLocale(const String& locale, char delimiter) |
-{ |
- UScriptCode script = localeToScriptCodeForFontSelection(locale); |
- return scriptCodeForHanFromLocale(script, locale, delimiter); |
-} |
- |
} // namespace blink |