| Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.cc
|
| ===================================================================
|
| --- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.cc (revision 0)
|
| +++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.cc (revision 0)
|
| @@ -0,0 +1,540 @@
|
| +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include <stdlib.h>
|
| +#include <stdio.h>
|
| +#include <string.h>
|
| +
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/ext_lang_enc.h"
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_macros.h"
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_strtoint.h"
|
| +
|
| +// Names (and C enum spellings) of the languages from EXT_LANGUAGE_BASE up,
|
| +// indexed by (lang - EXT_LANGUAGE_BASE); same order as kExtLanguageCode.
|
| +static const char* const kExtLanguageName[] = {
|
| +"X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
|
| +
|
| +// Pseudo-languages for Unicode scripts that express a single language
|
| +"X_OGHAM", "X_RUNIC", "X_YI", "X_OLD_ITALIC", "X_GOTHIC",
|
| +"X_DESERET", "X_HANUNOO", "X_BUHID", "X_TAGBANWA", "X_TAI_LE",
|
| +"X_LINEAR_B", "X_UGARITIC", "X_SHAVIAN", "X_OSMANYA", "X_CYPRIOT",
|
| +"X_BUGINESE", "X_COPTIC", "X_NEW_TAI_LUE", "X_GLAGOLITIC", "X_TIFINAGH",
|
| +"X_SYLOTI_NAGRI", "X_OLD_PERSIAN", "X_KHAROSHTHI", "X_BALINESE", "X_CUNEIFORM",
|
| +"X_PHOENICIAN", "X_PHAGS_PA", "X_NKO",
|
| +
|
| +// Unicode 5.1 ("X_SUDANESE" is the entry whose code is "xx-Sund")
|
| +"X_SUDANESE", "X_LEPCHA", "X_OL_CHIKI", "X_VAI", "X_SAURASHTRA",
|
| +"X_KAYAH_LI", "X_REJANG", "X_LYCIAN", "X_CARIAN", "X_LYDIAN",
|
| +"X_CHAM",
|
| +};
|
| +
|
| +
|
| +// C enum spellings for programs that emit C code; entry i names Language i.
|
| +static const char* const kExtLangDeclaredName[] = {
|
| + "ENGLISH", /* 0 */
|
| + "DANISH", /* 1 */
|
| + "DUTCH", /* 2 */
|
| + "FINNISH", /* 3 */
|
| + "FRENCH", /* 4 */
|
| + "GERMAN", /* 5 */
|
| + "HEBREW", /* 6 */
|
| + "ITALIAN", /* 7 */
|
| + "JAPANESE", /* 8 */
|
| + "KOREAN", /* 9 */
|
| + "NORWEGIAN", /* 10 */
|
| + "POLISH", /* 11 */
|
| + "PORTUGUESE", /* 12 */
|
| + "RUSSIAN", /* 13 */
|
| + "SPANISH", /* 14 */
|
| + "SWEDISH", /* 15 */
|
| + "CHINESE", /* 16 */
|
| + "CZECH", /* 17 */
|
| + "GREEK", /* 18 */
|
| + "ICELANDIC", /* 19 */
|
| + "LATVIAN", /* 20 */
|
| + "LITHUANIAN", /* 21 */
|
| + "ROMANIAN", /* 22 */
|
| + "HUNGARIAN", /* 23 */
|
| + "ESTONIAN", /* 24 */
|
| + "TG_UNKNOWN_LANGUAGE", /* 25 */
|
| + "UNKNOWN_LANGUAGE", /* 26 */
|
| + "BULGARIAN", /* 27 */
|
| + "CROATIAN", /* 28 */
|
| + "SERBIAN", /* 29 */
|
| + "IRISH", /* 30 */
|
| + "GALICIAN", /* 31 */
|
| + "TAGALOG", /* 32 */
|
| + "TURKISH", /* 33 */
|
| + "UKRAINIAN", /* 34 */
|
| + "HINDI", /* 35 */
|
| + "MACEDONIAN", /* 36 */
|
| + "BENGALI", /* 37 */
|
| + "INDONESIAN", /* 38 */
|
| + "LATIN", /* 39 */
|
| + "MALAY", /* 40 */
|
| + "MALAYALAM", /* 41 */
|
| + "WELSH", /* 42 */
|
| + "NEPALI", /* 43 */
|
| + "TELUGU", /* 44 */
|
| + "ALBANIAN", /* 45 */
|
| + "TAMIL", /* 46 */
|
| + "BELARUSIAN", /* 47 */
|
| + "JAVANESE", /* 48 */
|
| + "OCCITAN", /* 49 */
|
| + "URDU", /* 50 */
|
| + "BIHARI", /* 51 */
|
| + "GUJARATI", /* 52 */
|
| + "THAI", /* 53 */
|
| + "ARABIC", /* 54 */
|
| + "CATALAN", /* 55 */
|
| + "ESPERANTO", /* 56 */
|
| + "BASQUE", /* 57 */
|
| + "INTERLINGUA", /* 58 */
|
| + "KANNADA", /* 59 */
|
| + "PUNJABI", /* 60 */
|
| + "SCOTS_GAELIC", /* 61 */
|
| + "SWAHILI", /* 62 */
|
| + "SLOVENIAN", /* 63 */
|
| + "MARATHI", /* 64 */
|
| + "MALTESE", /* 65 */
|
| + "VIETNAMESE", /* 66 */
|
| + "FRISIAN", /* 67 */
|
| + "SLOVAK", /* 68 */
|
| + "CHINESE_T", /* 69 */
|
| + "FAROESE", /* 70 */
|
| + "SUNDANESE", /* 71 */
|
| + "UZBEK", /* 72 */
|
| + "AMHARIC", /* 73 */
|
| + "AZERBAIJANI", /* 74 */
|
| + "GEORGIAN", /* 75 */
|
| + "TIGRINYA", /* 76 */
|
| + "PERSIAN", /* 77 */
|
| + "BOSNIAN", /* 78 */
|
| + "SINHALESE", /* 79 */
|
| + "NORWEGIAN_N", /* 80 */
|
| + "PORTUGUESE_P", /* 81 */
|
| + "PORTUGUESE_B", /* 82 */
|
| + "XHOSA", /* 83 */
|
| + "ZULU", /* 84 */
|
| + "GUARANI", /* 85 */
|
| + "SESOTHO", /* 86 */
|
| + "TURKMEN", /* 87 */
|
| + "KYRGYZ", /* 88 */
|
| + "BRETON", /* 89 */
|
| + "TWI", /* 90 */
|
| + "YIDDISH", /* 91 */
|
| + "SERBO_CROATIAN", /* 92 */
|
| + "SOMALI", /* 93 */
|
| + "UIGHUR", /* 94 */
|
| + "KURDISH", /* 95 */
|
| + "MONGOLIAN", /* 96 */
|
| + "ARMENIAN", /* 97 */
|
| + "LAOTHIAN", /* 98 */
|
| + "SINDHI", /* 99 */
|
| + "RHAETO_ROMANCE", /* 100 */
|
| + "AFRIKAANS", /* 101 */
|
| + "LUXEMBOURGISH", /* 102 */
|
| + "BURMESE", /* 103 */
|
| + "KHMER", /* 104 */
|
| + "TIBETAN", /* 105 */
|
| + "DHIVEHI", /* 106 */ // sometimes spelled Divehi; lang of Maldives
|
| + "CHEROKEE", /* 107 */
|
| + "SYRIAC", /* 108 */
|
| + "LIMBU", /* 109 */
|
| + "ORIYA", /* 110 */
|
| + "ASSAMESE", /* 111 */
|
| + "CORSICAN", /* 112 */
|
| + "INTERLINGUE", /* 113 */
|
| + "KAZAKH", /* 114 */
|
| + "LINGALA", /* 115 */
|
| + "MOLDAVIAN", /* 116 */
|
| + "PASHTO", /* 117 */
|
| + "QUECHUA", /* 118 */
|
| + "SHONA", /* 119 */
|
| + "TAJIK", /* 120 */
|
| + "TATAR", /* 121 */
|
| + "TONGA", /* 122 */
|
| + "YORUBA", /* 123 */
|
| + "CREOLES_AND_PIDGINS_ENGLISH_BASED", /* 124 */
|
| + "CREOLES_AND_PIDGINS_FRENCH_BASED", /* 125 */
|
| + "CREOLES_AND_PIDGINS_PORTUGUESE_BASED", /* 126 */
|
| + "CREOLES_AND_PIDGINS_OTHER", /* 127 */
|
| + "MAORI", /* 128 */
|
| + "WOLOF", /* 129 */
|
| + "ABKHAZIAN", /* 130 */
|
| + "AFAR", /* 131 */
|
| + "AYMARA", /* 132 */
|
| + "BASHKIR", /* 133 */
|
| + "BISLAMA", /* 134 */
|
| + "DZONGKHA", /* 135 */
|
| + "FIJIAN", /* 136 */
|
| + "GREENLANDIC", /* 137 */
|
| + "HAUSA", /* 138 */
|
| + "HAITIAN_CREOLE", /* 139 */
|
| + "INUPIAK", /* 140 */
|
| + "INUKTITUT", /* 141 */
|
| + "KASHMIRI", /* 142 */
|
| + "KINYARWANDA", /* 143 */
|
| + "MALAGASY", /* 144 */
|
| + "NAURU", /* 145 */
|
| + "OROMO", /* 146 */
|
| + "RUNDI", /* 147 */
|
| + "SAMOAN", /* 148 */
|
| + "SANGO", /* 149 */
|
| + "SANSKRIT", /* 150 */
|
| + "SISWANT", /* 151 */
|
| + "TSONGA", /* 152 */
|
| + "TSWANA", /* 153 */
|
| + "VOLAPUK", /* 154 */
|
| + "ZHUANG", /* 155 */
|
| + "KHASI", /* 156 */
|
| + "SCOTS", /* 157 */
|
| + "GANDA", /* 158 */
|
| + "MANX", /* 159 */
|
| + "MONTENEGRIN", /* 160 */
|
| + // Add new declared names just before here, in Language-value order.
|
| +};
|
| +// The table must have exactly one entry per Language below NUM_LANGUAGES.
|
| +COMPILE_ASSERT(arraysize(kExtLangDeclaredName) == NUM_LANGUAGES,
|
| + kExtLangDeclaredName_has_incorrect_length);
|
| +
|
| +
|
| +// Codes of the extended languages, indexed by (lang - EXT_LANGUAGE_BASE) in
|
| +// the same order as kExtLanguageName. All are made up, except Klingon's
|
| +// "tlh" from ISO-639-2. NOTE: zza is a standard name (and is not used here).
|
| +static const char* const kExtLanguageCode[] = {
|
| + // "X_BORK_BORK_BORK", "X_PIG_LATIN", "X_HACKER", "X_KLINGON", "X_ELMER_FUDD",
|
| + // All Latin script
|
| + "zzb", "zzp", "zzh", "tlh", "zze",
|
| +
|
| + // Pseudo-languages for single-language Unicode scripts: "xx-" + script name
|
| + "xx-Ogam", "xx-Runr", "xx-Yiii", "xx-Ital", "xx-Goth",
|
| + "xx-Dsrt", "xx-Hano", "xx-Buhd", "xx-Tagb", "xx-Tale",
|
| + "xx-Linb", "xx-Ugar", "xx-Shaw", "xx-Osma", "xx-Cprt",
|
| + "xx-Bugi", "xx-Copt", "xx-Talu", "xx-Glag", "xx-Tfng",
|
| + "xx-Sylo", "xx-Xpeo", "xx-Khar", "xx-Bali", "xx-Xsux",
|
| + "xx-Phnx", "xx-Phag", "xx-Nkoo",
|
| +
|
| + // Unicode 5.1
|
| + "xx-Sund", "xx-Lepc", "xx-Olck", "xx-Vaii", "xx-Saur",
|
| + "xx-Kali", "xx-Rjng", "xx-Lyci", "xx-Cari", "xx-Lydi",
|
| + "xx-Cham",
|
| +};
|
| +
|
| +
|
| +// Returns the name of |lang| as printed by the lang/enc identifier, e.g.
|
| +// "Korean". A negative value gives "". TG_UNKNOWN_LANGUAGE, the placeholder
|
| +// "ignore me" language used to subtract out HTML, link farms, DNA strings and
|
| +// a little English porn, gives "Ignore". Values in neither the base nor the
|
| +// extended range give invalid_language_name().
|
| +const char* ExtLanguageName(const Language lang) {
|
| +  // A negative value is the no-text-at-all result from a Tote.
|
| +  if (lang < 0) {return "";}
|
| +  // CompactLanguageDetect extension.
|
| +  if (lang == TG_UNKNOWN_LANGUAGE) {return "Ignore";}
|
| +
|
| +  // lang is known to be non-negative here, so only the upper bound matters.
|
| +  if (lang < NUM_LANGUAGES) {return LanguageName(lang);}
|
| +
|
| +  const bool is_extended =
|
| +      (lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES);
|
| +  if (is_extended) {
|
| +    const int ext_index = lang - EXT_LANGUAGE_BASE;
|
| +    return kExtLanguageName[ext_index];
|
| +  }
|
| +  return invalid_language_name();
|
| +}
|
| +
|
| +
|
| +// Returns the enum spelling of |lang| (e.g. "KOREAN") for tools that emit C
|
| +// declarations; extended languages reuse their kExtLanguageName entry.
|
| +// Values outside both ranges produce "UNKNOWN_LANGUAGE".
|
| +const char* ExtLanguageDeclaredName(const Language lang) {
|
| +  const bool in_base = (lang >= 0) && (lang < NUM_LANGUAGES);
|
| +  const bool in_ext =
|
| +      (lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES);
|
| +  if (in_base) {return kExtLangDeclaredName[lang];}
|
| +  if (in_ext) {return kExtLanguageName[lang - EXT_LANGUAGE_BASE];}
|
| +  // Neither a base nor an extended language.
|
| +  return "UNKNOWN_LANGUAGE";
|
| +}
|
| +
|
| +// Returns the language code for |lang|, e.g. "ko". TG_UNKNOWN_LANGUAGE gives
|
| +// "xxx", base languages LanguageCode(), extended ones kExtLanguageCode, else "??".
|
| +const char* ExtLanguageCode(const Language lang) {
|
| +  // Hack for the ignore/porn pseudo-language.
|
| +  if (lang == TG_UNKNOWN_LANGUAGE) {return "xxx";}
|
| +
|
| +  const char* code = "??";  // Used when lang is in neither range.
|
| +  if ((lang >= 0) && (lang < NUM_LANGUAGES)) {
|
| +    code = LanguageCode(lang);
|
| +  } else if ((lang >= EXT_LANGUAGE_BASE) && (lang < EXT_NUM_LANGUAGES)) {
|
| +    code = kExtLanguageCode[lang - EXT_LANGUAGE_BASE];
|
| +  }
|
| +  return code;
|
| +}
|
| +
|
| +
|
| +// Converts a language given as a decimal number ("26") or as a code such as
|
| +// "ko", "pt-BR" or "en-Latn-GB" into a Language.
|
| +//  - All-digit input (which includes the empty string) is parsed by strto32
|
| +//    and cast to Language without a range check.
|
| +//  - pt-*, en-* and fr-* normalize to PORTUGUESE, ENGLISH and FRENCH (never
|
| +//    PORTUGUESE_B or PORTUGUESE_P); bs-*, hr-* and Latin-script sr/sh give
|
| +//    CROATIAN, Cyrillic-script sr/sh give SERBIAN.
|
| +//  - "xxx", the zz?/tlh codes and the 7-character "xx-Scrp" codes select
|
| +//    TG_UNKNOWN_LANGUAGE and the X_* pseudo-languages.
|
| +//  - zh* ending in TW or HK is CHINESE_T; any other zh* is CHINESE.
|
| +//  - Otherwise LanguageFromCode decides; the result starts as UNKNOWN_LANGUAGE.
|
| +// Prefixes are tested with strncmp, which stops at the terminating NUL, so
|
| +// short input is never read past its end. src must not be NULL.
|
| +// Consider for later: NORWEGIAN/NORWEGIAN_N, SCOTS/SCOTS_GAELIC.
|
| +Language GetLanguageFromNumberOrName(const char* src) {
|
| +  const size_t len = strlen(src);
|
| +  if (strspn(src, "0123456789") == len) {
|
| +    // All digits
|
| +    return static_cast<Language>(strto32(src, NULL, 10));
|
| +  }
|
| +
|
| +  Language retlang = UNKNOWN_LANGUAGE;
|
| +
|
| +  // Merge sets of languages pt-xx en-xx fr-xx (unconditional; cf. FLAGS_mergepairs)
|
| +  if (strncmp(src, "pt-", 3) == 0) {return PORTUGUESE;}
|
| +  if (strncmp(src, "en-", 3) == 0) {return ENGLISH;}
|
| +  if (strncmp(src, "fr-", 3) == 0) {return FRENCH;}
|
| +  // bs/hr/sr/sh. Original note: "Use NormalizeLanguage instead".
|
| +  if (strncmp(src, "bs-", 3) == 0) {return CROATIAN;}
|
| +  if (strncmp(src, "hr-", 3) == 0) {return CROATIAN;}
|
| +  if (strncmp(src, "sr-Latn", 7) == 0) {return CROATIAN;}
|
| +  if (strncmp(src, "sh-Latn", 7) == 0) {return CROATIAN;}
|
| +  if (strncmp(src, "sr-Cyrl", 7) == 0) {return SERBIAN;}
|
| +  if (strncmp(src, "sh-Cyrl", 7) == 0) {return SERBIAN;}
|
| +
|
| +  // Extensions
|
| +  if (len >= 3) {
|
| +    // Standin for ignore/porn "language"
|
| +    if (memcmp(src, "xxx", 3) == 0) {return TG_UNKNOWN_LANGUAGE;}
|
| +
|
| +    if (memcmp(src, "zzb", 3) == 0) {return X_BORK_BORK_BORK;}
|
| +    if (memcmp(src, "zzp", 3) == 0) {return X_PIG_LATIN;}
|
| +    if (memcmp(src, "zzh", 3) == 0) {return X_HACKER;}
|
| +    if (memcmp(src, "tlh", 3) == 0) {return X_KLINGON;}
|
| +    if (memcmp(src, "zze", 3) == 0) {return X_ELMER_FUDD;}
|
| +  }
|
| +
|
| +  // We have a name like en-Latn-GB or pt-BR
|
| +  // First, get rid of some special cases
|
| +  if (len <= 3) {
|
| +    LanguageFromCode(src, &retlang);
|
| +  } else if (len == 7) {
|
| +    // More Extensions; len == 7 keeps the 7-byte memcmp calls inside src.
|
| +    if (memcmp(src, "xx-", 3) == 0) {
|
| +      if (memcmp(src, "xx-Ogam", 7) == 0) {return X_OGHAM;}
|
| +      if (memcmp(src, "xx-Runr", 7) == 0) {return X_RUNIC;}
|
| +      if (memcmp(src, "xx-Yiii", 7) == 0) {return X_YI;}
|
| +      if (memcmp(src, "xx-Ital", 7) == 0) {return X_OLD_ITALIC;}
|
| +      if (memcmp(src, "xx-Goth", 7) == 0) {return X_GOTHIC;}
|
| +      if (memcmp(src, "xx-Dsrt", 7) == 0) {return X_DESERET;}
|
| +      if (memcmp(src, "xx-Hano", 7) == 0) {return X_HANUNOO;}
|
| +      if (memcmp(src, "xx-Buhd", 7) == 0) {return X_BUHID;}
|
| +      if (memcmp(src, "xx-Tagb", 7) == 0) {return X_TAGBANWA;}
|
| +      if (memcmp(src, "xx-Tale", 7) == 0) {return X_TAI_LE;}
|
| +      if (memcmp(src, "xx-Linb", 7) == 0) {return X_LINEAR_B;}
|
| +      if (memcmp(src, "xx-Ugar", 7) == 0) {return X_UGARITIC;}
|
| +      if (memcmp(src, "xx-Shaw", 7) == 0) {return X_SHAVIAN;}
|
| +      if (memcmp(src, "xx-Osma", 7) == 0) {return X_OSMANYA;}
|
| +      if (memcmp(src, "xx-Cprt", 7) == 0) {return X_CYPRIOT;}
|
| +      if (memcmp(src, "xx-Bugi", 7) == 0) {return X_BUGINESE;}
|
| +      if (memcmp(src, "xx-Copt", 7) == 0) {return X_COPTIC;}
|
| +      if (memcmp(src, "xx-Talu", 7) == 0) {return X_NEW_TAI_LUE;}
|
| +      if (memcmp(src, "xx-Glag", 7) == 0) {return X_GLAGOLITIC;}
|
| +      if (memcmp(src, "xx-Tfng", 7) == 0) {return X_TIFINAGH;}
|
| +      if (memcmp(src, "xx-Sylo", 7) == 0) {return X_SYLOTI_NAGRI;}
|
| +      if (memcmp(src, "xx-Xpeo", 7) == 0) {return X_OLD_PERSIAN;}
|
| +      if (memcmp(src, "xx-Khar", 7) == 0) {return X_KHAROSHTHI;}
|
| +      if (memcmp(src, "xx-Bali", 7) == 0) {return X_BALINESE;}
|
| +      if (memcmp(src, "xx-Xsux", 7) == 0) {return X_CUNEIFORM;}
|
| +      if (memcmp(src, "xx-Phnx", 7) == 0) {return X_PHOENICIAN;}
|
| +      if (memcmp(src, "xx-Phag", 7) == 0) {return X_PHAGS_PA;}
|
| +      if (memcmp(src, "xx-Nkoo", 7) == 0) {return X_NKO;}
|
| +
|
| +      // Unicode 5.1
|
| +      if (memcmp(src, "xx-Sund", 7) == 0) {return X_SUDANESE;}
|
| +      if (memcmp(src, "xx-Lepc", 7) == 0) {return X_LEPCHA;}
|
| +      if (memcmp(src, "xx-Olck", 7) == 0) {return X_OL_CHIKI;}
|
| +      if (memcmp(src, "xx-Vaii", 7) == 0) {return X_VAI;}
|
| +      if (memcmp(src, "xx-Saur", 7) == 0) {return X_SAURASHTRA;}
|
| +      if (memcmp(src, "xx-Kali", 7) == 0) {return X_KAYAH_LI;}
|
| +      if (memcmp(src, "xx-Rjng", 7) == 0) {return X_REJANG;}
|
| +      if (memcmp(src, "xx-Lyci", 7) == 0) {return X_LYCIAN;}
|
| +      if (memcmp(src, "xx-Cari", 7) == 0) {return X_CARIAN;}
|
| +      if (memcmp(src, "xx-Lydi", 7) == 0) {return X_LYDIAN;}
|
| +      if (memcmp(src, "xx-Cham", 7) == 0) {return X_CHAM;}
|
| +    }
|
| +  }
|
| +
|
| +  // Some other weird ones
|
| +  // Could be Latn or Limb; all our current training data is Latn
|
| +  if (strcmp(src, "sit-NP") == 0) {return LIMBU;}
|
| +  if (strcmp(src, "un-Latn") == 0) {return UNKNOWN_LANGUAGE;}
|
| +
|
| +  // Multi-country languages
|
| +  if (strncmp(src, "zh", 2) == 0) {
|
| +    if (strcmp(src + len - 2, "TW") == 0) {return CHINESE_T;}
|
| +    if (strcmp(src + len - 2, "HK") == 0) {return CHINESE_T;}
|
| +    return CHINESE;
|
| +  }
|
| +  // Every "pt..." and "fr..." spelling maps to the merged language.
|
| +  if (strncmp(src, "pt", 2) == 0) {return PORTUGUESE;}
|
| +  if (strncmp(src, "fr", 2) == 0) {return FRENCH;}
|
| +
|
| +  // None of the special cases matched: look up the 2- or 3-letter primary
|
| +  // subtag. The len tests keep src[2] and src[3] inside the string.
|
| +  if ((len > 2) && (src[2] == '-')) {
|
| +    char temp[4];
|
| +    memcpy(temp, src, 4);
|
| +    temp[2] = '\0';
|
| +    LanguageFromCode(temp, &retlang);
|
| +  }
|
| +  if ((len > 3) && (src[3] == '-')) {
|
| +    char temp[4];
|
| +    memcpy(temp, src, 4);
|
| +    temp[3] = '\0';
|
| +    LanguageFromCode(temp, &retlang);
|
| +  }
|
| +
|
| +  return retlang;
|
| +}
|
| +
|
| +typedef struct {  // One row of the kNameScriptPair lookup table.
|
| + const char* name;  // Four-letter script name as written in a tag, e.g. "Latn"
|
| + UnicodeLScript lscript;  // Script value returned for that name
|
| +} NameScriptPair;
|
| +
|
| +// Kept in strcmp order of name; GetLScriptFromNumberOrName binary-searches it.
|
| +static const NameScriptPair kNameScriptPair[] = {
|
| + // Rows tagged "Unicode 5.1" are the additional scripts from that version.
|
| + {"Arab", ULScript_Arabic},
|
| + {"Armn", ULScript_Armenian},
|
| + {"Bali", ULScript_Balinese},
|
| + {"Beng", ULScript_Bengali},
|
| + {"Bugi", ULScript_Buginese},
|
| + {"Buhd", ULScript_Buhid},
|
| + {"Cans", ULScript_Canadian_Aboriginal},
|
| + {"Cari", ULScript_Carian}, // Unicode 5.1
|
| + {"Cham", ULScript_Cham}, // Unicode 5.1
|
| + {"Cher", ULScript_Cherokee},
|
| + {"Copt", ULScript_Coptic},
|
| + {"Cprt", ULScript_Cypriot},
|
| + {"Cyrl", ULScript_Cyrillic},
|
| + {"Deva", ULScript_Devanagari},
|
| + {"Dsrt", ULScript_Deseret},
|
| + {"Ethi", ULScript_Ethiopic},
|
| + {"Geor", ULScript_Georgian},
|
| + {"Glag", ULScript_Glagolitic},
|
| + {"Goth", ULScript_Gothic},
|
| + {"Grek", ULScript_Greek},
|
| + {"Gujr", ULScript_Gujarati},
|
| + {"Guru", ULScript_Gurmukhi},
|
| + {"Hani", ULScript_HanCJK},
|
| + {"Hano", ULScript_Hanunoo},
|
| + {"Hebr", ULScript_Hebrew},
|
| + {"Ital", ULScript_Old_Italic},
|
| + {"Kali", ULScript_Kayah_Li}, // Unicode 5.1
|
| + {"Khar", ULScript_Kharoshthi},
|
| + {"Khmr", ULScript_Khmer},
|
| + {"Knda", ULScript_Kannada},
|
| + {"Laoo", ULScript_Lao},
|
| + {"Latn", ULScript_Latin},
|
| + {"Lepc", ULScript_Lepcha}, // Unicode 5.1
|
| + {"Limb", ULScript_Limbu},
|
| + {"Linb", ULScript_Linear_B},
|
| + {"Lyci", ULScript_Lycian}, // Unicode 5.1
|
| + {"Lydi", ULScript_Lydian}, // Unicode 5.1
|
| + {"Mlym", ULScript_Malayalam},
|
| + {"Mong", ULScript_Mongolian},
|
| + {"Mymr", ULScript_Myanmar},
|
| + {"Nkoo", ULScript_Nko},
|
| + {"Ogam", ULScript_Ogham},
|
| + {"Olck", ULScript_Ol_Chiki}, // Unicode 5.1
|
| + {"Orya", ULScript_Oriya},
|
| + {"Osma", ULScript_Osmanya},
|
| + {"Phag", ULScript_Phags_Pa},
|
| + {"Phnx", ULScript_Phoenician},
|
| + {"Rjng", ULScript_Rejang}, // Unicode 5.1
|
| + {"Runr", ULScript_Runic},
|
| + {"Saur", ULScript_Saurashtra}, // Unicode 5.1
|
| + {"Shaw", ULScript_Shavian},
|
| + {"Sinh", ULScript_Sinhala},
|
| + {"Sund", ULScript_Sundanese}, // Unicode 5.1
|
| + {"Sylo", ULScript_Syloti_Nagri},
|
| + {"Syrc", ULScript_Syriac},
|
| + {"Tagb", ULScript_Tagbanwa},
|
| + {"Tale", ULScript_Tai_Le},
|
| + {"Talu", ULScript_New_Tai_Lue},
|
| + {"Taml", ULScript_Tamil},
|
| + {"Telu", ULScript_Telugu},
|
| + {"Tfng", ULScript_Tifinagh},
|
| + {"Tglg", ULScript_Tagalog},
|
| + {"Thaa", ULScript_Thaana},
|
| + {"Thai", ULScript_Thai},
|
| + {"Tibt", ULScript_Tibetan},
|
| + {"Ugar", ULScript_Ugaritic},
|
| + {"Vaii", ULScript_Vai}, // Unicode 5.1 // NOTE: apparently 'Vai '
|
| + {"Xpeo", ULScript_Old_Persian},
|
| + {"Xsux", ULScript_Cuneiform},
|
| + {"Yiii", ULScript_Yi},
|
| + {"Zyyy", ULScript_Common},
|
| + {"Zzzz", ULScript_Inherited},
|
| +};
|
| +
|
| +// Converts a script given as a decimal number ("3") or embedded in a tag such
|
| +// as "en-Latn-GB" to its UnicodeLScript. For tags, the (up to) four characters
|
| +// after the first '-' are looked up in kNameScriptPair. A tag with no '-' or
|
| +// with an unrecognized field (e.g. the region in "en-GB") yields
|
| +// ULScript_Latin.
|
| +UnicodeLScript GetLScriptFromNumberOrName(const char* src) {
|
| +  if (strspn(src, "0123456789") == strlen(src)) {
|
| +    // All digits
|
| +    return static_cast<UnicodeLScript>(strto32(src, NULL, 10));
|
| +  }
|
| +
|
| +  if (strcmp(src, "zh-TW") == 0) {return ULScript_HanCJK;}
|
| +  if (strcmp(src, "zh-CN") == 0) {return ULScript_HanCJK;}
|
| +  if (strcmp(src, "pt-BR") == 0) {return ULScript_Latin;}
|
| +  if (strcmp(src, "pt-PT") == 0) {return ULScript_Latin;}
|
| +  // Could be Latn or Limb; all our current training data is Latn
|
| +  if (strcmp(src, "sit-NP") == 0) {return ULScript_Latin;}
|
| +
|
| +  // Isolate just the script field. strncpy stops at the NUL, so a short field
|
| +  // such as the "GB" of "en-GB" is not read past the end of src.
|
| +  const char* dash = strchr(src, '-');
|
| +  if (dash == NULL) {return ULScript_Latin;}
|
| +  char script[5] = {0};  // Fifth byte stays '\0' as the terminator.
|
| +  strncpy(script, dash + 1, 4);
|
| +
|
| +  // Search bounds come from the table itself, not from ULScript_NUM_SCRIPTS.
|
| +  int lo = 0;
|
| +  int hi = static_cast<int>(arraysize(kNameScriptPair));
|
| +  while (lo < hi) {
|
| +    const int mid = (lo + hi) >> 1;
|
| +    const int cmp = strcmp(script, kNameScriptPair[mid].name);
|
| +    if (cmp == 0) {return kNameScriptPair[mid].lscript;}
|
| +    if (cmp < 0) {hi = mid;} else {lo = mid + 1;}
|
| +  }
|
| +  return ULScript_Latin;
|
| +}
|
| +
|
| +
|
| +// Folds related languages together: BOSNIAN -> CROATIAN, SERBO_CROATIAN ->
|
| +// SERBIAN, PORTUGUESE_P/_B -> PORTUGUESE. Other values pass through unchanged.
|
| +Language NormalizeLanguage(Language lang) {
|
| +  switch (lang) {
|
| +    case BOSNIAN:        return CROATIAN;
|
| +    case SERBO_CROATIAN: return SERBIAN;
|
| +    case PORTUGUESE_P:
|
| +    case PORTUGUESE_B:   return PORTUGUESE;
|
| +    default:             return lang;
|
| +  }
|
| +}
|
|
|
| Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\compact_lang_det\ext_lang_enc.cc
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|