| OLD | NEW |
| (Empty) |
| 1 // Copyright 2004-2009 Google Inc. | |
| 2 // | |
| 3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
| 4 // you may not use this file except in compliance with the License. | |
| 5 // You may obtain a copy of the License at | |
| 6 // | |
| 7 // http://www.apache.org/licenses/LICENSE-2.0 | |
| 8 // | |
| 9 // Unless required by applicable law or agreed to in writing, software | |
| 10 // distributed under the License is distributed on an "AS IS" BASIS, | |
| 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 12 // See the License for the specific language governing permissions and | |
| 13 // limitations under the License. | |
| 14 // ======================================================================== | |
| 15 // | |
| 16 // This file is for i18n. It contains two enums, namely Language and | |
| 17 // Encoding, where Language is the linguistic convention, and Encoding | |
| 18 // contains information on both language encoding and character set. | |
| 19 // | |
| 20 // The language and encoding are both based on Teragram's conventions, | |
| 21 // except for some common ISO-8859 encodings that are not detected by | |
| 22 // Teragram but might be in the future. | |
| 23 // | |
| 24 // This file also includes functions that do mappings among | |
| 25 // Language/Encoding enums, language/encoding string names (typically | |
| 26 // the output from Language Encoding identifier), language codes | |
| 27 // (ISO 639), and two-letter country codes (ISO 3166). | |
| 28 // | |
| 29 // NOTE: Both Language and Encoding enums should always start from | |
| 30 // zero value. This assumption has been made and used. | |
| 31 | |
| 32 #ifndef OMAHA_BASE_LANG_ENC_H_ | |
| 33 #define OMAHA_BASE_LANG_ENC_H_ | |
| 34 | |
| 35 #include <windows.h> | |
| 36 | |
| 37 // Popular encoding aliases; each one expands to an Encoding enumerator. | |
| 38 #define LATIN1 ISO_8859_1 | |
| 39 #define LATIN2 ISO_8859_2 | |
| 40 #define LATIN3 ISO_8859_3 | |
| 41 #define LATIN4 ISO_8859_4 | |
| 42 #define CYRILLIC ISO_8859_5 | |
| 43 #define ARABIC_ENCODING ISO_8859_6 // _ENCODING suffix avoids clashing with the Language enumerator ARABIC | |
| 44 #define GREEK_ENCODING ISO_8859_7 // _ENCODING suffix avoids clashing with the Language enumerator GREEK | |
| 45 #define HEBREW_ENCODING ISO_8859_8 // _ENCODING suffix avoids clashing with the Language enumerator HEBREW | |
| 46 #define LATIN5 ISO_8859_9 | |
| 47 #define LATIN6 ISO_8859_10 | |
| 48 #define KOREAN_HANGUL KOREAN_EUC_KR | |
| 49 | |
| 50 // NOTE: Only add new languages to the end of this list (but before | |
| 51 // NUM_LANGUAGES). The comment after each enumerator is its numeric value. | |
| 52 enum Language { | |
| 53 ENGLISH = 0, /* 0 */ | |
| 54 DANISH, /* 1 */ | |
| 55 DUTCH, /* 2 */ | |
| 56 FINNISH, /* 3 */ | |
| 57 FRENCH, /* 4 */ | |
| 58 GERMAN, /* 5 */ | |
| 59 HEBREW, /* 6 */ | |
| 60 ITALIAN, /* 7 */ | |
| 61 JAPANESE, /* 8 */ | |
| 62 KOREAN, /* 9 */ | |
| 63 NORWEGIAN, /* 10 */ | |
| 64 POLISH, /* 11 */ | |
| 65 PORTUGUESE, /* 12 */ | |
| 66 RUSSIAN, /* 13 */ | |
| 67 SPANISH, /* 14 */ | |
| 68 SWEDISH, /* 15 */ | |
| 69 CHINESE, /* 16 */ | |
| 70 CZECH, /* 17 */ | |
| 71 GREEK, /* 18 */ | |
| 72 ICELANDIC, /* 19 */ | |
| 73 LATVIAN, /* 20 */ | |
| 74 LITHUANIAN, /* 21 */ | |
| 75 ROMANIAN, /* 22 */ | |
| 76 HUNGARIAN, /* 23 */ | |
| 77 ESTONIAN, /* 24 */ | |
| 78 TG_UNKNOWN_LANGUAGE, /* 25 */ | |
| 79 UNKNOWN_LANGUAGE, /* 26 */ | |
| 80 BULGARIAN, /* 27 */ | |
| 81 CROATIAN, /* 28 */ | |
| 82 SERBIAN, /* 29 */ | |
| 83 IRISH, /* 30 */ | |
| 84 GALICIAN, /* 31 */ | |
| 85 TAGALOG, /* 32 */ | |
| 86 TURKISH, /* 33 */ | |
| 87 UKRAINIAN, /* 34 */ | |
| 88 HINDI, /* 35 */ | |
| 89 MACEDONIAN, /* 36 */ | |
| 90 BENGALI, /* 37 */ | |
| 91 INDONESIAN, /* 38 */ | |
| 92 LATIN, /* 39 */ | |
| 93 MALAY, /* 40 */ | |
| 94 MALAYALAM, /* 41 */ | |
| 95 WELSH, /* 42 */ | |
| 96 NEPALI, /* 43 */ | |
| 97 TELUGU, /* 44 */ | |
| 98 ALBANIAN, /* 45 */ | |
| 99 TAMIL, /* 46 */ | |
| 100 BELARUSIAN, /* 47 */ | |
| 101 JAVANESE, /* 48 */ | |
| 102 OCCITAN, /* 49 */ | |
| 103 URDU, /* 50 */ | |
| 104 BIHARI, /* 51 */ | |
| 105 GUJARATI, /* 52 */ | |
| 106 THAI, /* 53 */ | |
| 107 ARABIC, /* 54 */ | |
| 108 CATALAN, /* 55 */ | |
| 109 ESPERANTO, /* 56 */ | |
| 110 BASQUE, /* 57 */ | |
| 111 INTERLINGUA, /* 58 */ | |
| 112 KANNADA, /* 59 */ | |
| 113 PUNJABI, /* 60 */ | |
| 114 SCOTS_GAELIC, /* 61 */ | |
| 115 SWAHILI, /* 62 */ | |
| 116 SLOVENIAN, /* 63 */ | |
| 117 MARATHI, /* 64 */ | |
| 118 MALTESE, /* 65 */ | |
| 119 VIETNAMESE, /* 66 */ | |
| 120 FRISIAN, /* 67 */ | |
| 121 SLOVAK, /* 68 */ | |
| 122 CHINESE_T, /* 69 */ // This is added to solve the problem of | |
| 123 // distinguishing Traditional and Simplified | |
| 124 // Chinese when the encoding is UTF8. | |
| 125 FAROESE, /* 70 */ | |
| 126 SUNDANESE, /* 71 */ | |
| 127 UZBEK, /* 72 */ | |
| 128 AMHARIC, /* 73 */ | |
| 129 AZERBAIJANI, /* 74 */ | |
| 130 GEORGIAN, /* 75 */ | |
| 131 TIGRINYA, /* 76 */ | |
| 132 PERSIAN, /* 77 */ | |
| 133 BOSNIAN, /* 78 */ | |
| 134 SINHALESE, /* 79 */ | |
| 135 NORWEGIAN_N, /* 80 */ | |
| 136 PORTUGUESE_P, /* 81 */ | |
| 137 PORTUGUESE_B, /* 82 */ | |
| 138 XHOSA, /* 83 */ | |
| 139 ZULU, /* 84 */ | |
| 140 GUARANI, /* 85 */ | |
| 141 SESOTHO, /* 86 */ | |
| 142 TURKMEN, /* 87 */ | |
| 143 KYRGYZ, /* 88 */ | |
| 144 BRETON, /* 89 */ | |
| 145 TWI, /* 90 */ | |
| 146 YIDDISH, /* 91 */ | |
| 147 ORIYA, /* 92 */ | |
| 148 SERBO_CROATIAN, /* 93 */ | |
| 149 SOMALI, /* 94 */ | |
| 150 UIGHUR, /* 95 */ | |
| 151 KURDISH, /* 96 */ | |
| 152 MONGOLIAN, /* 97 */ | |
| 153 ARMENIAN, /* 98 */ | |
| 154 LAOTHIAN, /* 99 */ | |
| 155 SINDHI, /* 100 */ | |
| 156 RHAETO_ROMANCE, /* 101 */ | |
| 157 CHINESE_JAPANESE_KOREAN, /* 102 */ // Not really a language | |
| 158 PSEUDOTRANSLATION, /* 103 */ // Not really a language | |
| 159 NUM_LANGUAGES, // Always keep this at the end. It is not a | |
| 160 // valid Language enum, it is only used to | |
| 161 // indicate the total number of Languages. | |
| 162 }; | |
| 163 | |
| 164 | |
| 165 // Language codes for those languages we support, used to map to IDs from | |
| 166 // the Language enumeration. We could have used the Rfc1766ToLcid from the | |
| 167 // Win32 system's mlang.dll to map these to LCIDs, but a) we don't want to | |
| 168 // have to load mlang.dll and b) we are using our own language IDs. | |
| 169 const TCHAR* const kLangCodeChinesePrc = _T("zh_cn"); // paired with CHINESE in codes_to_languages | |
| 170 const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw"); // paired with CHINESE_T in codes_to_languages | |
| 171 const TCHAR* const kLangCodeCjk = _T("cjk"); // paired with CHINESE_JAPANESE_KOREAN (not a real language) | |
| 172 const TCHAR* const kLangCodeDutch = _T("nl"); | |
| 173 const TCHAR* const kLangCodeEnglish = _T("en"); | |
| 174 const TCHAR* const kLangCodeFrench = _T("fr"); | |
| 175 const TCHAR* const kLangCodeGerman = _T("de"); | |
| 176 const TCHAR* const kLangCodeItalian = _T("it"); | |
| 177 const TCHAR* const kLangCodeJapanese = _T("ja"); | |
| 178 const TCHAR* const kLangCodeKorean = _T("ko"); | |
| 179 const TCHAR* const kLangCodePseudo = _T("x"); // paired with PSEUDOTRANSLATION (not a real language) | |
| 180 const TCHAR* const kLangCodeSpanish = _T("es"); | |
| 181 | |
| 182 | |
| 183 // One entry of a table that maps language codes to languages. Such a table | |
| 184 // is terminated by a { NULL, UNKNOWN_LANGUAGE } item. | |
| 185 struct CodeToLanguage { | |
| 186 const TCHAR* code; // language code string; NULL in the terminating item | |
| 187 Language language; // Language enumerator that the code maps to | |
| 188 }; | |
| 189 | |
| 190 SELECTANY CodeToLanguage codes_to_languages[] = { // NOTE(review): table is not const-qualified; SELECTANY is defined outside this file | |
| 191 { kLangCodeChinesePrc, CHINESE }, | |
| 192 { kLangCodeChineseTaiwan, CHINESE_T }, | |
| 193 { kLangCodeCjk, CHINESE_JAPANESE_KOREAN }, | |
| 194 { kLangCodeDutch, DUTCH }, | |
| 195 { kLangCodeEnglish, ENGLISH }, | |
| 196 { kLangCodeFrench, FRENCH }, | |
| 197 { kLangCodeGerman, GERMAN }, | |
| 198 { kLangCodeItalian, ITALIAN }, | |
| 199 { kLangCodeJapanese, JAPANESE }, | |
| 200 { kLangCodeKorean, KOREAN }, | |
| 201 { kLangCodePseudo, PSEUDOTRANSLATION }, | |
| 202 { kLangCodeSpanish, SPANISH }, | |
| 203 { NULL, UNKNOWN_LANGUAGE } // terminator: a NULL code marks the end of the table | |
| 204 }; | |
| 205 | |
| 206 | |
| 207 | |
| 208 // Macro to wrap the notion of "unknown language": true when (l) is TG_UNKNOWN_LANGUAGE or UNKNOWN_LANGUAGE. The argument may be evaluated twice, so avoid side effects in it. | |
| 209 #define IS_LANGUAGE_UNKNOWN(l) \ | |
| 210 ((l) == TG_UNKNOWN_LANGUAGE || (l) == UNKNOWN_LANGUAGE) | |
| 211 | |
| 212 // NOTE: Only add new encodings to the end of this list (but before | |
| 213 // NUM_ENCODINGS). | |
| 214 // NOTE: If you add an encoding here, you must also modify basistech_encoding() | |
| 215 // and google2/com/google/i18n/Encoding.java | |
| 216 enum Encoding { | |
| 217 ISO_8859_1 = 0, // 0: Teragram ASCII | |
| 218 ISO_8859_2, // 1: Teragram Latin2 | |
| 219 ISO_8859_3, // 2: in BasisTech but not in Teragram | |
| 220 ISO_8859_4, // 3: Teragram Latin4 | |
| 221 ISO_8859_5, // 4: Teragram ISO-8859-5 | |
| 222 ISO_8859_6, // 5: Teragram Arabic | |
| 223 ISO_8859_7, // 6: Teragram Greek | |
| 224 ISO_8859_8, // 7: Teragram Hebrew | |
| 225 ISO_8859_9, // 8: in BasisTech but not in Teragram | |
| 226 ISO_8859_10, // 9: in BasisTech but not in Teragram | |
| 227 JAPANESE_EUC_JP, // 10: Teragram EUC_JP | |
| 228 JAPANESE_SHIFT_JIS, // 11: Teragram SJS | |
| 229 JAPANESE_JIS, // 12: Teragram JIS | |
| 230 CHINESE_BIG5, // 13: Teragram BIG5 | |
| 231 CHINESE_GB, // 14: Teragram GB | |
| 232 CHINESE_EUC_CN, // 15: Teragram EUC-CN | |
| 233 KOREAN_EUC_KR, // 16: Teragram KSC | |
| 234 UNICODE_ENCODING, // 17: Teragram Unicode, changed to UNICODE_ENCODING | |
| 235 // from UNICODE, which is predefined by Windows | |
| 236 CHINESE_EUC_DEC, // 18: Teragram EUC | |
| 237 CHINESE_CNS, // 19: Teragram CNS | |
| 238 CHINESE_BIG5_CP950, // 20: Teragram BIG5_CP950 | |
| 239 JAPANESE_CP932, // 21: Teragram CP932 | |
| 240 UTF8, // 22 | |
| 241 UNKNOWN_ENCODING, // 23 | |
| 242 ASCII_7BIT, // 24: ISO_8859_1 with all characters <= 127. | |
| 243 // Should be present only in the crawler | |
| 244 // and in the repository, | |
| 245 // *never* as a result of Document::encoding(). | |
| 246 RUSSIAN_KOI8_R, // 25: Teragram KOI8R | |
| 247 RUSSIAN_CP1251, // 26: Teragram CP1251 | |
| 248 | |
| 249 //---------------------------------------------------------- | |
| 250 // These are _not_ output from teragram. Instead, they are as | |
| 251 // detected in the headers of usenet articles. | |
| 252 MSFT_CP1252, // 27: CP1252 aka MSFT euro ascii | |
| 253 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian | |
| 254 MSFT_CP1250, // 29: CP1250 aka MSFT eastern european | |
| 255 ISO_8859_15, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized | |
| 256 //---------------------------------------------------------- | |
| 257 | |
| 258 //---------------------------------------------------------- | |
| 259 // These are in BasisTech but not in Teragram. They are | |
| 260 // needed for new interface languages. Now detected by | |
| 261 // research langid | |
| 262 MSFT_CP1254, // 31: used for Turkish | |
| 263 MSFT_CP1257, // 32: used in Baltic countries | |
| 264 //---------------------------------------------------------- | |
| 265 | |
| 266 //---------------------------------------------------------- | |
| 267 //---------------------------------------------------------- | |
| 268 // New encodings detected by Teragram | |
| 269 ISO_8859_11, // 33: aka TIS-620, used for Thai | |
| 270 MSFT_CP874, // 34: used for Thai | |
| 271 MSFT_CP1256, // 35: used for Arabic | |
| 272 | |
| 273 //---------------------------------------------------------- | |
| 274 // Detected as ISO_8859_8 by Teragram, but can be found in META tags | |
| 275 MSFT_CP1255, // 36: Logical Hebrew Microsoft | |
| 276 ISO_8859_8_I, // 37: Iso Hebrew Logical | |
| 277 HEBREW_VISUAL, // 38: Iso Hebrew Visual | |
| 278 //---------------------------------------------------------- | |
| 279 | |
| 280 //---------------------------------------------------------- | |
| 281 // Detected by research langid | |
| 282 CZECH_CP852, // 39 | |
| 283 CZECH_CSN_369103, // 40: aka ISO_IR_139 aka KOI8_CS | |
| 284 MSFT_CP1253, // 41: used for Greek | |
| 285 RUSSIAN_CP866, // 42 | |
| 286 //---------------------------------------------------------- | |
| 287 HZ_ENCODING, // 43 | |
| 288 ISO2022_CN, // 44 | |
| 289 ISO2022_KR, // 45 | |
| 290 | |
| 291 NUM_ENCODINGS // Always keep this at the end. It is not a | |
| 292 // valid Encoding enum, it is only used to | |
| 293 // indicate the total number of Encodings. | |
| 294 }; | |
| 295 | |
| 296 const int kNumLanguages = NUM_LANGUAGES; // total number of Language enumerators, as an int | |
| 297 const int kNumEncodings = NUM_ENCODINGS; // total number of Encoding enumerators, as an int | |
| 298 | |
| 299 #endif // OMAHA_BASE_LANG_ENC_H_ | |
| OLD | NEW |