OLD | NEW |
| (Empty) |
1 // Copyright 2004-2009 Google Inc. | |
2 // | |
3 // Licensed under the Apache License, Version 2.0 (the "License"); | |
4 // you may not use this file except in compliance with the License. | |
5 // You may obtain a copy of the License at | |
6 // | |
7 // http://www.apache.org/licenses/LICENSE-2.0 | |
8 // | |
9 // Unless required by applicable law or agreed to in writing, software | |
10 // distributed under the License is distributed on an "AS IS" BASIS, | |
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 // See the License for the specific language governing permissions and | |
13 // limitations under the License. | |
14 // ======================================================================== | |
15 // | |
16 // This file is for i18n. It contains two enums, namely Language and | |
17 // Encoding, where Language is the linguistic convention, and Encoding | |
18 // contains information on both language encoding and character set. | |
19 // | |
20 // The language and encoding are both based on Teragram's conventions, | |
21 // except for some common ISO-8859 encodings that are not detected by | |
22 // Teragram but might be in the future. | |
23 // | |
24 // This file also includes functions that do mappings among | |
25 // Language/Encoding enums, language/encoding string names (typically | |
26 // the output from Language Encoding identifier), and language codes | |
27 // (iso 639), and two-letter country codes (iso 3166) | |
28 // | |
29 // NOTE: Both Language and Encoding enums should always start from | |
30 // zero value. This assumption has been made and used. | |
31 | |
32 #ifndef OMAHA_BASE_LANG_ENC_H_ | |
33 #define OMAHA_BASE_LANG_ENC_H_ | |
34 | |
35 #include <windows.h> | |
36 | |
// Conventional short names for some of the encodings. Every alias expands
// to the Encoding enumerator named on its right-hand side.

// ISO-8859 Latin alphabets.
#define LATIN1  ISO_8859_1
#define LATIN2  ISO_8859_2
#define LATIN3  ISO_8859_3
#define LATIN4  ISO_8859_4
#define LATIN5  ISO_8859_9
#define LATIN6  ISO_8859_10

// Script-specific ISO-8859 parts. The _ENCODING suffix on the Arabic, Greek
// and Hebrew aliases avoids reusing the name of the corresponding language.
#define CYRILLIC         ISO_8859_5
#define ARABIC_ENCODING  ISO_8859_6
#define GREEK_ENCODING   ISO_8859_7
#define HEBREW_ENCODING  ISO_8859_8

// Korean.
#define KOREAN_HANGUL  KOREAN_EUC_KR
49 | |
// Linguistic convention of a piece of text.
//
// Every enumerator carries its numeric value explicitly so that the value is
// visible at a glance and cannot silently drift from what is documented.
// The numbering starts at zero and is contiguous.
//
// NOTE: Only add new languages to the end of this list (but before
// NUM_LANGUAGES), give them the next unused value, and bump NUM_LANGUAGES.
enum Language {
  ENGLISH = 0,
  DANISH = 1,
  DUTCH = 2,
  FINNISH = 3,
  FRENCH = 4,
  GERMAN = 5,
  HEBREW = 6,
  ITALIAN = 7,
  JAPANESE = 8,
  KOREAN = 9,
  NORWEGIAN = 10,
  POLISH = 11,
  PORTUGUESE = 12,
  RUSSIAN = 13,
  SPANISH = 14,
  SWEDISH = 15,
  CHINESE = 16,
  CZECH = 17,
  GREEK = 18,
  ICELANDIC = 19,
  LATVIAN = 20,
  LITHUANIAN = 21,
  ROMANIAN = 22,
  HUNGARIAN = 23,
  ESTONIAN = 24,
  TG_UNKNOWN_LANGUAGE = 25,
  UNKNOWN_LANGUAGE = 26,
  BULGARIAN = 27,
  CROATIAN = 28,
  SERBIAN = 29,
  IRISH = 30,
  GALICIAN = 31,
  TAGALOG = 32,
  TURKISH = 33,
  UKRAINIAN = 34,
  HINDI = 35,
  MACEDONIAN = 36,
  BENGALI = 37,
  INDONESIAN = 38,
  LATIN = 39,
  MALAY = 40,
  MALAYALAM = 41,
  WELSH = 42,
  NEPALI = 43,
  TELUGU = 44,
  ALBANIAN = 45,
  TAMIL = 46,
  BELARUSIAN = 47,
  JAVANESE = 48,
  OCCITAN = 49,
  URDU = 50,
  BIHARI = 51,
  GUJARATI = 52,
  THAI = 53,
  ARABIC = 54,
  CATALAN = 55,
  ESPERANTO = 56,
  BASQUE = 57,
  INTERLINGUA = 58,
  KANNADA = 59,
  PUNJABI = 60,
  SCOTS_GAELIC = 61,
  SWAHILI = 62,
  SLOVENIAN = 63,
  MARATHI = 64,
  MALTESE = 65,
  VIETNAMESE = 66,
  FRISIAN = 67,
  SLOVAK = 68,
  // Added to solve the problem of distinguishing Traditional and Simplified
  // Chinese when the encoding is UTF8.
  CHINESE_T = 69,
  FAROESE = 70,
  SUNDANESE = 71,
  UZBEK = 72,
  AMHARIC = 73,
  AZERBAIJANI = 74,
  GEORGIAN = 75,
  TIGRINYA = 76,
  PERSIAN = 77,
  BOSNIAN = 78,
  SINHALESE = 79,
  NORWEGIAN_N = 80,
  PORTUGUESE_P = 81,
  PORTUGUESE_B = 82,
  XHOSA = 83,
  ZULU = 84,
  GUARANI = 85,
  SESOTHO = 86,
  TURKMEN = 87,
  KYRGYZ = 88,
  BRETON = 89,
  TWI = 90,
  YIDDISH = 91,
  ORIYA = 92,
  SERBO_CROATIAN = 93,
  SOMALI = 94,
  UIGHUR = 95,
  KURDISH = 96,
  MONGOLIAN = 97,
  ARMENIAN = 98,
  LAOTHIAN = 99,
  SINDHI = 100,
  RHAETO_ROMANCE = 101,
  CHINESE_JAPANESE_KOREAN = 102,  // Not really a language.
  PSEUDOTRANSLATION = 103,        // Not really a language.

  // Always keep this at the end. It is not a valid Language enum, it is only
  // used to indicate the total number of Languages.
  NUM_LANGUAGES = 104
};
163 | |
164 | |
165 // Language codes for those languages we support, used to map to IDs from | |
166 // the Language enumeration. We could have used the Rfc1766ToLcid from the | |
167 // Win32 system's mlang.dll to map these to LCIDs, but a) we don't want to | |
168 // have to load mlang.dll and b) we are using our own language IDs. | |
169 const TCHAR* const kLangCodeChinesePrc = _T("zh_cn"); | |
170 const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw"); | |
171 const TCHAR* const kLangCodeCjk = _T("cjk"); | |
172 const TCHAR* const kLangCodeDutch = _T("nl"); | |
173 const TCHAR* const kLangCodeEnglish = _T("en"); | |
174 const TCHAR* const kLangCodeFrench = _T("fr"); | |
175 const TCHAR* const kLangCodeGerman = _T("de"); | |
176 const TCHAR* const kLangCodeItalian = _T("it"); | |
177 const TCHAR* const kLangCodeJapanese = _T("ja"); | |
178 const TCHAR* const kLangCodeKorean = _T("ko"); | |
179 const TCHAR* const kLangCodePseudo = _T("x"); | |
180 const TCHAR* const kLangCodeSpanish = _T("es"); | |
181 | |
182 | |
183 // Maps language codes to languages. Terminated by a { NULL, UNKNOWN_LANGUAGE } | |
184 // item. | |
185 struct CodeToLanguage { | |
186 const TCHAR* code; | |
187 Language language; | |
188 }; | |
189 | |
190 SELECTANY CodeToLanguage codes_to_languages[] = { | |
191 { kLangCodeChinesePrc, CHINESE }, | |
192 { kLangCodeChineseTaiwan, CHINESE_T }, | |
193 { kLangCodeCjk, CHINESE_JAPANESE_KOREAN }, | |
194 { kLangCodeDutch, DUTCH }, | |
195 { kLangCodeEnglish, ENGLISH }, | |
196 { kLangCodeFrench, FRENCH }, | |
197 { kLangCodeGerman, GERMAN }, | |
198 { kLangCodeItalian, ITALIAN }, | |
199 { kLangCodeJapanese, JAPANESE }, | |
200 { kLangCodeKorean, KOREAN }, | |
201 { kLangCodePseudo, PSEUDOTRANSLATION }, | |
202 { kLangCodeSpanish, SPANISH }, | |
203 { NULL, UNKNOWN_LANGUAGE } | |
204 }; | |
205 | |
206 | |
207 | |
// Wraps the notion of "unknown language": true when |l| equals either
// TG_UNKNOWN_LANGUAGE or UNKNOWN_LANGUAGE. |l| appears twice in the
// expansion, so an argument with side effects may be evaluated twice.
#define IS_LANGUAGE_UNKNOWN(l) \
  (TG_UNKNOWN_LANGUAGE == (l) || UNKNOWN_LANGUAGE == (l))
211 | |
// Language encoding / character set of a piece of text. Every enumerator
// carries its numeric value explicitly; the numbering starts at zero and is
// contiguous.
//
// NOTE: Only add new encodings to the end of this list (but before
// NUM_ENCODINGS).
// NOTE: If you add an encoding here, you must also modify basistech_encoding()
// and google2/com/google/i18n/Encoding.java
enum Encoding {
  ISO_8859_1 = 0,           // Teragram ASCII
  ISO_8859_2 = 1,           // Teragram Latin2
  ISO_8859_3 = 2,           // in BasisTech but not in Teragram
  ISO_8859_4 = 3,           // Teragram Latin4
  ISO_8859_5 = 4,           // Teragram ISO-8859-5
  ISO_8859_6 = 5,           // Teragram Arabic
  ISO_8859_7 = 6,           // Teragram Greek
  ISO_8859_8 = 7,           // Teragram Hebrew
  ISO_8859_9 = 8,           // in BasisTech but not in Teragram
  ISO_8859_10 = 9,          // in BasisTech but not in Teragram
  JAPANESE_EUC_JP = 10,     // Teragram EUC_JP
  JAPANESE_SHIFT_JIS = 11,  // Teragram SJS
  JAPANESE_JIS = 12,        // Teragram JIS
  CHINESE_BIG5 = 13,        // Teragram BIG5
  CHINESE_GB = 14,          // Teragram GB
  CHINESE_EUC_CN = 15,      // Teragram EUC-CN
  KOREAN_EUC_KR = 16,       // Teragram KSC
  // Teragram Unicode. Named UNICODE_ENCODING rather than UNICODE because
  // UNICODE is predefined by Windows.
  UNICODE_ENCODING = 17,
  CHINESE_EUC_DEC = 18,     // Teragram EUC
  CHINESE_CNS = 19,         // Teragram CNS
  CHINESE_BIG5_CP950 = 20,  // Teragram BIG5_CP950
  JAPANESE_CP932 = 21,      // Teragram CP932
  UTF8 = 22,
  UNKNOWN_ENCODING = 23,
  // ISO_8859_1 with all characters <= 127. Should be present only in the
  // crawler and in the repository, *never* as a result of
  // Document::encoding().
  ASCII_7BIT = 24,
  RUSSIAN_KOI8_R = 25,      // Teragram KOI8R
  RUSSIAN_CP1251 = 26,      // Teragram CP1251

  //----------------------------------------------------------
  // These are _not_ output from teragram. Instead, they are as
  // detected in the headers of usenet articles.
  MSFT_CP1252 = 27,         // CP1252 aka MSFT euro ascii
  RUSSIAN_KOI8_RU = 28,     // CP21866 aka KOI8_RU, used for Ukrainian
  MSFT_CP1250 = 29,         // CP1250 aka MSFT eastern european
  ISO_8859_15 = 30,         // aka ISO_8859_0 aka ISO_8859_1 euroized
  //----------------------------------------------------------

  //----------------------------------------------------------
  // These are in BasisTech but not in Teragram. They are
  // needed for new interface languages. Now detected by
  // research langid
  MSFT_CP1254 = 31,         // used for Turkish
  MSFT_CP1257 = 32,         // used in Baltic countries
  //----------------------------------------------------------

  //----------------------------------------------------------
  // New encodings detected by Teragram
  ISO_8859_11 = 33,         // aka TIS-620, used for Thai
  MSFT_CP874 = 34,          // used for Thai
  MSFT_CP1256 = 35,         // used for Arabic

  //----------------------------------------------------------
  // Detected as ISO_8859_8 by Teragram, but can be found in META tags
  MSFT_CP1255 = 36,         // Logical Hebrew Microsoft
  ISO_8859_8_I = 37,        // Iso Hebrew Logical
  HEBREW_VISUAL = 38,       // Iso Hebrew Visual
  //----------------------------------------------------------

  //----------------------------------------------------------
  // Detected by research langid
  CZECH_CP852 = 39,
  CZECH_CSN_369103 = 40,    // aka ISO_IR_139 aka KOI8_CS
  MSFT_CP1253 = 41,         // used for Greek
  RUSSIAN_CP866 = 42,
  //----------------------------------------------------------
  HZ_ENCODING = 43,
  ISO2022_CN = 44,
  ISO2022_KR = 45,

  // Always keep this at the end. It is not a valid Encoding enum, it is only
  // used to indicate the total number of Encodings.
  NUM_ENCODINGS = 46
};
295 | |
296 const int kNumLanguages = NUM_LANGUAGES; | |
297 const int kNumEncodings = NUM_ENCODINGS; | |
298 | |
299 #endif // OMAHA_BASE_LANG_ENC_H_ | |
OLD | NEW |