Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(90)

Side by Side Diff: base/lang_enc.h

Issue 624713003: Keep only base/extractor.[cc|h]. (Closed) Base URL: https://chromium.googlesource.com/external/omaha.git@master
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « base/highres_timer_unittest.cc ('k') | base/localization.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2004-2009 Google Inc.
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 // ========================================================================
15 //
16 // This file is for i18n. It contains two enums, namely Language and
17 // Encoding, where Language is the linguistic convention, and Encoding
18 // contains information on both language encoding and character set.
19 //
20 // The language and encoding are both based on Teragram's conventions,
21 // except for some common ISO-8859 encodings that are not detected by
22 // Teragram but might be in the future.
23 //
24 // This file also includes functions that do mappings among
25 // Language/Encoding enums, language/encoding string names (typically
26 // the output from Language Encoding identifier), and language codes
27 // (ISO 639), and two-letter country codes (ISO 3166).
28 //
29 // NOTE: Both Language and Encoding enums should always start from
30 // zero value. This assumption has been made and used.
31
32 #ifndef OMAHA_BASE_LANG_ENC_H_
33 #define OMAHA_BASE_LANG_ENC_H_
34
35 #include <windows.h>
36
37 // Popular aliases for enumerators of the Encoding enum declared later in this file.
38 #define LATIN1 ISO_8859_1
39 #define LATIN2 ISO_8859_2
40 #define LATIN3 ISO_8859_3
41 #define LATIN4 ISO_8859_4
42 #define CYRILLIC ISO_8859_5
43 #define ARABIC_ENCODING ISO_8859_6 // _ENCODING suffix avoids clashing with Language ARABIC
44 #define GREEK_ENCODING ISO_8859_7 // _ENCODING suffix avoids clashing with Language GREEK
45 #define HEBREW_ENCODING ISO_8859_8 // _ENCODING suffix avoids clashing with Language HEBREW
46 #define LATIN5 ISO_8859_9
47 #define LATIN6 ISO_8859_10
48 #define KOREAN_HANGUL KOREAN_EUC_KR
49
50 // NOTE: Only add new languages to the end of this list (but before
51 // NUM_LANGUAGES). The trailing comments give each enumerator's numeric value.
52 enum Language {
53 ENGLISH = 0, /* 0 */
54 DANISH, /* 1 */
55 DUTCH, /* 2 */
56 FINNISH, /* 3 */
57 FRENCH, /* 4 */
58 GERMAN, /* 5 */
59 HEBREW, /* 6 */
60 ITALIAN, /* 7 */
61 JAPANESE, /* 8 */
62 KOREAN, /* 9 */
63 NORWEGIAN, /* 10 */
64 POLISH, /* 11 */
65 PORTUGUESE, /* 12 */
66 RUSSIAN, /* 13 */
67 SPANISH, /* 14 */
68 SWEDISH, /* 15 */
69 CHINESE, /* 16 */
70 CZECH, /* 17 */
71 GREEK, /* 18 */
72 ICELANDIC, /* 19 */
73 LATVIAN, /* 20 */
74 LITHUANIAN, /* 21 */
75 ROMANIAN, /* 22 */
76 HUNGARIAN, /* 23 */
77 ESTONIAN, /* 24 */
78 TG_UNKNOWN_LANGUAGE, /* 25 */
79 UNKNOWN_LANGUAGE, /* 26 */
80 BULGARIAN, /* 27 */
81 CROATIAN, /* 28 */
82 SERBIAN, /* 29 */
83 IRISH, /* 30 */
84 GALICIAN, /* 31 */
85 TAGALOG, /* 32 */
86 TURKISH, /* 33 */
87 UKRAINIAN, /* 34 */
88 HINDI, /* 35 */
89 MACEDONIAN, /* 36 */
90 BENGALI, /* 37 */
91 INDONESIAN, /* 38 */
92 LATIN, /* 39 */
93 MALAY, /* 40 */
94 MALAYALAM, /* 41 */
95 WELSH, /* 42 */
96 NEPALI, /* 43 */
97 TELUGU, /* 44 */
98 ALBANIAN, /* 45 */
99 TAMIL, /* 46 */
100 BELARUSIAN, /* 47 */
101 JAVANESE, /* 48 */
102 OCCITAN, /* 49 */
103 URDU, /* 50 */
104 BIHARI, /* 51 */
105 GUJARATI, /* 52 */
106 THAI, /* 53 */
107 ARABIC, /* 54 */
108 CATALAN, /* 55 */
109 ESPERANTO, /* 56 */
110 BASQUE, /* 57 */
111 INTERLINGUA, /* 58 */
112 KANNADA, /* 59 */
113 PUNJABI, /* 60 */
114 SCOTS_GAELIC, /* 61 */
115 SWAHILI, /* 62 */
116 SLOVENIAN, /* 63 */
117 MARATHI, /* 64 */
118 MALTESE, /* 65 */
119 VIETNAMESE, /* 66 */
120 FRISIAN, /* 67 */
121 SLOVAK, /* 68 */
122 CHINESE_T, /* 69 */ // This is added to solve the problem of
123 // distinguishing Traditional and Simplified
124 // Chinese when the encoding is UTF8.
125 FAROESE, /* 70 */
126 SUNDANESE, /* 71 */
127 UZBEK, /* 72 */
128 AMHARIC, /* 73 */
129 AZERBAIJANI, /* 74 */
130 GEORGIAN, /* 75 */
131 TIGRINYA, /* 76 */
132 PERSIAN, /* 77 */
133 BOSNIAN, /* 78 */
134 SINHALESE, /* 79 */
135 NORWEGIAN_N, /* 80 */
136 PORTUGUESE_P, /* 81 */
137 PORTUGUESE_B, /* 82 */
138 XHOSA, /* 83 */
139 ZULU, /* 84 */
140 GUARANI, /* 85 */
141 SESOTHO, /* 86 */
142 TURKMEN, /* 87 */
143 KYRGYZ, /* 88 */
144 BRETON, /* 89 */
145 TWI, /* 90 */
146 YIDDISH, /* 91 */
147 ORIYA, /* 92 */
148 SERBO_CROATIAN, /* 93 */
149 SOMALI, /* 94 */
150 UIGHUR, /* 95 */
151 KURDISH, /* 96 */
152 MONGOLIAN, /* 97 */
153 ARMENIAN, /* 98 */
154 LAOTHIAN, /* 99 */
155 SINDHI, /* 100 */
156 RHAETO_ROMANCE, /* 101 */
157 CHINESE_JAPANESE_KOREAN, /* 102 */ // Not really a language
158 PSEUDOTRANSLATION, /* 103 */ // Not really a language
159 NUM_LANGUAGES, // Always keep this at the end. It is not a
160 // valid Language enum, it is only used to
161 // indicate the total number of Languages.
162 };
163
164
165 // Language codes for those languages we support, used to map to IDs from
166 // the Language enumeration. We could have used the Rfc1766ToLcid from the
167 // Win32 system's mlang.dll to map these to LCIDs, but a) we don't want to
168 // have to load mlang.dll and b) we are using our own language IDs.
169 const TCHAR* const kLangCodeChinesePrc = _T("zh_cn"); // paired with CHINESE
170 const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw"); // paired with CHINESE_T
171 const TCHAR* const kLangCodeCjk = _T("cjk"); // paired with CHINESE_JAPANESE_KOREAN
172 const TCHAR* const kLangCodeDutch = _T("nl");
173 const TCHAR* const kLangCodeEnglish = _T("en");
174 const TCHAR* const kLangCodeFrench = _T("fr");
175 const TCHAR* const kLangCodeGerman = _T("de");
176 const TCHAR* const kLangCodeItalian = _T("it");
177 const TCHAR* const kLangCodeJapanese = _T("ja");
178 const TCHAR* const kLangCodeKorean = _T("ko");
179 const TCHAR* const kLangCodePseudo = _T("x"); // paired with PSEUDOTRANSLATION
180 const TCHAR* const kLangCodeSpanish = _T("es");
181
182
183 // One row of a table mapping language codes to languages. Such a table is
184 // terminated by a { NULL, UNKNOWN_LANGUAGE } item.
185 struct CodeToLanguage {
186 const TCHAR* code; // language code string; NULL in the terminating item
187 Language language; // Language value paired with the code
188 };
189
190 SELECTANY CodeToLanguage codes_to_languages[] = { // NOTE(review): SELECTANY is not defined in this header -- confirm where it comes from
191 { kLangCodeChinesePrc, CHINESE },
192 { kLangCodeChineseTaiwan, CHINESE_T },
193 { kLangCodeCjk, CHINESE_JAPANESE_KOREAN },
194 { kLangCodeDutch, DUTCH },
195 { kLangCodeEnglish, ENGLISH },
196 { kLangCodeFrench, FRENCH },
197 { kLangCodeGerman, GERMAN },
198 { kLangCodeItalian, ITALIAN },
199 { kLangCodeJapanese, JAPANESE },
200 { kLangCodeKorean, KOREAN },
201 { kLangCodePseudo, PSEUDOTRANSLATION },
202 { kLangCodeSpanish, SPANISH },
203 { NULL, UNKNOWN_LANGUAGE } // terminating item; must stay last
204 };
205
206
207
208 // True if l is TG_UNKNOWN_LANGUAGE or UNKNOWN_LANGUAGE. Evaluates l twice.
209 #define IS_LANGUAGE_UNKNOWN(l) \
210 ((l) == TG_UNKNOWN_LANGUAGE || (l) == UNKNOWN_LANGUAGE)
211
212 // NOTE: Only add new encodings to the end of this list (but before
213 // NUM_ENCODINGS).
214 // NOTE: If you add an encoding here, you must also modify basistech_encoding()
215 // and google2/com/google/i18n/Encoding.java
216 enum Encoding {
217 ISO_8859_1 = 0, // 0: Teragram ASCII
218 ISO_8859_2, // 1: Teragram Latin2
219 ISO_8859_3, // 2: in BasisTech but not in Teragram
220 ISO_8859_4, // 3: Teragram Latin4
221 ISO_8859_5, // 4: Teragram ISO-8859-5
222 ISO_8859_6, // 5: Teragram Arabic
223 ISO_8859_7, // 6: Teragram Greek
224 ISO_8859_8, // 7: Teragram Hebrew
225 ISO_8859_9, // 8: in BasisTech but not in Teragram
226 ISO_8859_10, // 9: in BasisTech but not in Teragram
227 JAPANESE_EUC_JP, // 10: Teragram EUC_JP
228 JAPANESE_SHIFT_JIS, // 11: Teragram SJS
229 JAPANESE_JIS, // 12: Teragram JIS
230 CHINESE_BIG5, // 13: Teragram BIG5
231 CHINESE_GB, // 14: Teragram GB
232 CHINESE_EUC_CN, // 15: Teragram EUC-CN
233 KOREAN_EUC_KR, // 16: Teragram KSC
234 UNICODE_ENCODING, // 17: Teragram Unicode, changed to UNICODE_ENCODING
235 // from UNICODE, which is predefined by Windows
236 CHINESE_EUC_DEC, // 18: Teragram EUC
237 CHINESE_CNS, // 19: Teragram CNS
238 CHINESE_BIG5_CP950, // 20: Teragram BIG5_CP950
239 JAPANESE_CP932, // 21: Teragram CP932
240 UTF8, // 22
241 UNKNOWN_ENCODING, // 23
242 ASCII_7BIT, // 24: ISO_8859_1 with all characters <= 127.
243 // Should be present only in the crawler
244 // and in the repository,
245 // *never* as a result of Document::encoding().
246 RUSSIAN_KOI8_R, // 25: Teragram KOI8R
247 RUSSIAN_CP1251, // 26: Teragram CP1251
248
249 //----------------------------------------------------------
250 // These are _not_ output from teragram. Instead, they are as
251 // detected in the headers of usenet articles.
252 MSFT_CP1252, // 27: CP1252 aka MSFT euro ascii
253 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian
254 MSFT_CP1250, // 29: CP1250 aka MSFT eastern european
255 ISO_8859_15, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized
256 //----------------------------------------------------------
257
258 //----------------------------------------------------------
259 // These are in BasisTech but not in Teragram. They are
260 // needed for new interface languages. Now detected by
261 // research langid
262 MSFT_CP1254, // 31: used for Turkish
263 MSFT_CP1257, // 32: used in Baltic countries
264 //----------------------------------------------------------
265
266 //----------------------------------------------------------
267 //----------------------------------------------------------
268 // New encodings detected by Teragram
269 ISO_8859_11, // 33: aka TIS-620, used for Thai
270 MSFT_CP874, // 34: used for Thai
271 MSFT_CP1256, // 35: used for Arabic
272
273 //----------------------------------------------------------
274 // Detected as ISO_8859_8 by Teragram, but can be found in META tags
275 MSFT_CP1255, // 36: Logical Hebrew Microsoft
276 ISO_8859_8_I, // 37: Iso Hebrew Logical
277 HEBREW_VISUAL, // 38: Iso Hebrew Visual
278 //----------------------------------------------------------
279
280 //----------------------------------------------------------
281 // Detected by research langid
282 CZECH_CP852, // 39
283 CZECH_CSN_369103, // 40: aka ISO_IR_139 aka KOI8_CS
284 MSFT_CP1253, // 41: used for Greek
285 RUSSIAN_CP866, // 42
286 //----------------------------------------------------------
287 HZ_ENCODING, // 43
288 ISO2022_CN, // 44
289 ISO2022_KR, // 45
290
291 NUM_ENCODINGS // Always keep this at the end. It is not a
292 // valid Encoding enum, it is only used to
293 // indicate the total number of Encodings.
294 };
295
296 const int kNumLanguages = NUM_LANGUAGES; // count of Language enumerators before NUM_LANGUAGES
297 const int kNumEncodings = NUM_ENCODINGS; // count of Encoding enumerators before NUM_ENCODINGS
298
299 #endif // OMAHA_BASE_LANG_ENC_H_
OLDNEW
« no previous file with comments | « base/highres_timer_unittest.cc ('k') | base/localization.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698