base/lang_enc.h - Issue 624713003: Keep only base/extractor.[cc|h].

Side by Side Diff: base/lang_enc.h

Issue 624713003: Keep only base/extractor.[cc|h]. (Closed) Base URL: https://chromium.googlesource.com/external/omaha.git@master

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2004-2009 Google Inc.

2 //

3 // Licensed under the Apache License, Version 2.0 (the "License");

4 // you may not use this file except in compliance with the License.

5 // You may obtain a copy of the License at

6 //

7 // http://www.apache.org/licenses/LICENSE-2.0

8 //

9 // Unless required by applicable law or agreed to in writing, software

10 // distributed under the License is distributed on an "AS IS" BASIS,

11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

12 // See the License for the specific language governing permissions and

13 // limitations under the License.

14 // ========================================================================

15 //

16 // This file is for i18n. It contains two enums, namely Language and

17 // Encoding, where Language is the linguistic convention, and Encoding

18 // contains information on both language encoding and character set.

19 //

20 // The language and encoding are both based on Teragram's conventions,

21 // except for some common ISO-8859 encodings that are not detected by

22 // Teragram but might be in the future.

23 //

24 // This file also includes functions that do mappings among

25 // Language/Encoding enums, language/encoding string names (typically

26 // the output from Language Encoding identifier), language codes

27 // (ISO 639), and two-letter country codes (ISO 3166).

28 //

29 // NOTE: Both Language and Encoding enums should always start from

30 // zero value. This assumption has been made and used.

31

32 #ifndef OMAHA_BASE_LANG_ENC_H_

33 #define OMAHA_BASE_LANG_ENC_H_

34

35 #include <windows.h>

36

37 // some of the popular encoding aliases

38 #define LATIN1 ISO_8859_1

39 #define LATIN2 ISO_8859_2

40 #define LATIN3 ISO_8859_3

41 #define LATIN4 ISO_8859_4

42 #define CYRILLIC ISO_8859_5

43 #define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language

44 #define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language

45 #define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language

46 #define LATIN5 ISO_8859_9

47 #define LATIN6 ISO_8859_10

48 #define KOREAN_HANGUL KOREAN_EUC_KR

49

50 // NOTE: Only add new languages to the end of this list (but before

51 // NUM_LANGUAGES).

52 enum Language {

53 ENGLISH = 0, /* 0 */

54 DANISH, /* 1 */

55 DUTCH, /* 2 */

56 FINNISH, /* 3 */

57 FRENCH, /* 4 */

58 GERMAN, /* 5 */

59 HEBREW, /* 6 */

60 ITALIAN, /* 7 */

61 JAPANESE, /* 8 */

62 KOREAN, /* 9 */

63 NORWEGIAN, /* 10 */

64 POLISH, /* 11 */

65 PORTUGUESE, /* 12 */

66 RUSSIAN, /* 13 */

67 SPANISH, /* 14 */

68 SWEDISH, /* 15 */

69 CHINESE, /* 16 */

70 CZECH, /* 17 */

71 GREEK, /* 18 */

72 ICELANDIC, /* 19 */

73 LATVIAN, /* 20 */

74 LITHUANIAN, /* 21 */

75 ROMANIAN, /* 22 */

76 HUNGARIAN, /* 23 */

77 ESTONIAN, /* 24 */

78 TG_UNKNOWN_LANGUAGE, /* 25 */

79 UNKNOWN_LANGUAGE, /* 26 */

80 BULGARIAN, /* 27 */

81 CROATIAN, /* 28 */

82 SERBIAN, /* 29 */

83 IRISH, /* 30 */

84 GALICIAN, /* 31 */

85 TAGALOG, /* 32 */

86 TURKISH, /* 33 */

87 UKRAINIAN, /* 34 */

88 HINDI, /* 35 */

89 MACEDONIAN, /* 36 */

90 BENGALI, /* 37 */

91 INDONESIAN, /* 38 */

92 LATIN, /* 39 */

93 MALAY, /* 40 */

94 MALAYALAM, /* 41 */

95 WELSH, /* 42 */

96 NEPALI, /* 43 */

97 TELUGU, /* 44 */

98 ALBANIAN, /* 45 */

99 TAMIL, /* 46 */

100 BELARUSIAN, /* 47 */

101 JAVANESE, /* 48 */

102 OCCITAN, /* 49 */

103 URDU, /* 50 */

104 BIHARI, /* 51 */

105 GUJARATI, /* 52 */

106 THAI, /* 53 */

107 ARABIC, /* 54 */

108 CATALAN, /* 55 */

109 ESPERANTO, /* 56 */

110 BASQUE, /* 57 */

111 INTERLINGUA, /* 58 */

112 KANNADA, /* 59 */

113 PUNJABI, /* 60 */

114 SCOTS_GAELIC, /* 61 */

115 SWAHILI, /* 62 */

116 SLOVENIAN, /* 63 */

117 MARATHI, /* 64 */

118 MALTESE, /* 65 */

119 VIETNAMESE, /* 66 */

120 FRISIAN, /* 67 */

121 SLOVAK, /* 68 */

122 CHINESE_T, /* 69 */ // This is added to solve the problem of

123 // distinguishing Traditional and Simplified

124 // Chinese when the encoding is UTF8.

125 FAROESE, /* 70 */

126 SUNDANESE, /* 71 */

127 UZBEK, /* 72 */

128 AMHARIC, /* 73 */

129 AZERBAIJANI, /* 74 */

130 GEORGIAN, /* 75 */

131 TIGRINYA, /* 76 */

132 PERSIAN, /* 77 */

133 BOSNIAN, /* 78 */

134 SINHALESE, /* 79 */

135 NORWEGIAN_N, /* 80 */

136 PORTUGUESE_P, /* 81 */

137 PORTUGUESE_B, /* 82 */

138 XHOSA, /* 83 */

139 ZULU, /* 84 */

140 GUARANI, /* 85 */

141 SESOTHO, /* 86 */

142 TURKMEN, /* 87 */

143 KYRGYZ, /* 88 */

144 BRETON, /* 89 */

145 TWI, /* 90 */

146 YIDDISH, /* 91 */

147 ORIYA, /* 92 */

148 SERBO_CROATIAN, /* 93 */

149 SOMALI, /* 94 */

150 UIGHUR, /* 95 */

151 KURDISH, /* 96 */

152 MONGOLIAN, /* 97 */

153 ARMENIAN, /* 98 */

154 LAOTHIAN, /* 99 */

155 SINDHI, /* 100! */

156 RHAETO_ROMANCE, /* 101 */

157 CHINESE_JAPANESE_KOREAN, /* 102 */ // Not really a language

158 PSEUDOTRANSLATION, /* 103 */ // Not really a language

159 NUM_LANGUAGES, // Always keep this at the end. It is not a

160 // valid Language enum, it is only used to

161 // indicate the total number of Languages.

162 };

163

164

165 // Language codes for those languages we support, used to map to IDs from

166 // the Language enumeration. We could have used the Rfc1766ToLcid from the

167 // Win32 system's mlang.dll to map these to LCIDs, but a) we don't want to

168 // have to load mlang.dll and b) we are using our own language IDs.

169 const TCHAR* const kLangCodeChinesePrc = _T("zh_cn");

170 const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw");

171 const TCHAR* const kLangCodeCjk = _T("cjk");

172 const TCHAR* const kLangCodeDutch = _T("nl");

173 const TCHAR* const kLangCodeEnglish = _T("en");

174 const TCHAR* const kLangCodeFrench = _T("fr");

175 const TCHAR* const kLangCodeGerman = _T("de");

176 const TCHAR* const kLangCodeItalian = _T("it");

177 const TCHAR* const kLangCodeJapanese = _T("ja");

178 const TCHAR* const kLangCodeKorean = _T("ko");

179 const TCHAR* const kLangCodePseudo = _T("x");

180 const TCHAR* const kLangCodeSpanish = _T("es");

181

182

183 // Maps language codes to languages. Terminated by a { NULL, UNKNOWN_LANGUAGE }

184 // item.

185 struct CodeToLanguage {

186 const TCHAR* code;

187 Language language;

188 };

189

190 SELECTANY CodeToLanguage codes_to_languages[] = {

191 { kLangCodeChinesePrc, CHINESE },

192 { kLangCodeChineseTaiwan, CHINESE_T },

193 { kLangCodeCjk, CHINESE_JAPANESE_KOREAN },

194 { kLangCodeDutch, DUTCH },

195 { kLangCodeEnglish, ENGLISH },

196 { kLangCodeFrench, FRENCH },

197 { kLangCodeGerman, GERMAN },

198 { kLangCodeItalian, ITALIAN },

199 { kLangCodeJapanese, JAPANESE },

200 { kLangCodeKorean, KOREAN },

201 { kLangCodePseudo, PSEUDOTRANSLATION },

202 { kLangCodeSpanish, SPANISH },

203 { NULL, UNKNOWN_LANGUAGE }

204 };

205

206

207

208 // Macro to wrap the notion of "unknown language".

209 #define IS_LANGUAGE_UNKNOWN(l) \

210 ((l) == TG_UNKNOWN_LANGUAGE || (l) == UNKNOWN_LANGUAGE)

211

212 // NOTE: Only add new encodings to the end of this list (but before

213 // NUM_ENCODINGS).

214 // NOTE: If you add an encoding here, you must also modify basistech_encoding()

215 // and google2/com/google/i18n/Encoding.java

216 enum Encoding {

217 ISO_8859_1 = 0, // 0: Teragram ASCII

218 ISO_8859_2, // 1: Teragram Latin2

219 ISO_8859_3, // 2: in BasisTech but not in Teragram

220 ISO_8859_4, // 3: Teragram Latin4

221 ISO_8859_5, // 4: Teragram ISO-8859-5

222 ISO_8859_6, // 5: Teragram Arabic

223 ISO_8859_7, // 6: Teragram Greek

224 ISO_8859_8, // 7: Teragram Hebrew

225 ISO_8859_9, // 8: in BasisTech but not in Teragram

226 ISO_8859_10, // 9: in BasisTech but not in Teragram

227 JAPANESE_EUC_JP, // 10: Teragram EUC_JP

228 JAPANESE_SHIFT_JIS, // 11: Teragram SJS

229 JAPANESE_JIS, // 12: Teragram JIS

230 CHINESE_BIG5, // 13: Teragram BIG5

231 CHINESE_GB, // 14: Teragram GB

232 CHINESE_EUC_CN, // 15: Teragram EUC-CN

233 KOREAN_EUC_KR, // 16: Teragram KSC

234 UNICODE_ENCODING, // 17: Teragram Unicode, changed to UNICODE_ENCODING

235 // from UNICODE, which is predefined by Windows

236 CHINESE_EUC_DEC, // 18: Teragram EUC

237 CHINESE_CNS, // 19: Teragram CNS

238 CHINESE_BIG5_CP950, // 20: Teragram BIG5_CP950

239 JAPANESE_CP932, // 21: Teragram CP932

240 UTF8, // 22

241 UNKNOWN_ENCODING, // 23

242 ASCII_7BIT, // 24: ISO_8859_1 with all characters <= 127.

243 // Should be present only in the crawler

244 // and in the repository,

245 // never as a result of Document::encoding().

246 RUSSIAN_KOI8_R, // 25: Teragram KOI8R

247 RUSSIAN_CP1251, // 26: Teragram CP1251

248

249 //----------------------------------------------------------

250 // These are _not_ output from teragram. Instead, they are as

251 // detected in the headers of usenet articles.

252 MSFT_CP1252, // 27: CP1252 aka MSFT euro ascii

253 RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian

254 MSFT_CP1250, // 29: CP1250 aka MSFT eastern european

255 ISO_8859_15, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized

256 //----------------------------------------------------------

257

258 //----------------------------------------------------------

259 // These are in BasisTech but not in Teragram. They are

260 // needed for new interface languages. Now detected by

261 // research langid

262 MSFT_CP1254, // 31: used for Turkish

263 MSFT_CP1257, // 32: used in Baltic countries

264 //----------------------------------------------------------

265

266 //----------------------------------------------------------

267 //----------------------------------------------------------

268 // New encodings detected by Teragram

269 ISO_8859_11, // 33: aka TIS-620, used for Thai

270 MSFT_CP874, // 34: used for Thai

271 MSFT_CP1256, // 35: used for Arabic

272

273 //----------------------------------------------------------

274 // Detected as ISO_8859_8 by Teragram, but can be found in META tags

275 MSFT_CP1255, // 36: Logical Hebrew Microsoft

276 ISO_8859_8_I, // 37: Iso Hebrew Logical

277 HEBREW_VISUAL, // 38: Iso Hebrew Visual

278 //----------------------------------------------------------

279

280 //----------------------------------------------------------

281 // Detected by research langid

282 CZECH_CP852, // 39

283 CZECH_CSN_369103, // 40: aka ISO_IR_139 aka KOI8_CS

284 MSFT_CP1253, // 41: used for Greek

285 RUSSIAN_CP866, // 42

286 //----------------------------------------------------------

287 HZ_ENCODING, // 43

288 ISO2022_CN, // 44

289 ISO2022_KR, // 45

290

291 NUM_ENCODINGS // Always keep this at the end. It is not a

292 // valid Encoding enum, it is only used to

293 // indicate the total number of Encodings.

294 };

295

296 const int kNumLanguages = NUM_LANGUAGES;

297 const int kNumEncodings = NUM_ENCODINGS;

298

299 #endif // OMAHA_BASE_LANG_ENC_H_

OLD	NEW

« no previous file with comments | « base/highres_timer_unittest.cc ('k') | base/localization.h » ('j') | no next file with comments »