Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(78)

Side by Side Diff: third_party/cld/encodings/internal/encodings.cc

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Author: jrm@google.com (Jim Meehan)
3 // found in the LICENSE file.
4 3
5 #include "encodings/public/encodings.h" 4 #include "encodings/public/encodings.h"
6 5
7 6 #include <string.h> // for strcasecmp
8 // We do not use it, just to please a compiler and minimize ported 7 //#include <hash_map> // for _Hashtable_iterator, etc
9 // code changes. 8 #include <utility> // for pair
9
10 //#include "base/googleinit.h" // for REGISTER_MODULE_INITIALIZER
11 //#include "base/logging.h" // for operator<<, Check_EQImpl, etc
12 //#include "base/macros.h" // for COMPILE_ASSERT, etc
13 //#include "base/mutex.h" // for Mutex, MutexLock
14 //#include "util/hash/case_insensitive_hash.h"
15 //#include "util/hash/hash.h"
16 #include "encodings/compact_lang_det/win/cld_basictypes.h"
17 #include "encodings/compact_lang_det/win/cld_logging.h"
18 #include "encodings/compact_lang_det/win/cld_macros.h"
19
20 struct EncodingInfo {
21 // The standard name for this encoding.
22 //
23 const char* encoding_name_;
24
25 // The "preferred MIME name" of an encoding as specified by the IANA at:
26 // http://www.iana.org/assignments/character-sets
27 //
28 // Note that the preferred MIME name may differ slightly from the
29 // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987
30 //
31 const char* mime_encoding_name_;
32
33 // NOTE: As of January 2007, it is a Google requirement that if an
34 // encoding has an IANA name, then encoding_name_ and
35 // mime_encoding_name_ must be the same string.
36 //
37 // However, there can be exceptions if there are compelling reasons.
38 // For example, Japanese mobile handsets require the name
39 // "Shift_JIS" in charset=... parameter in Content-Type headers to
40 // process emoji (emoticons) in their private encodings. In that
41 // case, mime_encoding_name_ should be "Shift_JIS", despite
42 // encoding_name_ actually is "X-KDDI-Shift_JIS".
43
44 // Some multi-byte encodings use byte values that coincide with the
45 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE
46 // can misinterpret these, as indicated in an external XSS report from
47 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We
48 // also use UTF8 instead of encodings that we don't support in our
49 // output, and we generally try to be conservative in what we send out.
50 // Where the client asks for single- or double-byte encodings that are
51 // not as common, we substitute a more common single- or double-byte
52 // encoding, if there is one, thereby preserving the client's intent
53 // to use less space than UTF-8. This also means that characters
54 // outside the destination set will be converted to HTML NCRs (&#NNN;)
55 // if requested.
56
57 Encoding preferred_web_output_encoding_;
58 };
59
60 static const EncodingInfo kEncodingInfoTable[] = {
61 { "ASCII", "ISO-8859-1", ISO_8859_1},
62 { "Latin2", "ISO-8859-2", ISO_8859_2},
63 { "Latin3", "ISO-8859-3", UTF8},
64 // MSIE 6 does not support ISO-8859-3 (XSS issue)
65 { "Latin4", "ISO-8859-4", ISO_8859_4},
66 { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},
67 { "Arabic", "ISO-8859-6", ISO_8859_6},
68 { "Greek", "ISO-8859-7", ISO_8859_7},
69 { "Hebrew", "ISO-8859-8", MSFT_CP1255},
70 // we do not endorse the visual order
71 { "Latin5", "ISO-8859-9", ISO_8859_9},
72 { "Latin6", "ISO-8859-10", UTF8},
73 // MSIE does not support ISO-8859-10 (XSS issue)
74 { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP},
75 { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},
76 { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
77 // due to potential confusion with HTML syntax chars
78 { "BIG5", "Big5", CHINESE_BIG5},
79 { "GB", "GB2312", CHINESE_GB},
80 { "EUC-CN",
81 "EUC-CN",
82 // Misnamed. Should be EUC-TW.
83 CHINESE_BIG5},
84 // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,
85 // and EUC-TW is rare, so we prefer Big5 for output.
86 { "KSC", "EUC-KR", KOREAN_EUC_KR},
87 { "Unicode",
88 "UTF-16LE",
89 // Internet Explorer doesn't recognize "ISO-10646-UCS-2"
90 UTF8
91 // due to potential confusion with HTML syntax chars
92 },
93 { "EUC",
94 "EUC", // Misnamed. Should be EUC-TW.
95 CHINESE_BIG5
96 // MSIE does not recognize "EUC" (XSS issue),
97 // and EUC-TW is rare, so we prefer Big5 for output.
98 },
99 { "CNS",
100 "CNS", // Misnamed. Should be EUC-TW.
101 CHINESE_BIG5},
102 // MSIE does not recognize "CNS" (XSS issue),
103 // and EUC-TW is rare, so we prefer Big5 for output.
104 { "BIG5-CP950",
105 "BIG5-CP950", // Not an IANA name
106 CHINESE_BIG5
107 // MSIE does not recognize "BIG5-CP950" (XSS issue)
108 },
109 { "CP932", "CP932", // Not an IANA name
110 JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue)
111 { "UTF8", "UTF-8", UTF8},
112 { "Unknown",
113 "x-unknown", // Not an IANA name
114 UTF8}, // UTF-8 is our default output encoding
115 { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},
116 { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},
117 { "CP1251", "windows-1251", RUSSIAN_CP1251},
118 { "CP1252", "windows-1252", MSFT_CP1252},
119 { "KOI8U",
120 "KOI8-U",
121 ISO_8859_5}, // because koi8-u is not as common
122 { "CP1250", "windows-1250", MSFT_CP1250},
123 { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},
124 { "CP1254", "windows-1254", MSFT_CP1254},
125 { "CP1257", "windows-1257", MSFT_CP1257},
126 { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},
127 { "CP874", "windows-874", MSFT_CP874},
128 { "CP1256", "windows-1256", MSFT_CP1256},
129 { "CP1255", "windows-1255", MSFT_CP1255},
130 { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},
131 // Java does not support iso-8859-8-i
132 { "VISUAL", "ISO-8859-8", MSFT_CP1255},
133 // we do not endorse the visual order
134 { "CP852", "cp852", MSFT_CP1250},
135 // because cp852 is not as common
136 { "CSN_369103", "csn_369103", MSFT_CP1250},
137 // MSIE does not recognize "csn_369103" (XSS issue)
138 { "CP1253", "windows-1253", MSFT_CP1253},
139 { "CP866", "IBM866", RUSSIAN_CP1251},
140 // because cp866 is not as common
141 { "ISO-8859-13", "ISO-8859-13", UTF8},
142 // because iso-8859-13 is not widely supported
143 { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},
144 // due to potential confusion with HTML syntax chars
145 { "GBK", "GBK", GBK},
146 { "GB18030", "GB18030", GBK},
147 // because gb18030 is not widely supported
148 { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},
149 // because Big5-HKSCS is not widely supported
150 { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},
151 // due to potential confusion with HTML syntax chars
152 { "TSCII", "tscii", UTF8},
153 // we do not have an output converter for this font encoding
154 { "TAM", "tam", UTF8},
155 // we do not have an output converter for this font encoding
156 { "TAB", "tab", UTF8},
157 // we do not have an output converter for this font encoding
158 { "JAGRAN", "jagran", UTF8},
159 // we do not have an output converter for this font encoding
160 { "MACINTOSH", "MACINTOSH", ISO_8859_1},
161 // because macintosh is relatively uncommon
162 { "UTF7", "UTF-7",
163 UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated
164 { "BHASKAR", "bhaskar",
165 UTF8}, // we do not have an output converter for this font encoding
166 { "HTCHANAKYA", "htchanakya", // not an IANA charset name.
167 UTF8}, // we do not have an output converter for this font encoding
168 { "UTF-16BE", "UTF-16BE",
169 UTF8}, // due to potential confusion with HTML syntax chars
170 { "UTF-16LE", "UTF-16LE",
171 UTF8}, // due to potential confusion with HTML syntax chars
172 { "UTF-32BE", "UTF-32BE",
173 UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
174 { "UTF-32LE", "UTF-32LE",
175 UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web
176 { "X-BINARYENC", "x-binaryenc", // Not an IANA name
177 UTF8}, // because this one is not intended for output (just input)
178 { "HZ-GB-2312", "HZ-GB-2312",
179 CHINESE_GB}, // due to potential confusion with HTML syntax chars
180 { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name
181 UTF8}, // because this one is not intended for output (just input)
182 { "X-TAM-ELANGO", "x-tam-elango",
183 UTF8}, // we do not have an output converter for this font encoding
184 { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",
185 UTF8}, // we do not have an output converter for this font encoding
186 { "X-TAM-SHREE", "x-tam-shree",
187 UTF8}, // we do not have an output converter for this font encoding
188 { "X-TAM-TBOOMIS", "x-tam-tboomis",
189 UTF8}, // we do not have an output converter for this font encoding
190 { "X-TAM-TMNEWS", "x-tam-tmnews",
191 UTF8}, // we do not have an output converter for this font encoding
192 { "X-TAM-WEBTAMIL", "x-tam-webtamil",
193 UTF8}, // we do not have an output converter for this font encoding
194
195 { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
196 // KDDI version of Shift_JIS with Google Emoji PUA mappings.
197 // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses
198 // "Shift_JIS" in HTTP headers and email messages.
199
200 { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
201 // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.
202 // See the comment at KDDI_SHIFT_JIS for other issues.
203
204 { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},
205 // SoftBank version of Shift_JIS with Google Emoji PUA mappings.
206 // See the comment at KDDI_SHIFT_JIS for other issues.
207
208 { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
209 // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.
210 // See the comment at KDDI_SHIFT_JIS for other issues.
211 // The preferred Web encoding is due to potential confusion with
212 // HTML syntax chars.
213
214 { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},
215 // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.
216 // See the comment at KDDI_SHIFT_JIS for other issues.
217 // The preferred Web encoding is due to potential confusion with
218 // HTML syntax chars.
219
220 // Please refer to NOTE: section in the comments in the definition
221 // of "struct I18NInfoByEncoding", before adding new encodings.
222
223 };
224
225
226
227 COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,
228 kEncodingInfoTable_has_incorrect_size);
229
230 Encoding default_encoding() {return LATIN1;}
231
232 // *************************************************************
233 // Encoding predicates
234 // IsValidEncoding()
235 // IsEncEncCompatible
236 // IsEncodingWithSupportedLanguage
237 // IsSupersetOfAscii7Bit
238 // Is8BitEncoding
239 // IsCJKEncoding
240 // IsHebrewEncoding
241 // IsRightToLeftEncoding
242 // IsLogicalRightToLeftEncoding
243 // IsVisualRightToLeftEncoding
244 // IsIso2022Encoding
245 // IsIso2022JpOrVariant
246 // IsShiftJisOrVariant
247 // IsJapaneseCellPhoneCarrierSpecificEncoding
248 // *************************************************************
249
250 bool IsValidEncoding(Encoding enc) {
251 return ((enc >= 0) && (enc < kNumEncodings));
252 }
253
254 bool IsEncEncCompatible(const Encoding from, const Encoding to) {
255 // Tests compatibility between the "from" and "to" encodings; in
256 // the typical case -- when both are valid known encodings -- this
257 // returns true iff converting from first to second is a no-op.
258 if (!IsValidEncoding(from) || !IsValidEncoding(to)) {
259 return false; // we only work with valid encodings...
260 } else if (to == from) {
261 return true; // the trivial common case
262 }
263
264 if (to == UNKNOWN_ENCODING) {
265 return true; // all valid encodings are compatible with the unknown
266 }
267
268 if (from == UNKNOWN_ENCODING) {
269 return false; // no unknown encoding is compatible with one that is
270 }
271
272 if (from == ASCII_7BIT) {
273 return IsSupersetOfAscii7Bit(to);
274 }
275
276 return (from == ISO_8859_1 && to == MSFT_CP1252) ||
277 (from == ISO_8859_8 && to == HEBREW_VISUAL) ||
278 (from == HEBREW_VISUAL && to == ISO_8859_8) ||
279 (from == ISO_8859_9 && to == MSFT_CP1254) ||
280 (from == ISO_8859_11 && to == MSFT_CP874) ||
281 (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) ||
282 (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) ||
283 (from == CHINESE_GB && to == GBK) ||
284 (from == CHINESE_GB && to == GB18030) ||
285 (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) ||
286 (from == CHINESE_EUC_CN && to == CHINESE_CNS) ||
287 (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) ||
288 (from == CHINESE_EUC_DEC && to == CHINESE_CNS) ||
289 (from == CHINESE_CNS && to == CHINESE_EUC_CN) ||
290 (from == CHINESE_CNS && to == CHINESE_EUC_DEC);
291 }
292
293 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given
294 // encoding represent the same characters as they do in ISO_8859_1.
295
296 // TODO: This list could be expanded. Many other encodings are supersets
297 // of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two
298 // encodings that I know for a fact should *not* be in this list.
299 bool IsSupersetOfAscii7Bit(Encoding e) {
300 switch (e) {
301 case ISO_8859_1:
302 case ISO_8859_2:
303 case ISO_8859_3:
304 case ISO_8859_4:
305 case ISO_8859_5:
306 case ISO_8859_6:
307 case ISO_8859_7:
308 case ISO_8859_8:
309 case ISO_8859_9:
310 case ISO_8859_10:
311 case JAPANESE_EUC_JP:
312 case JAPANESE_SHIFT_JIS:
313 case CHINESE_BIG5:
314 case CHINESE_GB:
315 case CHINESE_EUC_CN:
316 case KOREAN_EUC_KR:
317 case CHINESE_EUC_DEC:
318 case CHINESE_CNS:
319 case CHINESE_BIG5_CP950:
320 case JAPANESE_CP932:
321 case UTF8:
322 case UNKNOWN_ENCODING:
323 case ASCII_7BIT:
324 case RUSSIAN_KOI8_R:
325 case RUSSIAN_CP1251:
326 case MSFT_CP1252:
327 case RUSSIAN_KOI8_RU:
328 case MSFT_CP1250:
329 case ISO_8859_15:
330 case MSFT_CP1254:
331 case MSFT_CP1257:
332 case ISO_8859_11:
333 case MSFT_CP874:
334 case MSFT_CP1256:
335 case MSFT_CP1255:
336 case ISO_8859_8_I:
337 case HEBREW_VISUAL:
338 case CZECH_CP852:
339 case MSFT_CP1253:
340 case RUSSIAN_CP866:
341 case ISO_8859_13:
342 case GBK:
343 case GB18030:
344 case BIG5_HKSCS:
345 case MACINTOSH_ROMAN:
346 return true;
347 default:
348 return false;
349 }
350 }
351
352 // To be an 8-bit encoding means that there are fewer than 256 symbols.
353 // Each byte determines a new character; there are no multi-byte sequences.
354
355 // TODO: This list could maybe be expanded. Other encodings may be 8-bit.
356 bool Is8BitEncoding(Encoding e) {
357 switch (e) {
358 case ASCII_7BIT:
359 case ISO_8859_1:
360 case ISO_8859_2:
361 case ISO_8859_3:
362 case ISO_8859_4:
363 case ISO_8859_5:
364 case ISO_8859_6:
365 case ISO_8859_7:
366 case ISO_8859_8:
367 case ISO_8859_8_I:
368 case ISO_8859_9:
369 case ISO_8859_10:
370 case ISO_8859_11:
371 case ISO_8859_13:
372 case ISO_8859_15:
373 case MSFT_CP1252:
374 case MSFT_CP1253:
375 case MSFT_CP1254:
376 case MSFT_CP1255:
377 case MSFT_CP1256:
378 case MSFT_CP1257:
379 case RUSSIAN_KOI8_R:
380 case RUSSIAN_KOI8_RU:
381 case RUSSIAN_CP866:
382 return true;
383 default:
384 return false;
385 }
386 }
387
388 bool IsCJKEncoding(Encoding e) {
389 switch (e) {
390 case JAPANESE_EUC_JP:
391 case JAPANESE_SHIFT_JIS:
392 case JAPANESE_JIS:
393 case CHINESE_BIG5:
394 case CHINESE_GB:
395 case CHINESE_EUC_CN:
396 case KOREAN_EUC_KR:
397 case CHINESE_EUC_DEC:
398 case CHINESE_CNS:
399 case CHINESE_BIG5_CP950:
400 case JAPANESE_CP932:
401 case ISO_2022_KR:
402 case GBK:
403 case GB18030:
404 case BIG5_HKSCS:
405 case ISO_2022_CN:
406 case HZ_GB_2312:
407 return true;
408 default:
409 return false;
410 }
411 }
412
413 bool IsHebrewEncoding(Encoding e) {
414 return (e == ISO_8859_8 ||
415 e == ISO_8859_8_I ||
416 e == MSFT_CP1255 ||
417 e == HEBREW_VISUAL);
418 }
419
420
421
422 bool IsRightToLeftEncoding(Encoding enc) {
423 switch (enc) {
424 case MSFT_CP1255:
425 case MSFT_CP1256:
426 case ARABIC_ENCODING:
427 case HEBREW_ENCODING:
428 case ISO_8859_8_I:
429 case HEBREW_VISUAL:
430 return true;
431 default:
432 return false;
433 }
434 }
435
436 bool IsLogicalRightToLeftEncoding(Encoding enc) {
437 return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc);
438 }
439
440 // Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)
441 // is NOT visual.
442 bool IsVisualRightToLeftEncoding(Encoding enc) {
443 switch (enc) {
444 case HEBREW_ENCODING:
445 case HEBREW_VISUAL:
446 return true;
447 default:
448 return false;
449 }
450 }
451
452
453
454
455
456 bool IsIso2022Encoding(Encoding enc) {
457 return (IsIso2022JpOrVariant(enc) ||
458 enc == ISO_2022_KR ||
459 enc == ISO_2022_CN);
460 }
461
462 bool IsIso2022JpOrVariant(Encoding enc) {
463 return (enc == JAPANESE_JIS ||
464 enc == KDDI_ISO_2022_JP ||
465 enc == SOFTBANK_ISO_2022_JP);
466 }
467
468 bool IsShiftJisOrVariant(Encoding enc) {
469 return (enc == JAPANESE_SHIFT_JIS ||
470 enc == JAPANESE_CP932 ||
471 enc == KDDI_SHIFT_JIS ||
472 enc == DOCOMO_SHIFT_JIS ||
473 enc == SOFTBANK_SHIFT_JIS);
474 }
475
476 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {
477 return (enc == KDDI_ISO_2022_JP ||
478 enc == KDDI_SHIFT_JIS ||
479 enc == DOCOMO_SHIFT_JIS ||
480 enc == SOFTBANK_SHIFT_JIS ||
481 enc == SOFTBANK_ISO_2022_JP);
482 }
483
484
485 // *************************************************************
486 // ENCODING NAMES
487 // EncodingName() [Encoding to name]
488 // MimeEncodingName() [Encoding to name]
489 // EncodingFromName() [name to Encoding]
490 // EncodingNameAliasToEncoding() [name to Encoding]
491 // default_encoding_name()
492 // invalid_encoding_name()
493 // *************************************************************
494
10 const char * EncodingName(const Encoding enc) { 495 const char * EncodingName(const Encoding enc) {
11 return ""; 496 if ( (enc < 0) || (enc >= kNumEncodings) )
12 } 497 return invalid_encoding_name();
498 return kEncodingInfoTable[enc].encoding_name_;
499 }
500
501 // TODO: Unify MimeEncodingName and EncodingName, or determine why
502 // such a unification is not possible.
503
504 const char * MimeEncodingName(Encoding enc) {
505 if ( (enc < 0) || (enc >= kNumEncodings) )
506 return ""; // TODO(jrm) Should this be invalid_encoding_name()?
507 return kEncodingInfoTable[enc].mime_encoding_name_;
508 }
509
510 bool EncodingFromName(const char* enc_name, Encoding *encoding) {
511 *encoding = UNKNOWN_ENCODING;
512 if ( enc_name == NULL ) return false;
513
514 for ( int i = 0; i < kNumEncodings; i++ ) {
515 if ( !strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) {
516 *encoding = static_cast<Encoding>(i);
517 return true;
518 }
519 }
520 return false;
521 }
522
523 #if 0
524 // The encoding_map maps standard and non-standard encoding-names
525 // (strings) to Encoding enums. It is used only by
526 // EncodingNameAliasToEncoding. Note that the map uses
527 // case-insensitive hash and comparison functions.
528
529 typedef hash_map <const char *, Encoding,
530 CStringAlnumCaseHash,
531 CStringAlnumCaseEqual> EncodingMap;
532
533 static EncodingMap encoding_map;
534
535 // Mutex for locking the code that initializes encoding_map.
536 // static Mutex encodings_init_mutex(base::LINKER_INITIALIZED);
537
538 void InitEncodings() {
539 // For thread safety, keep a mutex while initializing this map.
540 // Also allow this function to be called more than once and
541 // gracefully exiting if that occurs.
542 // MutexLock lock(&encodings_init_mutex);
543 if (!encoding_map.empty()) {
544 // Already initialized
545 return;
546 }
547
548 // Initialize the map with all the "standard" encoding names,
549 // i.e., the ones returned by EncodingName and MimeEncodingName.
550 //
551 // First, add internal encoding names returned by EncodingName().
552 for (int i = 0; i < NUM_ENCODINGS; ++i) {
553 Encoding e = static_cast<Encoding>(i);
554 // Internal encoding names must be unique.
555 // The internal names are guaranteed to be unique by the CHECK_EQ.
556 const char *encoding_name = EncodingName(e);
557 CHECK_EQ(0, encoding_map.count(encoding_name))
558 << "Duplicate found for " << encoding_name;
559 encoding_map[encoding_name] = e;
560 }
561 // Then, add mime encoding names returned by MimeEncodingName().
562 // We don't override existing entries, to give precedence to entries
563 // added earlier.
564 for (int i = 0; i < NUM_ENCODINGS; ++i) {
565 Encoding e = static_cast<Encoding>(i);
566 // Note that MimeEncodingName() can return the same mime encoding
567 // name for different encoding enums like JAPANESE_SHIFT_JIS and
568 // KDDI_SHIFT_JIS. In that case, the encoding enum first seen
569 // will be the value for the encoding name in the map.
570 const char *mime_encoding_name = MimeEncodingName(e);
571 if (encoding_map.count(mime_encoding_name) == 0) {
572 encoding_map[mime_encoding_name] = e;
573 }
574 }
575
576 // Add some non-standard names: alternate spellings, common typos,
577 // etc. (It does no harm to add names already in the map.) Note
578 // that although the map is case-insensitive, by convention the
579 // keys are written here in lower case. For ease of maintenance,
580 // they are listed in alphabetical order.
581 encoding_map["5601"] = KOREAN_EUC_KR;
582 encoding_map["646"] = ASCII_7BIT;
583 encoding_map["852"] = CZECH_CP852;
584 encoding_map["866"] = RUSSIAN_CP866;
585 encoding_map["8859-1"] = ISO_8859_1;
586 encoding_map["ansi-1251"] = RUSSIAN_CP1251;
587 encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;
588 encoding_map["arabic"] = ISO_8859_6;
589 encoding_map["ascii"] = ISO_8859_1;
590 encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard
591 encoding_map["asmo-708"] = ISO_8859_6;
592 encoding_map["bhaskar"] = BHASKAR;
593 encoding_map["big5"] = CHINESE_BIG5;
594 encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard
595 encoding_map["big5-hkscs"] = BIG5_HKSCS;
596 encoding_map["chinese"] = CHINESE_GB;
597 encoding_map["cns"] = CHINESE_CNS; // not iana standard
598 encoding_map["cns11643"] = CHINESE_CNS;
599 encoding_map["cp1250"] = MSFT_CP1250; // not iana standard
600 encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard
601 encoding_map["cp1252"] = MSFT_CP1252; // not iana standard
602 encoding_map["cp1253"] = MSFT_CP1253; // not iana standard
603 encoding_map["cp1254"] = MSFT_CP1254; // not iana standard
604 encoding_map["cp1255"] = MSFT_CP1255;
605 encoding_map["cp1256"] = MSFT_CP1256;
606 encoding_map["cp1257"] = MSFT_CP1257; // not iana standard
607 encoding_map["cp819"] = ISO_8859_1;
608 encoding_map["cp852"] = CZECH_CP852;
609 encoding_map["cp866"] = RUSSIAN_CP866;
610 encoding_map["cp-866"] = RUSSIAN_CP866;
611 encoding_map["cp874"] = MSFT_CP874;
612 encoding_map["cp932"] = JAPANESE_CP932; // not iana standard
613 encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard
614 encoding_map["csbig5"] = CHINESE_BIG5;
615 encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;
616 encoding_map["cseuckr"] = KOREAN_EUC_KR;
617 encoding_map["csgb2312"] = CHINESE_GB;
618 encoding_map["csibm852"] = CZECH_CP852;
619 encoding_map["csibm866"] = RUSSIAN_CP866;
620 encoding_map["csiso2022jp"] = JAPANESE_JIS;
621 encoding_map["csiso2022kr"] = ISO_2022_KR;
622 encoding_map["csiso58gb231280"] = CHINESE_GB;
623 encoding_map["csiso88598i"] = ISO_8859_8_I;
624 encoding_map["csisolatin1"] = ISO_8859_1;
625 encoding_map["csisolatin2"] = ISO_8859_2;
626 encoding_map["csisolatin3"] = ISO_8859_3;
627 encoding_map["csisolatin4"] = ISO_8859_4;
628 encoding_map["csisolatin5"] = ISO_8859_9;
629 encoding_map["csisolatin6"] = ISO_8859_10;
630 encoding_map["csisolatinarabic"] = ISO_8859_6;
631 encoding_map["csisolatincyrillic"] = ISO_8859_5;
632 encoding_map["csisolatingreek"] = ISO_8859_7;
633 encoding_map["csisolatinhebrew"] = ISO_8859_8;
634 encoding_map["csksc56011987"] = KOREAN_EUC_KR;
635 encoding_map["csmacintosh"] = MACINTOSH_ROMAN;
636 encoding_map["csn-369103"] = CZECH_CSN_369103;
637 encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;
638 encoding_map["csunicode"] = UTF16BE;
639 encoding_map["csunicode11"] = UTF16BE;
640 encoding_map["csunicode11utf7"] = UTF7;
641 encoding_map["csunicodeascii"] = UTF16BE;
642 encoding_map["csunicodelatin1"] = UTF16BE;
643 encoding_map["cyrillic"] = ISO_8859_5;
644 encoding_map["ecma-114"] = ISO_8859_6;
645 encoding_map["ecma-118"] = ISO_8859_7;
646 encoding_map["elot_928"] = ISO_8859_7;
647 encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard
648 encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard
649 encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard
650 encoding_map["euc-jp"] = JAPANESE_EUC_JP;
651 encoding_map["euc-kr"] = KOREAN_EUC_KR;
652 encoding_map["eucgb2312_cn"] = CHINESE_GB;
653 encoding_map["gb"] = CHINESE_GB; // not iana standard
654 encoding_map["gb18030"] = GB18030;
655 encoding_map["gb2132"] = CHINESE_GB; // common typo
656 encoding_map["gb2312"] = CHINESE_GB;
657 encoding_map["gb_2312-80"] = CHINESE_GB;
658 encoding_map["gbk"] = GBK;
659 encoding_map["greek"] = ISO_8859_7;
660 encoding_map["greek8"] = ISO_8859_7;
661 encoding_map["hebrew"] = ISO_8859_8;
662 encoding_map["htchanakya"] = HTCHANAKYA;
663 encoding_map["hz-gb-2312"] = HZ_GB_2312;
664 encoding_map["ibm819"] = ISO_8859_1;
665 encoding_map["ibm852"] = CZECH_CP852;
666 encoding_map["ibm874"] = MSFT_CP874;
667 encoding_map["iso-10646"] = UTF16BE;
668 encoding_map["iso-10646-j-1"] = UTF16BE;
669 encoding_map["iso-10646-ucs-2"] = UNICODE;
670 encoding_map["iso-10646-ucs-4"] = UTF32BE;
671 encoding_map["iso-10646-ucs-basic"] = UTF16BE;
672 encoding_map["iso-10646-unicode-latin1"] = UTF16BE;
673 encoding_map["iso-2022-cn"] = ISO_2022_CN;
674 encoding_map["iso-2022-jp"] = JAPANESE_JIS;
675 encoding_map["iso-2022-kr"] = ISO_2022_KR;
676 encoding_map["iso-8559-1"] = ISO_8859_1; // common typo
677 encoding_map["iso-874"] = MSFT_CP874;
678 encoding_map["iso-8858-1"] = ISO_8859_1; // common typo
679 // iso-8859-0 was a temporary name, eventually renamed iso-8859-15
680 encoding_map["iso-8859-0"] = ISO_8859_15;
681 encoding_map["iso-8859-1"] = ISO_8859_1;
682 encoding_map["iso-8859-10"] = ISO_8859_10;
683 encoding_map["iso-8859-11"] = ISO_8859_11;
684 encoding_map["iso-8859-13"] = ISO_8859_13;
685 encoding_map["iso-8859-15"] = ISO_8859_15;
686 encoding_map["iso-8859-2"] = ISO_8859_2;
687 encoding_map["iso-8859-3"] = ISO_8859_3;
688 encoding_map["iso-8859-4"] = ISO_8859_4;
689 encoding_map["iso-8859-5"] = ISO_8859_5;
690 encoding_map["iso-8859-6"] = ISO_8859_6;
691 encoding_map["iso-8859-7"] = ISO_8859_7;
692 encoding_map["iso-8859-8"] = ISO_8859_8;
693 encoding_map["iso-8859-8-i"] = ISO_8859_8_I;
694 encoding_map["iso-8859-9"] = ISO_8859_9;
695 encoding_map["iso-9959-1"] = ISO_8859_1; // common typo
696 encoding_map["iso-ir-100"] = ISO_8859_1;
697 encoding_map["iso-ir-101"] = ISO_8859_2;
698 encoding_map["iso-ir-109"] = ISO_8859_3;
699 encoding_map["iso-ir-110"] = ISO_8859_4;
700 encoding_map["iso-ir-126"] = ISO_8859_7;
701 encoding_map["iso-ir-127"] = ISO_8859_6;
702 encoding_map["iso-ir-138"] = ISO_8859_8;
703 encoding_map["iso-ir-144"] = ISO_8859_5;
704 encoding_map["iso-ir-148"] = ISO_8859_9;
705 encoding_map["iso-ir-149"] = KOREAN_EUC_KR;
706 encoding_map["iso-ir-157"] = ISO_8859_10;
707 encoding_map["iso-ir-58"] = CHINESE_GB;
708 encoding_map["iso-latin-1"] = ISO_8859_1;
709 encoding_map["iso_2022-cn"] = ISO_2022_CN;
710 encoding_map["iso_2022-kr"] = ISO_2022_KR;
711 encoding_map["iso_8859-1"] = ISO_8859_1;
712 encoding_map["iso_8859-10:1992"] = ISO_8859_10;
713 encoding_map["iso_8859-11"] = ISO_8859_11;
714 encoding_map["iso_8859-13"] = ISO_8859_13;
715 encoding_map["iso_8859-15"] = ISO_8859_15;
716 encoding_map["iso_8859-1:1987"] = ISO_8859_1;
717 encoding_map["iso_8859-2"] = ISO_8859_2;
718 encoding_map["iso_8859-2:1987"] = ISO_8859_2;
719 encoding_map["iso_8859-3"] = ISO_8859_3;
720 encoding_map["iso_8859-3:1988"] = ISO_8859_3;
721 encoding_map["iso_8859-4"] = ISO_8859_4;
722 encoding_map["iso_8859-4:1988"] = ISO_8859_4;
723 encoding_map["iso_8859-5"] = ISO_8859_5;
724 encoding_map["iso_8859-5:1988"] = ISO_8859_5;
725 encoding_map["iso_8859-6"] = ISO_8859_6;
726 encoding_map["iso_8859-6:1987"] = ISO_8859_6;
727 encoding_map["iso_8859-7"] = ISO_8859_7;
728 encoding_map["iso_8859-7:1987"] = ISO_8859_7;
729 encoding_map["iso_8859-8"] = ISO_8859_8;
730 encoding_map["iso_8859-8:1988:"] = ISO_8859_8;
731 encoding_map["iso_8859-9"] = ISO_8859_9;
732 encoding_map["iso_8859-9:1989"] = ISO_8859_9;
733 encoding_map["jagran"] = JAGRAN;
734 encoding_map["jis"] = JAPANESE_JIS; // not iana standard
735 encoding_map["koi8-cs"] = CZECH_CSN_369103;
736 encoding_map["koi8-r"] = RUSSIAN_KOI8_R;
737 encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard
738 encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;
739 encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard
740 encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard
741 encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant
742 encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard
743 encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard
744 encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;
745 encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard
746 encoding_map["l1"] = ISO_8859_1;
747 encoding_map["l2"] = ISO_8859_2;
748 encoding_map["l3"] = ISO_8859_3;
749 encoding_map["l4"] = ISO_8859_4;
750 encoding_map["l5"] = ISO_8859_9;
751 encoding_map["l6"] = ISO_8859_10;
752 encoding_map["latin-1"] = ISO_8859_1; // not iana standard
753 encoding_map["latin1"] = ISO_8859_1;
754 encoding_map["latin2"] = ISO_8859_2;
755 encoding_map["latin3"] = ISO_8859_3;
756 encoding_map["latin4"] = ISO_8859_4;
757 encoding_map["latin5"] = ISO_8859_9;
758 encoding_map["latin6"] = ISO_8859_10;
759 encoding_map["mac"] = MACINTOSH_ROMAN;
760 encoding_map["macintosh"] = MACINTOSH_ROMAN;
761 encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;
762 encoding_map["ms932"] = JAPANESE_CP932; // not iana standard
763 encoding_map["ms_kanji"] = JAPANESE_CP932;
764 encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;
765 encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;
766 encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard
767 encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard
768 encoding_map["sun_eu_greek"] = ISO_8859_7;
769 encoding_map["tab"] = TAMIL_BI;
770 encoding_map["tam"] = TAMIL_MONO;
771 encoding_map["tis-620"] = ISO_8859_11;
772 encoding_map["tscii"] = TSCII;
773 encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard
774 encoding_map["unicode"] = UNICODE; // not iana standard
775 encoding_map["unicode-1-1-utf-7"] = UTF7;
776 encoding_map["unicode-1-1-utf-8"] = UTF8;
777 encoding_map["unicode-2-0-utf-7"] = UTF7;
778 encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard
779 encoding_map["us"] = ISO_8859_1;
780 encoding_map["us-ascii"] = ISO_8859_1;
781 encoding_map["utf-16be"] = UTF16BE;
782 encoding_map["utf-16le"] = UTF16LE;
783 encoding_map["utf-32be"] = UTF32BE;
784 encoding_map["utf-32le"] = UTF32LE;
785 encoding_map["utf-7"] = UTF7;
786 encoding_map["utf-8"] = UTF8;
787 encoding_map["utf7"] = UTF7;
788 encoding_map["utf8"] = UTF8; // not iana standard
789 encoding_map["visual"] = HEBREW_VISUAL;
790 encoding_map["win-1250"] = MSFT_CP1250; // not iana standard
791 encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard
792 encoding_map["window-874"] = MSFT_CP874;
793 encoding_map["windows-1250"] = MSFT_CP1250;
794 encoding_map["windows-1251"] = RUSSIAN_CP1251;
795 encoding_map["windows-1252"] = MSFT_CP1252;
796 encoding_map["windows-1253"] = MSFT_CP1253;
797 encoding_map["windows-1254"] = MSFT_CP1254;
798 encoding_map["windows-1255"] = MSFT_CP1255;
799 encoding_map["windows-1256"] = MSFT_CP1256;
800 encoding_map["windows-1257"] = MSFT_CP1257;
801 encoding_map["windows-31j"] = JAPANESE_CP932;
802 encoding_map["windows-874"] = MSFT_CP874;
803 encoding_map["windows-936"] = GBK;
804 encoding_map["x-big5"] = CHINESE_BIG5;
805 encoding_map["x-binaryenc"] = BINARYENC; // not iana standard
806 encoding_map["x-cp1250"] = MSFT_CP1250;
807 encoding_map["x-cp1251"] = RUSSIAN_CP1251;
808 encoding_map["x-cp1252"] = MSFT_CP1252;
809 encoding_map["x-cp1253"] = MSFT_CP1253;
810 encoding_map["x-cp1254"] = MSFT_CP1254;
811 encoding_map["x-cp1255"] = MSFT_CP1255;
812 encoding_map["x-cp1256"] = MSFT_CP1256;
813 encoding_map["x-cp1257"] = MSFT_CP1257;
814 encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;
815 encoding_map["x-euc-tw"] = CHINESE_CNS;
816 encoding_map["x-gbk"] = GBK;
817 encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;
818 encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;
819 encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;
820 encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;
821 encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard
822 encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;
823 encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard
824 encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;
825 encoding_map["x-unicode-2-0-utf-7"] = UTF7;
826 encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard
827 encoding_map["x-x-big5"] = CHINESE_BIG5;
828 encoding_map["zh_cn.euc"] = CHINESE_GB;
829 encoding_map["zh_tw-big5"] = CHINESE_BIG5;
830 encoding_map["zh_tw-euc"] = CHINESE_CNS;
831
832 // Remove the entry for the empty string, if any.
833 encoding_map.erase("");
834 }
835 // Module initializer named `encodings` whose body calls InitEncodings(). NOTE(review): the macro definition is not visible here — confirm when it runs.
836 REGISTER_MODULE_INITIALIZER(encodings, {
837 InitEncodings();
838 });
839
840 // ----------------------------------------------------------------------
841 // EncodingNameAliasToEncoding()
842 //
843 // Takes an encoding name or alias and returns the matching Encoding
844 // enum value. The input is case insensitive. The recognized names are
845 // the union of the common IANA standard names, the charset names used
846 // in Netscape Navigator, and some common names we have been using.
847 // See: http://www.iana.org/assignments/character-sets
848 // http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html
849 //
850 // Returns UNKNOWN_ENCODING if encoding_name is NULL or matches no alias.
851 //
852 // TODO: Check if it is possible to remove the non-standard,
853 // non-netscape-use names. This routine is used for encoding detection
854 // from HTML meta info, so non-standard names may introduce noise
855 // on that detection.
856 //
857 // TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,
858 // or determine why such a unification is not possible.
859 // ----------------------------------------------------------------------
860 Encoding EncodingNameAliasToEncoding(const char *encoding_name) {
861 if (!encoding_name) {
862 return UNKNOWN_ENCODING;
863 }
864
865 // The map is initialized during InitGoogle() in a thread-safe manner; an empty map here means that initialization has not run.
866 CHECK(!encoding_map.empty()) << ": Must call InitGoogle()";
867
868 EncodingMap::iterator emi = encoding_map.find(encoding_name);
869 if (emi != encoding_map.end()) {
870 return emi->second;
871 } else {
872 return UNKNOWN_ENCODING;
873 }
874 }
875 #endif
876 // Returns the standard encoding name stored in kEncodingInfoTable for LATIN1, which this file treats as the default encoding.
877 const char* default_encoding_name() {
878 return kEncodingInfoTable[LATIN1].encoding_name_;
879 }
880 // Placeholder name string for an invalid encoding. NOTE(review): callers are not visible in this section — confirm where it is used.
881 static const char* const kInvalidEncodingName = "invalid_encoding";
882 // Returns kInvalidEncodingName; the returned pointer refers to a string literal, so the caller must not free or modify it.
883 const char *invalid_encoding_name() {
884 return kInvalidEncodingName;
885 }
886
887
888
889 // *************************************************************
890 // Miscellany
891 // *************************************************************
892 // Returns the preferred_web_output_encoding_ entry of kEncodingInfoTable for enc.
893 // When IsValidEncoding(enc) is false the table is not indexed and UTF8 is returned instead.
894 Encoding PreferredWebOutputEncoding(Encoding enc) {
895 return IsValidEncoding(enc)
896 ? kEncodingInfoTable[enc].preferred_web_output_encoding_
897 : UTF8;
898 }
OLDNEW
« no previous file with comments | « third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc ('k') | tools/run-perf-test.cfg » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698