third_party/cld/encodings/internal/encodings.cc - Issue 1956183002: CL for perf tryjob on linux

Side by Side Diff: third_party/cld/encodings/internal/encodings.cc

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.	1 // Copyright 2008 Google Inc. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Author: jrm@google.com (Jim Meehan)

3 // found in the LICENSE file.

4	3

5 #include "encodings/public/encodings.h"	4 #include "encodings/public/encodings.h"

6	5

7	6 #include <string.h> // for strcasecmp

8 // We do not use it, just to please a compiler and minimize ported	7 //#include <hash_map> // for _Hashtable_iterator, etc

9 // code changes.	8 #include <utility> // for pair

	9

	10 //#include "base/googleinit.h" // for REGISTER_MODULE_INITIALIZER

	11 //#include "base/logging.h" // for operator<<, Check_EQImpl, etc

	12 //#include "base/macros.h" // for COMPILE_ASSERT, etc

	13 //#include "base/mutex.h" // for Mutex, MutexLock

	14 //#include "util/hash/case_insensitive_hash.h"

	15 //#include "util/hash/hash.h"

	16 #include "encodings/compact_lang_det/win/cld_basictypes.h"

	17 #include "encodings/compact_lang_det/win/cld_logging.h"

	18 #include "encodings/compact_lang_det/win/cld_macros.h"

	19

	20 struct EncodingInfo {

	21 // The standard name for this encoding.

	22 //

	23 const char* encoding_name_;

	24

	25 // The "preferred MIME name" of an encoding as specified by the IANA at:

	26 // http://www.iana.org/assignments/character-sets

	27 //

	28 // Note that the preferred MIME name may differ slightly from the

	29 // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987

	30 //

	31 const char* mime_encoding_name_;

	32

	33 // NOTE: As of January 2007, it is a Google requirement that if an

	34 // encoding has an IANA name, then encoding_name_ and

	35 // mime_encoding_name_ must be the same string.

	36 //

	37 // However, there can be exceptions if there are compelling reasons.

	38 // For example, Japanese mobile handsets require the name

	39 // "Shift_JIS" in charset=... parameter in Content-Type headers to

	40 // process emoji (emoticons) in their private encodings. In that

	41 // case, mime_encoding_name_ should be "Shift_JIS", despite

	42 // encoding_name_ actually is "X-KDDI-Shift_JIS".

	43

	44 // Some multi-byte encodings use byte values that coincide with the

	45 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE

	46 // can misinterpret these, as indicated in an external XSS report from

	47 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We

	48 // also use UTF8 instead of encodings that we don't support in our

	49 // output, and we generally try to be conservative in what we send out.

	50 // Where the client asks for single- or double-byte encodings that are

	51 // not as common, we substitute a more common single- or double-byte

	52 // encoding, if there is one, thereby preserving the client's intent

	53 // to use less space than UTF-8. This also means that characters

	54 // outside the destination set will be converted to HTML NCRs (&#NNN;)

	55 // if requested.

	56

	57 Encoding preferred_web_output_encoding_;

	58 };

	59

	60 static const EncodingInfo kEncodingInfoTable[] = {

	61 { "ASCII", "ISO-8859-1", ISO_8859_1},

	62 { "Latin2", "ISO-8859-2", ISO_8859_2},

	63 { "Latin3", "ISO-8859-3", UTF8},

	64 // MSIE 6 does not support ISO-8859-3 (XSS issue)

	65 { "Latin4", "ISO-8859-4", ISO_8859_4},

	66 { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},

	67 { "Arabic", "ISO-8859-6", ISO_8859_6},

	68 { "Greek", "ISO-8859-7", ISO_8859_7},

	69 { "Hebrew", "ISO-8859-8", MSFT_CP1255},

	70 // we do not endorse the visual order

	71 { "Latin5", "ISO-8859-9", ISO_8859_9},

	72 { "Latin6", "ISO-8859-10", UTF8},

	73 // MSIE does not support ISO-8859-10 (XSS issue)

	74 { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP},

	75 { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},

	76 { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},

	77 // due to potential confusion with HTML syntax chars

	78 { "BIG5", "Big5", CHINESE_BIG5},

	79 { "GB", "GB2312", CHINESE_GB},

	80 { "EUC-CN",

	81 "EUC-CN",

	82 // Misnamed. Should be EUC-TW.

	83 CHINESE_BIG5},

	84 // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,

	85 // and EUC-TW is rare, so we prefer Big5 for output.

	86 { "KSC", "EUC-KR", KOREAN_EUC_KR},

	87 { "Unicode",

	88 "UTF-16LE",

	89 // Internet Explorer doesn't recognize "ISO-10646-UCS-2"

	90 UTF8

	91 // due to potential confusion with HTML syntax chars

	92 },

	93 { "EUC",

	94 "EUC", // Misnamed. Should be EUC-TW.

	95 CHINESE_BIG5

	96 // MSIE does not recognize "EUC" (XSS issue),

	97 // and EUC-TW is rare, so we prefer Big5 for output.

	98 },

	99 { "CNS",

	100 "CNS", // Misnamed. Should be EUC-TW.

	101 CHINESE_BIG5},

	102 // MSIE does not recognize "CNS" (XSS issue),

	103 // and EUC-TW is rare, so we prefer Big5 for output.

	104 { "BIG5-CP950",

	105 "BIG5-CP950", // Not an IANA name

	106 CHINESE_BIG5

	107 // MSIE does not recognize "BIG5-CP950" (XSS issue)

	108 },

	109 { "CP932", "CP932", // Not an IANA name

	110 JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue)

	111 { "UTF8", "UTF-8", UTF8},

	112 { "Unknown",

	113 "x-unknown", // Not an IANA name

	114 UTF8}, // UTF-8 is our default output encoding

	115 { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},

	116 { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},

	117 { "CP1251", "windows-1251", RUSSIAN_CP1251},

	118 { "CP1252", "windows-1252", MSFT_CP1252},

	119 { "KOI8U",

	120 "KOI8-U",

	121 ISO_8859_5}, // because koi8-u is not as common

	122 { "CP1250", "windows-1250", MSFT_CP1250},

	123 { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},

	124 { "CP1254", "windows-1254", MSFT_CP1254},

	125 { "CP1257", "windows-1257", MSFT_CP1257},

	126 { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},

	127 { "CP874", "windows-874", MSFT_CP874},

	128 { "CP1256", "windows-1256", MSFT_CP1256},

	129 { "CP1255", "windows-1255", MSFT_CP1255},

	130 { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},

	131 // Java does not support iso-8859-8-i

	132 { "VISUAL", "ISO-8859-8", MSFT_CP1255},

	133 // we do not endorse the visual order

	134 { "CP852", "cp852", MSFT_CP1250},

	135 // because cp852 is not as common

	136 { "CSN_369103", "csn_369103", MSFT_CP1250},

	137 // MSIE does not recognize "csn_369103" (XSS issue)

	138 { "CP1253", "windows-1253", MSFT_CP1253},

	139 { "CP866", "IBM866", RUSSIAN_CP1251},

	140 // because cp866 is not as common

	141 { "ISO-8859-13", "ISO-8859-13", UTF8},

	142 // because iso-8859-13 is not widely supported

	143 { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},

	144 // due to potential confusion with HTML syntax chars

	145 { "GBK", "GBK", GBK},

	146 { "GB18030", "GB18030", GBK},

	147 // because gb18030 is not widely supported

	148 { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},

	149 // because Big5-HKSCS is not widely supported

	150 { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},

	151 // due to potential confusion with HTML syntax chars

	152 { "TSCII", "tscii", UTF8},

	153 // we do not have an output converter for this font encoding

	154 { "TAM", "tam", UTF8},

	155 // we do not have an output converter for this font encoding

	156 { "TAB", "tab", UTF8},

	157 // we do not have an output converter for this font encoding

	158 { "JAGRAN", "jagran", UTF8},

	159 // we do not have an output converter for this font encoding

	160 { "MACINTOSH", "MACINTOSH", ISO_8859_1},

	161 // because macintosh is relatively uncommon

	162 { "UTF7", "UTF-7",

	163 UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated

	164 { "BHASKAR", "bhaskar",

	165 UTF8}, // we do not have an output converter for this font encoding

	166 { "HTCHANAKYA", "htchanakya", // not an IANA charset name.

	167 UTF8}, // we do not have an output converter for this font encoding

	168 { "UTF-16BE", "UTF-16BE",

	169 UTF8}, // due to potential confusion with HTML syntax chars

	170 { "UTF-16LE", "UTF-16LE",

	171 UTF8}, // due to potential confusion with HTML syntax chars

	172 { "UTF-32BE", "UTF-32BE",

	173 UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web

	174 { "UTF-32LE", "UTF-32LE",

	175 UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web

	176 { "X-BINARYENC", "x-binaryenc", // Not an IANA name

	177 UTF8}, // because this one is not intended for output (just input)

	178 { "HZ-GB-2312", "HZ-GB-2312",

	179 CHINESE_GB}, // due to potential confusion with HTML syntax chars

	180 { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name

	181 UTF8}, // because this one is not intended for output (just input)

	182 { "X-TAM-ELANGO", "x-tam-elango",

	183 UTF8}, // we do not have an output converter for this font encoding

	184 { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",

	185 UTF8}, // we do not have an output converter for this font encoding

	186 { "X-TAM-SHREE", "x-tam-shree",

	187 UTF8}, // we do not have an output converter for this font encoding

	188 { "X-TAM-TBOOMIS", "x-tam-tboomis",

	189 UTF8}, // we do not have an output converter for this font encoding

	190 { "X-TAM-TMNEWS", "x-tam-tmnews",

	191 UTF8}, // we do not have an output converter for this font encoding

	192 { "X-TAM-WEBTAMIL", "x-tam-webtamil",

	193 UTF8}, // we do not have an output converter for this font encoding

	194

	195 { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

	196 // KDDI version of Shift_JIS with Google Emoji PUA mappings.

	197 // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses

	198 // "Shift_JIS" in HTTP headers and email messages.

	199

	200 { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

	201 // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.

	202 // See the comment at KDDI_SHIFT_JIS for other issues.

	203

	204 { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

	205 // SoftBank version of Shift_JIS with Google Emoji PUA mappings.

	206 // See the comment at KDDI_SHIFT_JIS for other issues.

	207

	208 { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},

	209 // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.

	210 // See the comment at KDDI_SHIFT_JIS for other issues.

	211 // The preferred Web encoding is due to potential confusion with

	212 // HTML syntax chars.

	213

	214 { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},

	215 // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.

	216 // See the comment at KDDI_SHIFT_JIS for other issues.

	217 // The preferred Web encoding is due to potential confusion with

	218 // HTML syntax chars.

	219

	220 // Please refer to NOTE: section in the comments in the definition

	221 // of "struct I18NInfoByEncoding", before adding new encodings.

	222

	223 };

	224

	225

	226

	227 COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,

	228 kEncodingInfoTable_has_incorrect_size);

	229

	230 Encoding default_encoding() {return LATIN1;}

	231

	232 // *************************************************************

	233 // Encoding predicates

	234 // IsValidEncoding()

	235 // IsEncEncCompatible

	236 // IsEncodingWithSupportedLanguage

	237 // IsSupersetOfAscii7Bit

	238 // Is8BitEncoding

	239 // IsCJKEncoding

	240 // IsHebrewEncoding

	241 // IsRightToLeftEncoding

	242 // IsLogicalRightToLeftEncoding

	243 // IsVisualRightToLeftEncoding

	244 // IsIso2022Encoding

	245 // IsIso2022JpOrVariant

	246 // IsShiftJisOrVariant

	247 // IsJapaneseCellPhoneCarrierSpecificEncoding

	248 // *************************************************************

	249

	250 bool IsValidEncoding(Encoding enc) {

	251 return ((enc >= 0) && (enc < kNumEncodings));

	252 }

	253

	254 bool IsEncEncCompatible(const Encoding from, const Encoding to) {

	255 // Tests compatibility between the "from" and "to" encodings; in

	256 // the typical case -- when both are valid known encodings -- this

	257 // returns true iff converting from first to second is a no-op.

	258 if (!IsValidEncoding(from) \|\| !IsValidEncoding(to)) {

	259 return false; // we only work with valid encodings...

	260 } else if (to == from) {

	261 return true; // the trivial common case

	262 }

	263

	264 if (to == UNKNOWN_ENCODING) {

	265 return true; // all valid encodings are compatible with the unknown

	266 }

	267

	268 if (from == UNKNOWN_ENCODING) {

	269 return false; // no unknown encoding is compatible with one that is

	270 }

	271

	272 if (from == ASCII_7BIT) {

	273 return IsSupersetOfAscii7Bit(to);

	274 }

	275

	276 return (from == ISO_8859_1 && to == MSFT_CP1252) \|\|

	277 (from == ISO_8859_8 && to == HEBREW_VISUAL) \|\|

	278 (from == HEBREW_VISUAL && to == ISO_8859_8) \|\|

	279 (from == ISO_8859_9 && to == MSFT_CP1254) \|\|

	280 (from == ISO_8859_11 && to == MSFT_CP874) \|\|

	281 (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) \|\|

	282 (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) \|\|

	283 (from == CHINESE_GB && to == GBK) \|\|

	284 (from == CHINESE_GB && to == GB18030) \|\|

	285 (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) \|\|

	286 (from == CHINESE_EUC_CN && to == CHINESE_CNS) \|\|

	287 (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) \|\|

	288 (from == CHINESE_EUC_DEC && to == CHINESE_CNS) \|\|

	289 (from == CHINESE_CNS && to == CHINESE_EUC_CN) \|\|

	290 (from == CHINESE_CNS && to == CHINESE_EUC_DEC);

	291 }

	292

	293 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given

	294 // encoding represent the same characters as they do in ISO_8859_1.

	295

	296 // TODO: This list could be expanded. Many other encodings are supersets

	297 // of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two

	298 // encodings that I know for a fact should not be in this list.

	299 bool IsSupersetOfAscii7Bit(Encoding e) {

	300 switch (e) {

	301 case ISO_8859_1:

	302 case ISO_8859_2:

	303 case ISO_8859_3:

	304 case ISO_8859_4:

	305 case ISO_8859_5:

	306 case ISO_8859_6:

	307 case ISO_8859_7:

	308 case ISO_8859_8:

	309 case ISO_8859_9:

	310 case ISO_8859_10:

	311 case JAPANESE_EUC_JP:

	312 case JAPANESE_SHIFT_JIS:

	313 case CHINESE_BIG5:

	314 case CHINESE_GB:

	315 case CHINESE_EUC_CN:

	316 case KOREAN_EUC_KR:

	317 case CHINESE_EUC_DEC:

	318 case CHINESE_CNS:

	319 case CHINESE_BIG5_CP950:

	320 case JAPANESE_CP932:

	321 case UTF8:

	322 case UNKNOWN_ENCODING:

	323 case ASCII_7BIT:

	324 case RUSSIAN_KOI8_R:

	325 case RUSSIAN_CP1251:

	326 case MSFT_CP1252:

	327 case RUSSIAN_KOI8_RU:

	328 case MSFT_CP1250:

	329 case ISO_8859_15:

	330 case MSFT_CP1254:

	331 case MSFT_CP1257:

	332 case ISO_8859_11:

	333 case MSFT_CP874:

	334 case MSFT_CP1256:

	335 case MSFT_CP1255:

	336 case ISO_8859_8_I:

	337 case HEBREW_VISUAL:

	338 case CZECH_CP852:

	339 case MSFT_CP1253:

	340 case RUSSIAN_CP866:

	341 case ISO_8859_13:

	342 case GBK:

	343 case GB18030:

	344 case BIG5_HKSCS:

	345 case MACINTOSH_ROMAN:

	346 return true;

	347 default:

	348 return false;

	349 }

	350 }

	351

	352 // To be an 8-bit encoding means that there are fewer than 256 symbols.

	353 // Each byte determines a new character; there are no multi-byte sequences.

	354

	355 // TODO: This list could maybe be expanded. Other encodings may be 8-bit.

	356 bool Is8BitEncoding(Encoding e) {

	357 switch (e) {

	358 case ASCII_7BIT:

	359 case ISO_8859_1:

	360 case ISO_8859_2:

	361 case ISO_8859_3:

	362 case ISO_8859_4:

	363 case ISO_8859_5:

	364 case ISO_8859_6:

	365 case ISO_8859_7:

	366 case ISO_8859_8:

	367 case ISO_8859_8_I:

	368 case ISO_8859_9:

	369 case ISO_8859_10:

	370 case ISO_8859_11:

	371 case ISO_8859_13:

	372 case ISO_8859_15:

	373 case MSFT_CP1252:

	374 case MSFT_CP1253:

	375 case MSFT_CP1254:

	376 case MSFT_CP1255:

	377 case MSFT_CP1256:

	378 case MSFT_CP1257:

	379 case RUSSIAN_KOI8_R:

	380 case RUSSIAN_KOI8_RU:

	381 case RUSSIAN_CP866:

	382 return true;

	383 default:

	384 return false;

	385 }

	386 }

	387

	388 bool IsCJKEncoding(Encoding e) {

	389 switch (e) {

	390 case JAPANESE_EUC_JP:

	391 case JAPANESE_SHIFT_JIS:

	392 case JAPANESE_JIS:

	393 case CHINESE_BIG5:

	394 case CHINESE_GB:

	395 case CHINESE_EUC_CN:

	396 case KOREAN_EUC_KR:

	397 case CHINESE_EUC_DEC:

	398 case CHINESE_CNS:

	399 case CHINESE_BIG5_CP950:

	400 case JAPANESE_CP932:

	401 case ISO_2022_KR:

	402 case GBK:

	403 case GB18030:

	404 case BIG5_HKSCS:

	405 case ISO_2022_CN:

	406 case HZ_GB_2312:

	407 return true;

	408 default:

	409 return false;

	410 }

	411 }

	412

	413 bool IsHebrewEncoding(Encoding e) {

	414 return (e == ISO_8859_8 \|\|

	415 e == ISO_8859_8_I \|\|

	416 e == MSFT_CP1255 \|\|

	417 e == HEBREW_VISUAL);

	418 }

	419

	420

	421

	422 bool IsRightToLeftEncoding(Encoding enc) {

	423 switch (enc) {

	424 case MSFT_CP1255:

	425 case MSFT_CP1256:

	426 case ARABIC_ENCODING:

	427 case HEBREW_ENCODING:

	428 case ISO_8859_8_I:

	429 case HEBREW_VISUAL:

	430 return true;

	431 default:

	432 return false;

	433 }

	434 }

	435

	436 bool IsLogicalRightToLeftEncoding(Encoding enc) {

	437 return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc);

	438 }

	439

	440 // Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)

	441 // is NOT visual.

	442 bool IsVisualRightToLeftEncoding(Encoding enc) {

	443 switch (enc) {

	444 case HEBREW_ENCODING:

	445 case HEBREW_VISUAL:

	446 return true;

	447 default:

	448 return false;

	449 }

	450 }

	451

	452

	453

	454

	455

	456 bool IsIso2022Encoding(Encoding enc) {

	457 return (IsIso2022JpOrVariant(enc) \|\|

	458 enc == ISO_2022_KR \|\|

	459 enc == ISO_2022_CN);

	460 }

	461

	462 bool IsIso2022JpOrVariant(Encoding enc) {

	463 return (enc == JAPANESE_JIS \|\|

	464 enc == KDDI_ISO_2022_JP \|\|

	465 enc == SOFTBANK_ISO_2022_JP);

	466 }

	467

	468 bool IsShiftJisOrVariant(Encoding enc) {

	469 return (enc == JAPANESE_SHIFT_JIS \|\|

	470 enc == JAPANESE_CP932 \|\|

	471 enc == KDDI_SHIFT_JIS \|\|

	472 enc == DOCOMO_SHIFT_JIS \|\|

	473 enc == SOFTBANK_SHIFT_JIS);

	474 }

	475

	476 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {

	477 return (enc == KDDI_ISO_2022_JP \|\|

	478 enc == KDDI_SHIFT_JIS \|\|

	479 enc == DOCOMO_SHIFT_JIS \|\|

	480 enc == SOFTBANK_SHIFT_JIS \|\|

	481 enc == SOFTBANK_ISO_2022_JP);

	482 }

	483

	484

	485 // *************************************************************

	486 // ENCODING NAMES

	487 // EncodingName() [Encoding to name]

	488 // MimeEncodingName() [Encoding to name]

	489 // EncodingFromName() [name to Encoding]

	490 // EncodingNameAliasToEncoding() [name to Encoding]

	491 // default_encoding_name()

	492 // invalid_encoding_name()

	493 // *************************************************************

	494

10 const char * EncodingName(const Encoding enc) {	495 const char * EncodingName(const Encoding enc) {

11 return "";	496 if ( (enc < 0) \|\| (enc >= kNumEncodings) )

12 }	497 return invalid_encoding_name();

	498 return kEncodingInfoTable[enc].encoding_name_;

	499 }

	500

	501 // TODO: Unify MimeEncodingName and EncodingName, or determine why

	502 // such a unification is not possible.

	503

	504 const char * MimeEncodingName(Encoding enc) {

	505 if ( (enc < 0) \|\| (enc >= kNumEncodings) )

	506 return ""; // TODO(jrm) Should this be invalid_encoding_name()?

	507 return kEncodingInfoTable[enc].mime_encoding_name_;

	508 }

	509

	510 bool EncodingFromName(const char* enc_name, Encoding *encoding) {

	511 *encoding = UNKNOWN_ENCODING;

	512 if ( enc_name == NULL ) return false;

	513

	514 for ( int i = 0; i < kNumEncodings; i++ ) {

	515 if ( !strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) {

	516 *encoding = static_cast<Encoding>(i);

	517 return true;

	518 }

	519 }

	520 return false;

	521 }

	522

	523 #if 0

	524 // The encoding_map maps standard and non-standard encoding-names

	525 // (strings) to Encoding enums. It is used only by

	526 // EncodingNameAliasToEncoding. Note that the map uses

	527 // case-insensitive hash and comparison functions.

	528

	529 typedef hash_map <const char *, Encoding,

	530 CStringAlnumCaseHash,

	531 CStringAlnumCaseEqual> EncodingMap;

	532

	533 static EncodingMap encoding_map;

	534

	535 // Mutex for locking the code that initializes encoding_map.

	536 // static Mutex encodings_init_mutex(base::LINKER_INITIALIZED);

	537

	538 void InitEncodings() {

	539 // For thread safety, keep a mutex while initializing this map.

	540 // Also allow this function to be called more than once and

	541 // gracefully exiting if that occurs.

	542 // MutexLock lock(&encodings_init_mutex);

	543 if (!encoding_map.empty()) {

	544 // Already initialized

	545 return;

	546 }

	547

	548 // Initialize the map with all the "standard" encoding names,

	549 // i.e., the ones returned by EncodingName and MimeEncodingName.

	550 //

	551 // First, add internal encoding names returned by EncodingName().

	552 for (int i = 0; i < NUM_ENCODINGS; ++i) {

	553 Encoding e = static_cast<Encoding>(i);

	554 // Internal encoding names must be unique.

	555 // The internal names are guaranteed to be unique by the CHECK_EQ.

	556 const char *encoding_name = EncodingName(e);

	557 CHECK_EQ(0, encoding_map.count(encoding_name))

	558 << "Duplicate found for " << encoding_name;

	559 encoding_map[encoding_name] = e;

	560 }

	561 // Then, add mime encoding names returned by MimeEncodingName().

	562 // We don't override existing entries, to give precedence to entries

	563 // added earlier.

	564 for (int i = 0; i < NUM_ENCODINGS; ++i) {

	565 Encoding e = static_cast<Encoding>(i);

	566 // Note that MimeEncodingName() can return the same mime encoding

	567 // name for different encoding enums like JAPANESE_SHIFT_JIS and

	568 // KDDI_SHIFT_JIS. In that case, the encoding enum first seen

	569 // will be the value for the encoding name in the map.

	570 const char *mime_encoding_name = MimeEncodingName(e);

	571 if (encoding_map.count(mime_encoding_name) == 0) {

	572 encoding_map[mime_encoding_name] = e;

	573 }

	574 }

	575

	576 // Add some non-standard names: alternate spellings, common typos,

	577 // etc. (It does no harm to add names already in the map.) Note

	578 // that although the map is case-insensitive, by convention the

	579 // keys are written here in lower case. For ease of maintenance,

	580 // they are listed in alphabetical order.

	581 encoding_map["5601"] = KOREAN_EUC_KR;

	582 encoding_map["646"] = ASCII_7BIT;

	583 encoding_map["852"] = CZECH_CP852;

	584 encoding_map["866"] = RUSSIAN_CP866;

	585 encoding_map["8859-1"] = ISO_8859_1;

	586 encoding_map["ansi-1251"] = RUSSIAN_CP1251;

	587 encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;

	588 encoding_map["arabic"] = ISO_8859_6;

	589 encoding_map["ascii"] = ISO_8859_1;

	590 encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard

	591 encoding_map["asmo-708"] = ISO_8859_6;

	592 encoding_map["bhaskar"] = BHASKAR;

	593 encoding_map["big5"] = CHINESE_BIG5;

	594 encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard

	595 encoding_map["big5-hkscs"] = BIG5_HKSCS;

	596 encoding_map["chinese"] = CHINESE_GB;

	597 encoding_map["cns"] = CHINESE_CNS; // not iana standard

	598 encoding_map["cns11643"] = CHINESE_CNS;

	599 encoding_map["cp1250"] = MSFT_CP1250; // not iana standard

	600 encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard

	601 encoding_map["cp1252"] = MSFT_CP1252; // not iana standard

	602 encoding_map["cp1253"] = MSFT_CP1253; // not iana standard

	603 encoding_map["cp1254"] = MSFT_CP1254; // not iana standard

	604 encoding_map["cp1255"] = MSFT_CP1255;

	605 encoding_map["cp1256"] = MSFT_CP1256;

	606 encoding_map["cp1257"] = MSFT_CP1257; // not iana standard

	607 encoding_map["cp819"] = ISO_8859_1;

	608 encoding_map["cp852"] = CZECH_CP852;

	609 encoding_map["cp866"] = RUSSIAN_CP866;

	610 encoding_map["cp-866"] = RUSSIAN_CP866;

	611 encoding_map["cp874"] = MSFT_CP874;

	612 encoding_map["cp932"] = JAPANESE_CP932; // not iana standard

	613 encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard

	614 encoding_map["csbig5"] = CHINESE_BIG5;

	615 encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;

	616 encoding_map["cseuckr"] = KOREAN_EUC_KR;

	617 encoding_map["csgb2312"] = CHINESE_GB;

	618 encoding_map["csibm852"] = CZECH_CP852;

	619 encoding_map["csibm866"] = RUSSIAN_CP866;

	620 encoding_map["csiso2022jp"] = JAPANESE_JIS;

	621 encoding_map["csiso2022kr"] = ISO_2022_KR;

	622 encoding_map["csiso58gb231280"] = CHINESE_GB;

	623 encoding_map["csiso88598i"] = ISO_8859_8_I;

	624 encoding_map["csisolatin1"] = ISO_8859_1;

	625 encoding_map["csisolatin2"] = ISO_8859_2;

	626 encoding_map["csisolatin3"] = ISO_8859_3;

	627 encoding_map["csisolatin4"] = ISO_8859_4;

	628 encoding_map["csisolatin5"] = ISO_8859_9;

	629 encoding_map["csisolatin6"] = ISO_8859_10;

	630 encoding_map["csisolatinarabic"] = ISO_8859_6;

	631 encoding_map["csisolatincyrillic"] = ISO_8859_5;

	632 encoding_map["csisolatingreek"] = ISO_8859_7;

	633 encoding_map["csisolatinhebrew"] = ISO_8859_8;

	634 encoding_map["csksc56011987"] = KOREAN_EUC_KR;

	635 encoding_map["csmacintosh"] = MACINTOSH_ROMAN;

	636 encoding_map["csn-369103"] = CZECH_CSN_369103;

	637 encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;

	638 encoding_map["csunicode"] = UTF16BE;

	639 encoding_map["csunicode11"] = UTF16BE;

	640 encoding_map["csunicode11utf7"] = UTF7;

	641 encoding_map["csunicodeascii"] = UTF16BE;

	642 encoding_map["csunicodelatin1"] = UTF16BE;

	643 encoding_map["cyrillic"] = ISO_8859_5;

	644 encoding_map["ecma-114"] = ISO_8859_6;

	645 encoding_map["ecma-118"] = ISO_8859_7;

	646 encoding_map["elot_928"] = ISO_8859_7;

	647 encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard

	648 encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard

	649 encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard

	650 encoding_map["euc-jp"] = JAPANESE_EUC_JP;

	651 encoding_map["euc-kr"] = KOREAN_EUC_KR;

	652 encoding_map["eucgb2312_cn"] = CHINESE_GB;

	653 encoding_map["gb"] = CHINESE_GB; // not iana standard

	654 encoding_map["gb18030"] = GB18030;

	655 encoding_map["gb2132"] = CHINESE_GB; // common typo

	656 encoding_map["gb2312"] = CHINESE_GB;

	657 encoding_map["gb_2312-80"] = CHINESE_GB;

	658 encoding_map["gbk"] = GBK;

	659 encoding_map["greek"] = ISO_8859_7;

	660 encoding_map["greek8"] = ISO_8859_7;

	661 encoding_map["hebrew"] = ISO_8859_8;

	662 encoding_map["htchanakya"] = HTCHANAKYA;

	663 encoding_map["hz-gb-2312"] = HZ_GB_2312;

	664 encoding_map["ibm819"] = ISO_8859_1;

	665 encoding_map["ibm852"] = CZECH_CP852;

	666 encoding_map["ibm874"] = MSFT_CP874;

	667 encoding_map["iso-10646"] = UTF16BE;

	668 encoding_map["iso-10646-j-1"] = UTF16BE;

	669 encoding_map["iso-10646-ucs-2"] = UNICODE;

	670 encoding_map["iso-10646-ucs-4"] = UTF32BE;

	671 encoding_map["iso-10646-ucs-basic"] = UTF16BE;

	672 encoding_map["iso-10646-unicode-latin1"] = UTF16BE;

	673 encoding_map["iso-2022-cn"] = ISO_2022_CN;

	674 encoding_map["iso-2022-jp"] = JAPANESE_JIS;

	675 encoding_map["iso-2022-kr"] = ISO_2022_KR;

	676 encoding_map["iso-8559-1"] = ISO_8859_1; // common typo

	677 encoding_map["iso-874"] = MSFT_CP874;

	678 encoding_map["iso-8858-1"] = ISO_8859_1; // common typo

	679 // iso-8859-0 was a temporary name, eventually renamed iso-8859-15

	680 encoding_map["iso-8859-0"] = ISO_8859_15;

	681 encoding_map["iso-8859-1"] = ISO_8859_1;

	682 encoding_map["iso-8859-10"] = ISO_8859_10;

	683 encoding_map["iso-8859-11"] = ISO_8859_11;

	684 encoding_map["iso-8859-13"] = ISO_8859_13;

	685 encoding_map["iso-8859-15"] = ISO_8859_15;

	686 encoding_map["iso-8859-2"] = ISO_8859_2;

	687 encoding_map["iso-8859-3"] = ISO_8859_3;

	688 encoding_map["iso-8859-4"] = ISO_8859_4;

	689 encoding_map["iso-8859-5"] = ISO_8859_5;

	690 encoding_map["iso-8859-6"] = ISO_8859_6;

	691 encoding_map["iso-8859-7"] = ISO_8859_7;

	692 encoding_map["iso-8859-8"] = ISO_8859_8;

	693 encoding_map["iso-8859-8-i"] = ISO_8859_8_I;

	694 encoding_map["iso-8859-9"] = ISO_8859_9;

	695 encoding_map["iso-9959-1"] = ISO_8859_1; // common typo

	696 encoding_map["iso-ir-100"] = ISO_8859_1;

	697 encoding_map["iso-ir-101"] = ISO_8859_2;

	698 encoding_map["iso-ir-109"] = ISO_8859_3;

	699 encoding_map["iso-ir-110"] = ISO_8859_4;

	700 encoding_map["iso-ir-126"] = ISO_8859_7;

	701 encoding_map["iso-ir-127"] = ISO_8859_6;

	702 encoding_map["iso-ir-138"] = ISO_8859_8;

	703 encoding_map["iso-ir-144"] = ISO_8859_5;

	704 encoding_map["iso-ir-148"] = ISO_8859_9;

	705 encoding_map["iso-ir-149"] = KOREAN_EUC_KR;

	706 encoding_map["iso-ir-157"] = ISO_8859_10;

	707 encoding_map["iso-ir-58"] = CHINESE_GB;

	708 encoding_map["iso-latin-1"] = ISO_8859_1;

	709 encoding_map["iso_2022-cn"] = ISO_2022_CN;

	710 encoding_map["iso_2022-kr"] = ISO_2022_KR;

	711 encoding_map["iso_8859-1"] = ISO_8859_1;

	712 encoding_map["iso_8859-10:1992"] = ISO_8859_10;

	713 encoding_map["iso_8859-11"] = ISO_8859_11;

	714 encoding_map["iso_8859-13"] = ISO_8859_13;

	715 encoding_map["iso_8859-15"] = ISO_8859_15;

	716 encoding_map["iso_8859-1:1987"] = ISO_8859_1;

	717 encoding_map["iso_8859-2"] = ISO_8859_2;

	718 encoding_map["iso_8859-2:1987"] = ISO_8859_2;

	719 encoding_map["iso_8859-3"] = ISO_8859_3;

	720 encoding_map["iso_8859-3:1988"] = ISO_8859_3;

	721 encoding_map["iso_8859-4"] = ISO_8859_4;

	722 encoding_map["iso_8859-4:1988"] = ISO_8859_4;

	723 encoding_map["iso_8859-5"] = ISO_8859_5;

	724 encoding_map["iso_8859-5:1988"] = ISO_8859_5;

	725 encoding_map["iso_8859-6"] = ISO_8859_6;

	726 encoding_map["iso_8859-6:1987"] = ISO_8859_6;

	727 encoding_map["iso_8859-7"] = ISO_8859_7;

	728 encoding_map["iso_8859-7:1987"] = ISO_8859_7;

	729 encoding_map["iso_8859-8"] = ISO_8859_8;

	730 encoding_map["iso_8859-8:1988:"] = ISO_8859_8;

	731 encoding_map["iso_8859-9"] = ISO_8859_9;

	732 encoding_map["iso_8859-9:1989"] = ISO_8859_9;

	733 encoding_map["jagran"] = JAGRAN;

	734 encoding_map["jis"] = JAPANESE_JIS; // not iana standard

	735 encoding_map["koi8-cs"] = CZECH_CSN_369103;

	736 encoding_map["koi8-r"] = RUSSIAN_KOI8_R;

	737 encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard

	738 encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;

	739 encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard

	740 encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard

	741 encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant

	742 encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard

	743 encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard

	744 encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;

	745 encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard

	746 encoding_map["l1"] = ISO_8859_1;

	747 encoding_map["l2"] = ISO_8859_2;

	748 encoding_map["l3"] = ISO_8859_3;

	749 encoding_map["l4"] = ISO_8859_4;

	750 encoding_map["l5"] = ISO_8859_9;

	751 encoding_map["l6"] = ISO_8859_10;

	752 encoding_map["latin-1"] = ISO_8859_1; // not iana standard

	753 encoding_map["latin1"] = ISO_8859_1;

	754 encoding_map["latin2"] = ISO_8859_2;

	755 encoding_map["latin3"] = ISO_8859_3;

	756 encoding_map["latin4"] = ISO_8859_4;

	757 encoding_map["latin5"] = ISO_8859_9;

	758 encoding_map["latin6"] = ISO_8859_10;

	759 encoding_map["mac"] = MACINTOSH_ROMAN;

	760 encoding_map["macintosh"] = MACINTOSH_ROMAN;

	761 encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;

	762 encoding_map["ms932"] = JAPANESE_CP932; // not iana standard

	763 encoding_map["ms_kanji"] = JAPANESE_CP932;

	764 encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;

	765 encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;

	766 encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard

	767 encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard

	768 encoding_map["sun_eu_greek"] = ISO_8859_7;

	769 encoding_map["tab"] = TAMIL_BI;

	770 encoding_map["tam"] = TAMIL_MONO;

	771 encoding_map["tis-620"] = ISO_8859_11;

	772 encoding_map["tscii"] = TSCII;

	773 encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard

	774 encoding_map["unicode"] = UNICODE; // not iana standard

	775 encoding_map["unicode-1-1-utf-7"] = UTF7;

	776 encoding_map["unicode-1-1-utf-8"] = UTF8;

	777 encoding_map["unicode-2-0-utf-7"] = UTF7;

	778 encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard

	779 encoding_map["us"] = ISO_8859_1;

	780 encoding_map["us-ascii"] = ISO_8859_1;

	781 encoding_map["utf-16be"] = UTF16BE;

	782 encoding_map["utf-16le"] = UTF16LE;

	783 encoding_map["utf-32be"] = UTF32BE;

	784 encoding_map["utf-32le"] = UTF32LE;

	785 encoding_map["utf-7"] = UTF7;

	786 encoding_map["utf-8"] = UTF8;

	787 encoding_map["utf7"] = UTF7;

	788 encoding_map["utf8"] = UTF8; // not iana standard

	789 encoding_map["visual"] = HEBREW_VISUAL;

	790 encoding_map["win-1250"] = MSFT_CP1250; // not iana standard

	791 encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard

	792 encoding_map["window-874"] = MSFT_CP874;

	793 encoding_map["windows-1250"] = MSFT_CP1250;

	794 encoding_map["windows-1251"] = RUSSIAN_CP1251;

	795 encoding_map["windows-1252"] = MSFT_CP1252;

	796 encoding_map["windows-1253"] = MSFT_CP1253;

	797 encoding_map["windows-1254"] = MSFT_CP1254;

	798 encoding_map["windows-1255"] = MSFT_CP1255;

	799 encoding_map["windows-1256"] = MSFT_CP1256;

	800 encoding_map["windows-1257"] = MSFT_CP1257;

	801 encoding_map["windows-31j"] = JAPANESE_CP932;

	802 encoding_map["windows-874"] = MSFT_CP874;

	803 encoding_map["windows-936"] = GBK;

	804 encoding_map["x-big5"] = CHINESE_BIG5;

	805 encoding_map["x-binaryenc"] = BINARYENC; // not iana standard

	806 encoding_map["x-cp1250"] = MSFT_CP1250;

	807 encoding_map["x-cp1251"] = RUSSIAN_CP1251;

	808 encoding_map["x-cp1252"] = MSFT_CP1252;

	809 encoding_map["x-cp1253"] = MSFT_CP1253;

	810 encoding_map["x-cp1254"] = MSFT_CP1254;

	811 encoding_map["x-cp1255"] = MSFT_CP1255;

	812 encoding_map["x-cp1256"] = MSFT_CP1256;

	813 encoding_map["x-cp1257"] = MSFT_CP1257;

	814 encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;

	815 encoding_map["x-euc-tw"] = CHINESE_CNS;

	816 encoding_map["x-gbk"] = GBK;

	817 encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;

	818 encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;

	819 encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;

	820 encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;

	821 encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard

	822 encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;

	823 encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard

	824 encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;

	825 encoding_map["x-unicode-2-0-utf-7"] = UTF7;

	826 encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard

	827 encoding_map["x-x-big5"] = CHINESE_BIG5;

	828 encoding_map["zh_cn.euc"] = CHINESE_GB;

	829 encoding_map["zh_tw-big5"] = CHINESE_BIG5;

	830 encoding_map["zh_tw-euc"] = CHINESE_CNS;

	831

	832 // Remove the entry for the empty string, if any.

	833 encoding_map.erase("");

	834 }

	835 // Passes a call to InitEncodings() to REGISTER_MODULE_INITIALIZER under the name "encodings".

	836 REGISTER_MODULE_INITIALIZER(encodings, {

	837 InitEncodings();  // NOTE(review): the macro is defined outside this file; presumably this body runs once during InitGoogle() -- confirm.

	838 });

	839

	840 // ----------------------------------------------------------------------

	841 // EncodingNameAliasToEncoding()

	842 //

	843 // This function takes an encoding name/alias and returns the Encoding

	844 // enum. The input is case insensitive. It is the union of the common

	845 // IANA standard names, the charset names used in Netscape Navigator,

	846 // and some common names we have been using.

	847 // See: http://www.iana.org/assignments/character-sets

	848 // http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html

	849 //

	850 // UNKNOWN_ENCODING is returned if none matches.

	851 //

	852 // TODO: Check if it is possible to remove the non-standard,

	853 // non-netscape-use names. It is because this routine is used for

	854 // encoding detections from html meta info. Non-standard names may

	855 // introduce noise on encoding detection.

	856 //

	857 // TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,

	858 // or determine why such a unification is not possible.

	859 // ----------------------------------------------------------------------

	860 Encoding EncodingNameAliasToEncoding(const char *encoding_name) {

	861   // A null name cannot match any alias.

	862   if (!encoding_name) return UNKNOWN_ENCODING;

	863

	864   // The alias table is expected to be filled in before any lookup; an

	865   // empty table means initialization was skipped.

	866   CHECK(!encoding_map.empty()) << ": Must call InitGoogle()";

	867

	868   // Names absent from the table are reported as unknown.

	869   EncodingMap::iterator match = encoding_map.find(encoding_name);

	870   if (match == encoding_map.end()) {

	871     return UNKNOWN_ENCODING;

	872   }

	873   return match->second;

	874 }

	875 #endif

	876 // Returns the encoding_name_ (standard name) recorded in kEncodingInfoTable for LATIN1.

	877 const char* default_encoding_name() {

	878 return kEncodingInfoTable[LATIN1].encoding_name_;

	879 }

	880

	881 static const char* const kInvalidEncodingName = "invalid_encoding";  // Backing string for invalid_encoding_name().

	882 // Returns the constant string "invalid_encoding" (a string literal; callers must not modify or free it).

	883 const char *invalid_encoding_name() {

	884 return kInvalidEncodingName;

	885 }

	886

	887

	888

	889 // *************************************************************

	890 // Miscellany

	891 // *************************************************************

	892

	893 // Looks up the preferred web output encoding recorded for enc in kEncodingInfoTable.

	894 Encoding PreferredWebOutputEncoding(Encoding enc) {

	895   // Encodings that fail IsValidEncoding() fall back to UTF8.

	896   if (!IsValidEncoding(enc)) return UTF8;

	897   return kEncodingInfoTable[enc].preferred_web_output_encoding_;

	898 }

OLD	NEW

« no previous file with comments | « third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc ('k') | tools/run-perf-test.cfg » ('j') | no next file with comments »