OLD | NEW |
1 // Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. | 1 // Copyright 2008 Google Inc. All Rights Reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Author: jrm@google.com (Jim Meehan) |
3 // found in the LICENSE file. | |
4 | 3 |
5 #include "encodings/public/encodings.h" | 4 #include "encodings/public/encodings.h" |
6 | 5 |
7 | 6 #include <string.h> // for strcasecmp |
8 // We do not use it, just to please a compiler and minimize ported | 7 //#include <hash_map> // for _Hashtable_iterator, etc |
9 // code changes. | 8 #include <utility> // for pair |
| 9 |
| 10 //#include "base/googleinit.h" // for REGISTER_MODULE_INITIALIZER |
| 11 //#include "base/logging.h" // for operator<<, Check_EQImpl, etc |
| 12 //#include "base/macros.h" // for COMPILE_ASSERT, etc |
| 13 //#include "base/mutex.h" // for Mutex, MutexLock |
| 14 //#include "util/hash/case_insensitive_hash.h" |
| 15 //#include "util/hash/hash.h" |
| 16 #include "encodings/compact_lang_det/win/cld_basictypes.h" |
| 17 #include "encodings/compact_lang_det/win/cld_logging.h" |
| 18 #include "encodings/compact_lang_det/win/cld_macros.h" |
| 19 |
| 20 struct EncodingInfo { |
| 21 // The standard name for this encoding. |
| 22 // |
| 23 const char* encoding_name_; |
| 24 |
| 25 // The "preferred MIME name" of an encoding as specified by the IANA at: |
| 26 // http://www.iana.org/assignments/character-sets |
| 27 // |
| 28 // Note that the preferred MIME name may differ slightly from the |
| 29 // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987 |
| 30 // |
| 31 const char* mime_encoding_name_; |
| 32 |
| 33 // NOTE: As of January 2007, it is a Google requirement that if an |
| 34 // encoding has an IANA name, then encoding_name_ and |
| 35 // mime_encoding_name_ must be the same string. |
| 36 // |
| 37 // However, there can be exceptions if there are compelling reasons. |
| 38 // For example, Japanese mobile handsets require the name |
| 39 // "Shift_JIS" in charset=... parameter in Content-Type headers to |
| 40 // process emoji (emoticons) in their private encodings. In that |
| 41 // case, mime_encoding_name_ should be "Shift_JIS", despite |
| 42 // encoding_name_ actually is "X-KDDI-Shift_JIS". |
| 43 |
| 44 // Some multi-byte encodings use byte values that coincide with the |
| 45 // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE |
| 46 // can misinterpret these, as indicated in an external XSS report from |
| 47 // 2007-02-15. Here, we map these dangerous encodings to safer ones. We |
| 48 // also use UTF8 instead of encodings that we don't support in our |
| 49 // output, and we generally try to be conservative in what we send out. |
| 50 // Where the client asks for single- or double-byte encodings that are |
| 51 // not as common, we substitute a more common single- or double-byte |
| 52 // encoding, if there is one, thereby preserving the client's intent |
| 53 // to use less space than UTF-8. This also means that characters |
| 54 // outside the destination set will be converted to HTML NCRs (&#NNN;) |
| 55 // if requested. |
| 56 |
| 57 Encoding preferred_web_output_encoding_; |
| 58 }; |
| 59 |
| 60 static const EncodingInfo kEncodingInfoTable[] = { |
| 61 { "ASCII", "ISO-8859-1", ISO_8859_1}, |
| 62 { "Latin2", "ISO-8859-2", ISO_8859_2}, |
| 63 { "Latin3", "ISO-8859-3", UTF8}, |
| 64 // MSIE 6 does not support ISO-8859-3 (XSS issue) |
| 65 { "Latin4", "ISO-8859-4", ISO_8859_4}, |
| 66 { "ISO-8859-5", "ISO-8859-5", ISO_8859_5}, |
| 67 { "Arabic", "ISO-8859-6", ISO_8859_6}, |
| 68 { "Greek", "ISO-8859-7", ISO_8859_7}, |
| 69 { "Hebrew", "ISO-8859-8", MSFT_CP1255}, |
| 70 // we do not endorse the visual order |
| 71 { "Latin5", "ISO-8859-9", ISO_8859_9}, |
| 72 { "Latin6", "ISO-8859-10", UTF8}, |
| 73 // MSIE does not support ISO-8859-10 (XSS issue) |
| 74 { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP}, |
| 75 { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS}, |
| 76 { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, |
| 77 // due to potential confusion with HTML syntax chars |
| 78 { "BIG5", "Big5", CHINESE_BIG5}, |
| 79 { "GB", "GB2312", CHINESE_GB}, |
| 80 { "EUC-CN", |
| 81 "EUC-CN", |
| 82 // Misnamed. Should be EUC-TW. |
| 83 CHINESE_BIG5}, |
| 84 // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW, |
| 85 // and EUC-TW is rare, so we prefer Big5 for output. |
| 86 { "KSC", "EUC-KR", KOREAN_EUC_KR}, |
| 87 { "Unicode", |
| 88 "UTF-16LE", |
| 89 // Internet Explorer doesn't recognize "ISO-10646-UCS-2" |
| 90 UTF8 |
| 91 // due to potential confusion with HTML syntax chars |
| 92 }, |
| 93 { "EUC", |
| 94 "EUC", // Misnamed. Should be EUC-TW. |
| 95 CHINESE_BIG5 |
| 96 // MSIE does not recognize "EUC" (XSS issue), |
| 97 // and EUC-TW is rare, so we prefer Big5 for output. |
| 98 }, |
| 99 { "CNS", |
| 100 "CNS", // Misnamed. Should be EUC-TW. |
| 101 CHINESE_BIG5}, |
| 102 // MSIE does not recognize "CNS" (XSS issue), |
| 103 // and EUC-TW is rare, so we prefer Big5 for output. |
| 104 { "BIG5-CP950", |
| 105 "BIG5-CP950", // Not an IANA name |
| 106 CHINESE_BIG5 |
| 107 // MSIE does not recognize "BIG5-CP950" (XSS issue) |
| 108 }, |
| 109 { "CP932", "CP932", // Not an IANA name |
| 110 JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue) |
| 111 { "UTF8", "UTF-8", UTF8}, |
| 112 { "Unknown", |
| 113 "x-unknown", // Not an IANA name |
| 114 UTF8}, // UTF-8 is our default output encoding |
| 115 { "ASCII-7-bit", "US-ASCII", ASCII_7BIT}, |
| 116 { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R}, |
| 117 { "CP1251", "windows-1251", RUSSIAN_CP1251}, |
| 118 { "CP1252", "windows-1252", MSFT_CP1252}, |
| 119 { "KOI8U", |
| 120 "KOI8-U", |
| 121 ISO_8859_5}, // because koi8-u is not as common |
| 122 { "CP1250", "windows-1250", MSFT_CP1250}, |
| 123 { "ISO-8859-15", "ISO-8859-15", ISO_8859_15}, |
| 124 { "CP1254", "windows-1254", MSFT_CP1254}, |
| 125 { "CP1257", "windows-1257", MSFT_CP1257}, |
| 126 { "ISO-8859-11", "ISO-8859-11", ISO_8859_11}, |
| 127 { "CP874", "windows-874", MSFT_CP874}, |
| 128 { "CP1256", "windows-1256", MSFT_CP1256}, |
| 129 { "CP1255", "windows-1255", MSFT_CP1255}, |
| 130 { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255}, |
| 131 // Java does not support iso-8859-8-i |
| 132 { "VISUAL", "ISO-8859-8", MSFT_CP1255}, |
| 133 // we do not endorse the visual order |
| 134 { "CP852", "cp852", MSFT_CP1250}, |
| 135 // because cp852 is not as common |
| 136 { "CSN_369103", "csn_369103", MSFT_CP1250}, |
| 137 // MSIE does not recognize "csn_369103" (XSS issue) |
| 138 { "CP1253", "windows-1253", MSFT_CP1253}, |
| 139 { "CP866", "IBM866", RUSSIAN_CP1251}, |
| 140 // because cp866 is not as common |
| 141 { "ISO-8859-13", "ISO-8859-13", UTF8}, |
| 142 // because iso-8859-13 is not widely supported |
| 143 { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR}, |
| 144 // due to potential confusion with HTML syntax chars |
| 145 { "GBK", "GBK", GBK}, |
| 146 { "GB18030", "GB18030", GBK}, |
| 147 // because gb18030 is not widely supported |
| 148 { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5}, |
| 149 // because Big5-HKSCS is not widely supported |
| 150 { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB}, |
| 151 // due to potential confusion with HTML syntax chars |
| 152 { "TSCII", "tscii", UTF8}, |
| 153 // we do not have an output converter for this font encoding |
| 154 { "TAM", "tam", UTF8}, |
| 155 // we do not have an output converter for this font encoding |
| 156 { "TAB", "tab", UTF8}, |
| 157 // we do not have an output converter for this font encoding |
| 158 { "JAGRAN", "jagran", UTF8}, |
| 159 // we do not have an output converter for this font encoding |
| 160 { "MACINTOSH", "MACINTOSH", ISO_8859_1}, |
| 161 // because macintosh is relatively uncommon |
| 162 { "UTF7", "UTF-7", |
| 163 UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated |
| 164 { "BHASKAR", "bhaskar", |
| 165 UTF8}, // we do not have an output converter for this font encoding |
| 166 { "HTCHANAKYA", "htchanakya", // not an IANA charset name. |
| 167 UTF8}, // we do not have an output converter for this font encoding |
| 168 { "UTF-16BE", "UTF-16BE", |
| 169 UTF8}, // due to potential confusion with HTML syntax chars |
| 170 { "UTF-16LE", "UTF-16LE", |
| 171 UTF8}, // due to potential confusion with HTML syntax chars |
| 172 { "UTF-32BE", "UTF-32BE", |
| 173 UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web |
| 174 { "UTF-32LE", "UTF-32LE", |
| 175 UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web |
| 176 { "X-BINARYENC", "x-binaryenc", // Not an IANA name |
| 177 UTF8}, // because this one is not intended for output (just input) |
| 178 { "HZ-GB-2312", "HZ-GB-2312", |
| 179 CHINESE_GB}, // due to potential confusion with HTML syntax chars |
| 180 { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name |
| 181 UTF8}, // because this one is not intended for output (just input) |
| 182 { "X-TAM-ELANGO", "x-tam-elango", |
| 183 UTF8}, // we do not have an output converter for this font encoding |
| 184 { "X-TAM-LTTMBARANI", "x-tam-lttmbarani", |
| 185 UTF8}, // we do not have an output converter for this font encoding |
| 186 { "X-TAM-SHREE", "x-tam-shree", |
| 187 UTF8}, // we do not have an output converter for this font encoding |
| 188 { "X-TAM-TBOOMIS", "x-tam-tboomis", |
| 189 UTF8}, // we do not have an output converter for this font encoding |
| 190 { "X-TAM-TMNEWS", "x-tam-tmnews", |
| 191 UTF8}, // we do not have an output converter for this font encoding |
| 192 { "X-TAM-WEBTAMIL", "x-tam-webtamil", |
| 193 UTF8}, // we do not have an output converter for this font encoding |
| 194 |
| 195 { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, |
| 196 // KDDI version of Shift_JIS with Google Emoji PUA mappings. |
| 197 // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses |
| 198 // "Shift_JIS" in HTTP headers and email messages. |
| 199 |
| 200 { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, |
| 201 // DoCoMo version of Shift_JIS with Google Emoji PUA mappings. |
| 202 // See the comment at KDDI_SHIFT_JIS for other issues. |
| 203 |
| 204 { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, |
| 205 // SoftBank version of Shift_JIS with Google Emoji PUA mappings. |
| 206 // See the comment at KDDI_SHIFT_JIS for other issues. |
| 207 |
| 208 { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, |
| 209 // KDDI version of ISO-2022-JP with Google Emoji PUA mappings. |
| 210 // See the comment at KDDI_SHIFT_JIS for other issues. |
| 211 // The preferred Web encoding is due to potential confusion with |
| 212 // HTML syntax chars. |
| 213 |
| 214 { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, |
| 215 // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings. |
| 216 // See the comment at KDDI_SHIFT_JIS for other issues. |
| 217 // The preferred Web encoding is due to potential confusion with |
| 218 // HTML syntax chars. |
| 219 |
| 220 // Please refer to NOTE: section in the comments in the definition |
| 221 // of "struct I18NInfoByEncoding", before adding new encodings. |
| 222 |
| 223 }; |
| 224 |
| 225 |
| 226 |
| 227 COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS, |
| 228 kEncodingInfoTable_has_incorrect_size); |
| 229 |
| 230 Encoding default_encoding() {return LATIN1;} |
| 231 |
| 232 // ************************************************************* |
| 233 // Encoding predicates |
| 234 // IsValidEncoding() |
| 235 // IsEncEncCompatible |
| 236 // IsEncodingWithSupportedLanguage |
| 237 // IsSupersetOfAscii7Bit |
| 238 // Is8BitEncoding |
| 239 // IsCJKEncoding |
| 240 // IsHebrewEncoding |
| 241 // IsRightToLeftEncoding |
| 242 // IsLogicalRightToLeftEncoding |
| 243 // IsVisualRightToLeftEncoding |
| 244 // IsIso2022Encoding |
| 245 // IsIso2022JpOrVariant |
| 246 // IsShiftJisOrVariant |
| 247 // IsJapaneseCellPhoneCarrierSpecificEncoding |
| 248 // ************************************************************* |
| 249 |
| 250 bool IsValidEncoding(Encoding enc) { |
| 251 return ((enc >= 0) && (enc < kNumEncodings)); |
| 252 } |
| 253 |
| 254 bool IsEncEncCompatible(const Encoding from, const Encoding to) { |
| 255 // Tests compatibility between the "from" and "to" encodings; in |
| 256 // the typical case -- when both are valid known encodings -- this |
| 257 // returns true iff converting from first to second is a no-op. |
| 258 if (!IsValidEncoding(from) || !IsValidEncoding(to)) { |
| 259 return false; // we only work with valid encodings... |
| 260 } else if (to == from) { |
| 261 return true; // the trivial common case |
| 262 } |
| 263 |
| 264 if (to == UNKNOWN_ENCODING) { |
| 265 return true; // all valid encodings are compatible with the unknown |
| 266 } |
| 267 |
| 268 if (from == UNKNOWN_ENCODING) { |
| 269 return false; // no unknown encoding is compatible with one that is |
| 270 } |
| 271 |
| 272 if (from == ASCII_7BIT) { |
| 273 return IsSupersetOfAscii7Bit(to); |
| 274 } |
| 275 |
| 276 return (from == ISO_8859_1 && to == MSFT_CP1252) || |
| 277 (from == ISO_8859_8 && to == HEBREW_VISUAL) || |
| 278 (from == HEBREW_VISUAL && to == ISO_8859_8) || |
| 279 (from == ISO_8859_9 && to == MSFT_CP1254) || |
| 280 (from == ISO_8859_11 && to == MSFT_CP874) || |
| 281 (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) || |
| 282 (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) || |
| 283 (from == CHINESE_GB && to == GBK) || |
| 284 (from == CHINESE_GB && to == GB18030) || |
| 285 (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) || |
| 286 (from == CHINESE_EUC_CN && to == CHINESE_CNS) || |
| 287 (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) || |
| 288 (from == CHINESE_EUC_DEC && to == CHINESE_CNS) || |
| 289 (from == CHINESE_CNS && to == CHINESE_EUC_CN) || |
| 290 (from == CHINESE_CNS && to == CHINESE_EUC_DEC); |
| 291 } |
| 292 |
| 293 // To be a superset of 7-bit Ascii means that bytes 0...127 in the given |
| 294 // encoding represent the same characters as they do in ISO_8859_1. |
| 295 |
| 296 // TODO: This list could be expanded. Many other encodings are supersets |
| 297 // of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two |
| 298 // encodings that I know for a fact should *not* be in this list. |
| 299 bool IsSupersetOfAscii7Bit(Encoding e) { |
| 300 switch (e) { |
| 301 case ISO_8859_1: |
| 302 case ISO_8859_2: |
| 303 case ISO_8859_3: |
| 304 case ISO_8859_4: |
| 305 case ISO_8859_5: |
| 306 case ISO_8859_6: |
| 307 case ISO_8859_7: |
| 308 case ISO_8859_8: |
| 309 case ISO_8859_9: |
| 310 case ISO_8859_10: |
| 311 case JAPANESE_EUC_JP: |
| 312 case JAPANESE_SHIFT_JIS: |
| 313 case CHINESE_BIG5: |
| 314 case CHINESE_GB: |
| 315 case CHINESE_EUC_CN: |
| 316 case KOREAN_EUC_KR: |
| 317 case CHINESE_EUC_DEC: |
| 318 case CHINESE_CNS: |
| 319 case CHINESE_BIG5_CP950: |
| 320 case JAPANESE_CP932: |
| 321 case UTF8: |
| 322 case UNKNOWN_ENCODING: |
| 323 case ASCII_7BIT: |
| 324 case RUSSIAN_KOI8_R: |
| 325 case RUSSIAN_CP1251: |
| 326 case MSFT_CP1252: |
| 327 case RUSSIAN_KOI8_RU: |
| 328 case MSFT_CP1250: |
| 329 case ISO_8859_15: |
| 330 case MSFT_CP1254: |
| 331 case MSFT_CP1257: |
| 332 case ISO_8859_11: |
| 333 case MSFT_CP874: |
| 334 case MSFT_CP1256: |
| 335 case MSFT_CP1255: |
| 336 case ISO_8859_8_I: |
| 337 case HEBREW_VISUAL: |
| 338 case CZECH_CP852: |
| 339 case MSFT_CP1253: |
| 340 case RUSSIAN_CP866: |
| 341 case ISO_8859_13: |
| 342 case GBK: |
| 343 case GB18030: |
| 344 case BIG5_HKSCS: |
| 345 case MACINTOSH_ROMAN: |
| 346 return true; |
| 347 default: |
| 348 return false; |
| 349 } |
| 350 } |
| 351 |
| 352 // To be an 8-bit encoding means that there are fewer than 256 symbols. |
| 353 // Each byte determines a new character; there are no multi-byte sequences. |
| 354 |
| 355 // TODO: This list could maybe be expanded. Other encodings may be 8-bit. |
| 356 bool Is8BitEncoding(Encoding e) { |
| 357 switch (e) { |
| 358 case ASCII_7BIT: |
| 359 case ISO_8859_1: |
| 360 case ISO_8859_2: |
| 361 case ISO_8859_3: |
| 362 case ISO_8859_4: |
| 363 case ISO_8859_5: |
| 364 case ISO_8859_6: |
| 365 case ISO_8859_7: |
| 366 case ISO_8859_8: |
| 367 case ISO_8859_8_I: |
| 368 case ISO_8859_9: |
| 369 case ISO_8859_10: |
| 370 case ISO_8859_11: |
| 371 case ISO_8859_13: |
| 372 case ISO_8859_15: |
| 373 case MSFT_CP1252: |
| 374 case MSFT_CP1253: |
| 375 case MSFT_CP1254: |
| 376 case MSFT_CP1255: |
| 377 case MSFT_CP1256: |
| 378 case MSFT_CP1257: |
| 379 case RUSSIAN_KOI8_R: |
| 380 case RUSSIAN_KOI8_RU: |
| 381 case RUSSIAN_CP866: |
| 382 return true; |
| 383 default: |
| 384 return false; |
| 385 } |
| 386 } |
| 387 |
| 388 bool IsCJKEncoding(Encoding e) { |
| 389 switch (e) { |
| 390 case JAPANESE_EUC_JP: |
| 391 case JAPANESE_SHIFT_JIS: |
| 392 case JAPANESE_JIS: |
| 393 case CHINESE_BIG5: |
| 394 case CHINESE_GB: |
| 395 case CHINESE_EUC_CN: |
| 396 case KOREAN_EUC_KR: |
| 397 case CHINESE_EUC_DEC: |
| 398 case CHINESE_CNS: |
| 399 case CHINESE_BIG5_CP950: |
| 400 case JAPANESE_CP932: |
| 401 case ISO_2022_KR: |
| 402 case GBK: |
| 403 case GB18030: |
| 404 case BIG5_HKSCS: |
| 405 case ISO_2022_CN: |
| 406 case HZ_GB_2312: |
| 407 return true; |
| 408 default: |
| 409 return false; |
| 410 } |
| 411 } |
| 412 |
| 413 bool IsHebrewEncoding(Encoding e) { |
| 414 return (e == ISO_8859_8 || |
| 415 e == ISO_8859_8_I || |
| 416 e == MSFT_CP1255 || |
| 417 e == HEBREW_VISUAL); |
| 418 } |
| 419 |
| 420 |
| 421 |
| 422 bool IsRightToLeftEncoding(Encoding enc) { |
| 423 switch (enc) { |
| 424 case MSFT_CP1255: |
| 425 case MSFT_CP1256: |
| 426 case ARABIC_ENCODING: |
| 427 case HEBREW_ENCODING: |
| 428 case ISO_8859_8_I: |
| 429 case HEBREW_VISUAL: |
| 430 return true; |
| 431 default: |
| 432 return false; |
| 433 } |
| 434 } |
| 435 |
| 436 bool IsLogicalRightToLeftEncoding(Encoding enc) { |
| 437 return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc); |
| 438 } |
| 439 |
| 440 // Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6) |
| 441 // is NOT visual. |
| 442 bool IsVisualRightToLeftEncoding(Encoding enc) { |
| 443 switch (enc) { |
| 444 case HEBREW_ENCODING: |
| 445 case HEBREW_VISUAL: |
| 446 return true; |
| 447 default: |
| 448 return false; |
| 449 } |
| 450 } |
| 451 |
| 452 |
| 453 |
| 454 |
| 455 |
| 456 bool IsIso2022Encoding(Encoding enc) { |
| 457 return (IsIso2022JpOrVariant(enc) || |
| 458 enc == ISO_2022_KR || |
| 459 enc == ISO_2022_CN); |
| 460 } |
| 461 |
| 462 bool IsIso2022JpOrVariant(Encoding enc) { |
| 463 return (enc == JAPANESE_JIS || |
| 464 enc == KDDI_ISO_2022_JP || |
| 465 enc == SOFTBANK_ISO_2022_JP); |
| 466 } |
| 467 |
| 468 bool IsShiftJisOrVariant(Encoding enc) { |
| 469 return (enc == JAPANESE_SHIFT_JIS || |
| 470 enc == JAPANESE_CP932 || |
| 471 enc == KDDI_SHIFT_JIS || |
| 472 enc == DOCOMO_SHIFT_JIS || |
| 473 enc == SOFTBANK_SHIFT_JIS); |
| 474 } |
| 475 |
| 476 bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) { |
| 477 return (enc == KDDI_ISO_2022_JP || |
| 478 enc == KDDI_SHIFT_JIS || |
| 479 enc == DOCOMO_SHIFT_JIS || |
| 480 enc == SOFTBANK_SHIFT_JIS || |
| 481 enc == SOFTBANK_ISO_2022_JP); |
| 482 } |
| 483 |
| 484 |
| 485 // ************************************************************* |
| 486 // ENCODING NAMES |
| 487 // EncodingName() [Encoding to name] |
| 488 // MimeEncodingName() [Encoding to name] |
| 489 // EncodingFromName() [name to Encoding] |
| 490 // EncodingNameAliasToEncoding() [name to Encoding] |
| 491 // default_encoding_name() |
| 492 // invalid_encoding_name() |
| 493 // ************************************************************* |
| 494 |
10 const char * EncodingName(const Encoding enc) { | 495 const char * EncodingName(const Encoding enc) { |
11 return ""; | 496 if ( (enc < 0) || (enc >= kNumEncodings) ) |
12 } | 497 return invalid_encoding_name(); |
| 498 return kEncodingInfoTable[enc].encoding_name_; |
| 499 } |
| 500 |
| 501 // TODO: Unify MimeEncodingName and EncodingName, or determine why |
| 502 // such a unification is not possible. |
| 503 |
| 504 const char * MimeEncodingName(Encoding enc) { |
| 505 if ( (enc < 0) || (enc >= kNumEncodings) ) |
| 506 return ""; // TODO(jrm) Should this be invalid_encoding_name()? |
| 507 return kEncodingInfoTable[enc].mime_encoding_name_; |
| 508 } |
| 509 |
| 510 bool EncodingFromName(const char* enc_name, Encoding *encoding) { |
| 511 *encoding = UNKNOWN_ENCODING; |
| 512 if ( enc_name == NULL ) return false; |
| 513 |
| 514 for ( int i = 0; i < kNumEncodings; i++ ) { |
| 515 if ( !strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) { |
| 516 *encoding = static_cast<Encoding>(i); |
| 517 return true; |
| 518 } |
| 519 } |
| 520 return false; |
| 521 } |
| 522 |
| 523 #if 0 |
| 524 // The encoding_map maps standard and non-standard encoding-names |
| 525 // (strings) to Encoding enums. It is used only by |
| 526 // EncodingNameAliasToEncoding. Note that the map uses |
| 527 // case-insensitive hash and comparison functions. |
| 528 |
| 529 typedef hash_map <const char *, Encoding, |
| 530 CStringAlnumCaseHash, |
| 531 CStringAlnumCaseEqual> EncodingMap; |
| 532 |
| 533 static EncodingMap encoding_map; |
| 534 |
| 535 // Mutex for locking the code that initializes encoding_map. |
| 536 // static Mutex encodings_init_mutex(base::LINKER_INITIALIZED); |
| 537 |
| 538 void InitEncodings() { |
| 539 // For thread safety, keep a mutex while initializing this map. |
| 540 // Also allow this function to be called more than once and |
| 541 // gracefully exiting if that occurs. |
| 542 // MutexLock lock(&encodings_init_mutex); |
| 543 if (!encoding_map.empty()) { |
| 544 // Already initialized |
| 545 return; |
| 546 } |
| 547 |
| 548 // Initialize the map with all the "standard" encoding names, |
| 549 // i.e., the ones returned by EncodingName and MimeEncodingName. |
| 550 // |
| 551 // First, add internal encoding names returned by EncodingName(). |
| 552 for (int i = 0; i < NUM_ENCODINGS; ++i) { |
| 553 Encoding e = static_cast<Encoding>(i); |
| 554 // Internal encoding names must be unique. |
| 555 // The internal names are guaranteed to be unique by the CHECK_EQ. |
| 556 const char *encoding_name = EncodingName(e); |
| 557 CHECK_EQ(0, encoding_map.count(encoding_name)) |
| 558 << "Duplicate found for " << encoding_name; |
| 559 encoding_map[encoding_name] = e; |
| 560 } |
| 561 // Then, add mime encoding names returned by MimeEncodingName(). |
| 562 // We don't override existing entries, to give precedence to entries |
| 563 // added earlier. |
| 564 for (int i = 0; i < NUM_ENCODINGS; ++i) { |
| 565 Encoding e = static_cast<Encoding>(i); |
| 566 // Note that MimeEncodingName() can return the same mime encoding |
| 567 // name for different encoding enums like JAPANESE_SHIFT_JIS and |
| 568 // KDDI_SHIFT_JIS. In that case, the encoding enum first seen |
| 569 // will be the value for the encoding name in the map. |
| 570 const char *mime_encoding_name = MimeEncodingName(e); |
| 571 if (encoding_map.count(mime_encoding_name) == 0) { |
| 572 encoding_map[mime_encoding_name] = e; |
| 573 } |
| 574 } |
| 575 |
| 576 // Add some non-standard names: alternate spellings, common typos, |
| 577 // etc. (It does no harm to add names already in the map.) Note |
| 578 // that although the map is case-insensitive, by convention the |
| 579 // keys are written here in lower case. For ease of maintenance, |
| 580 // they are listed in alphabetical order. |
| 581 encoding_map["5601"] = KOREAN_EUC_KR; |
| 582 encoding_map["646"] = ASCII_7BIT; |
| 583 encoding_map["852"] = CZECH_CP852; |
| 584 encoding_map["866"] = RUSSIAN_CP866; |
| 585 encoding_map["8859-1"] = ISO_8859_1; |
| 586 encoding_map["ansi-1251"] = RUSSIAN_CP1251; |
| 587 encoding_map["ansi_x3.4-1968"] = ASCII_7BIT; |
| 588 encoding_map["arabic"] = ISO_8859_6; |
| 589 encoding_map["ascii"] = ISO_8859_1; |
| 590 encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard |
| 591 encoding_map["asmo-708"] = ISO_8859_6; |
| 592 encoding_map["bhaskar"] = BHASKAR; |
| 593 encoding_map["big5"] = CHINESE_BIG5; |
| 594 encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard |
| 595 encoding_map["big5-hkscs"] = BIG5_HKSCS; |
| 596 encoding_map["chinese"] = CHINESE_GB; |
| 597 encoding_map["cns"] = CHINESE_CNS; // not iana standard |
| 598 encoding_map["cns11643"] = CHINESE_CNS; |
| 599 encoding_map["cp1250"] = MSFT_CP1250; // not iana standard |
| 600 encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard |
| 601 encoding_map["cp1252"] = MSFT_CP1252; // not iana standard |
| 602 encoding_map["cp1253"] = MSFT_CP1253; // not iana standard |
| 603 encoding_map["cp1254"] = MSFT_CP1254; // not iana standard |
| 604 encoding_map["cp1255"] = MSFT_CP1255; |
| 605 encoding_map["cp1256"] = MSFT_CP1256; |
| 606 encoding_map["cp1257"] = MSFT_CP1257; // not iana standard |
| 607 encoding_map["cp819"] = ISO_8859_1; |
| 608 encoding_map["cp852"] = CZECH_CP852; |
| 609 encoding_map["cp866"] = RUSSIAN_CP866; |
| 610 encoding_map["cp-866"] = RUSSIAN_CP866; |
| 611 encoding_map["cp874"] = MSFT_CP874; |
| 612 encoding_map["cp932"] = JAPANESE_CP932; // not iana standard |
| 613 encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard |
| 614 encoding_map["csbig5"] = CHINESE_BIG5; |
| 615 encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP; |
| 616 encoding_map["cseuckr"] = KOREAN_EUC_KR; |
| 617 encoding_map["csgb2312"] = CHINESE_GB; |
| 618 encoding_map["csibm852"] = CZECH_CP852; |
| 619 encoding_map["csibm866"] = RUSSIAN_CP866; |
| 620 encoding_map["csiso2022jp"] = JAPANESE_JIS; |
| 621 encoding_map["csiso2022kr"] = ISO_2022_KR; |
| 622 encoding_map["csiso58gb231280"] = CHINESE_GB; |
| 623 encoding_map["csiso88598i"] = ISO_8859_8_I; |
| 624 encoding_map["csisolatin1"] = ISO_8859_1; |
| 625 encoding_map["csisolatin2"] = ISO_8859_2; |
| 626 encoding_map["csisolatin3"] = ISO_8859_3; |
| 627 encoding_map["csisolatin4"] = ISO_8859_4; |
| 628 encoding_map["csisolatin5"] = ISO_8859_9; |
| 629 encoding_map["csisolatin6"] = ISO_8859_10; |
| 630 encoding_map["csisolatinarabic"] = ISO_8859_6; |
| 631 encoding_map["csisolatincyrillic"] = ISO_8859_5; |
| 632 encoding_map["csisolatingreek"] = ISO_8859_7; |
| 633 encoding_map["csisolatinhebrew"] = ISO_8859_8; |
| 634 encoding_map["csksc56011987"] = KOREAN_EUC_KR; |
| 635 encoding_map["csmacintosh"] = MACINTOSH_ROMAN; |
| 636 encoding_map["csn-369103"] = CZECH_CSN_369103; |
| 637 encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS; |
| 638 encoding_map["csunicode"] = UTF16BE; |
| 639 encoding_map["csunicode11"] = UTF16BE; |
| 640 encoding_map["csunicode11utf7"] = UTF7; |
| 641 encoding_map["csunicodeascii"] = UTF16BE; |
| 642 encoding_map["csunicodelatin1"] = UTF16BE; |
| 643 encoding_map["cyrillic"] = ISO_8859_5; |
| 644 encoding_map["ecma-114"] = ISO_8859_6; |
| 645 encoding_map["ecma-118"] = ISO_8859_7; |
| 646 encoding_map["elot_928"] = ISO_8859_7; |
| 647 encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard |
| 648 encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard |
| 649 encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard |
| 650 encoding_map["euc-jp"] = JAPANESE_EUC_JP; |
| 651 encoding_map["euc-kr"] = KOREAN_EUC_KR; |
| 652 encoding_map["eucgb2312_cn"] = CHINESE_GB; |
| 653 encoding_map["gb"] = CHINESE_GB; // not iana standard |
| 654 encoding_map["gb18030"] = GB18030; |
| 655 encoding_map["gb2132"] = CHINESE_GB; // common typo |
| 656 encoding_map["gb2312"] = CHINESE_GB; |
| 657 encoding_map["gb_2312-80"] = CHINESE_GB; |
| 658 encoding_map["gbk"] = GBK; |
| 659 encoding_map["greek"] = ISO_8859_7; |
| 660 encoding_map["greek8"] = ISO_8859_7; |
| 661 encoding_map["hebrew"] = ISO_8859_8; |
| 662 encoding_map["htchanakya"] = HTCHANAKYA; |
| 663 encoding_map["hz-gb-2312"] = HZ_GB_2312; |
| 664 encoding_map["ibm819"] = ISO_8859_1; |
| 665 encoding_map["ibm852"] = CZECH_CP852; |
| 666 encoding_map["ibm874"] = MSFT_CP874; |
| 667 encoding_map["iso-10646"] = UTF16BE; |
| 668 encoding_map["iso-10646-j-1"] = UTF16BE; |
| 669 encoding_map["iso-10646-ucs-2"] = UNICODE; |
| 670 encoding_map["iso-10646-ucs-4"] = UTF32BE; |
| 671 encoding_map["iso-10646-ucs-basic"] = UTF16BE; |
| 672 encoding_map["iso-10646-unicode-latin1"] = UTF16BE; |
| 673 encoding_map["iso-2022-cn"] = ISO_2022_CN; |
| 674 encoding_map["iso-2022-jp"] = JAPANESE_JIS; |
| 675 encoding_map["iso-2022-kr"] = ISO_2022_KR; |
| 676 encoding_map["iso-8559-1"] = ISO_8859_1; // common typo |
| 677 encoding_map["iso-874"] = MSFT_CP874; |
| 678 encoding_map["iso-8858-1"] = ISO_8859_1; // common typo |
| 679 // iso-8859-0 was a temporary name, eventually renamed iso-8859-15 |
| 680 encoding_map["iso-8859-0"] = ISO_8859_15; |
| 681 encoding_map["iso-8859-1"] = ISO_8859_1; |
| 682 encoding_map["iso-8859-10"] = ISO_8859_10; |
| 683 encoding_map["iso-8859-11"] = ISO_8859_11; |
| 684 encoding_map["iso-8859-13"] = ISO_8859_13; |
| 685 encoding_map["iso-8859-15"] = ISO_8859_15; |
| 686 encoding_map["iso-8859-2"] = ISO_8859_2; |
| 687 encoding_map["iso-8859-3"] = ISO_8859_3; |
| 688 encoding_map["iso-8859-4"] = ISO_8859_4; |
| 689 encoding_map["iso-8859-5"] = ISO_8859_5; |
| 690 encoding_map["iso-8859-6"] = ISO_8859_6; |
| 691 encoding_map["iso-8859-7"] = ISO_8859_7; |
| 692 encoding_map["iso-8859-8"] = ISO_8859_8; |
| 693 encoding_map["iso-8859-8-i"] = ISO_8859_8_I; |
| 694 encoding_map["iso-8859-9"] = ISO_8859_9; |
| 695 encoding_map["iso-9959-1"] = ISO_8859_1; // common typo |
| 696 encoding_map["iso-ir-100"] = ISO_8859_1; |
| 697 encoding_map["iso-ir-101"] = ISO_8859_2; |
| 698 encoding_map["iso-ir-109"] = ISO_8859_3; |
| 699 encoding_map["iso-ir-110"] = ISO_8859_4; |
| 700 encoding_map["iso-ir-126"] = ISO_8859_7; |
| 701 encoding_map["iso-ir-127"] = ISO_8859_6; |
| 702 encoding_map["iso-ir-138"] = ISO_8859_8; |
| 703 encoding_map["iso-ir-144"] = ISO_8859_5; |
| 704 encoding_map["iso-ir-148"] = ISO_8859_9; |
| 705 encoding_map["iso-ir-149"] = KOREAN_EUC_KR; |
| 706 encoding_map["iso-ir-157"] = ISO_8859_10; |
| 707 encoding_map["iso-ir-58"] = CHINESE_GB; |
| 708 encoding_map["iso-latin-1"] = ISO_8859_1; |
| 709 encoding_map["iso_2022-cn"] = ISO_2022_CN; |
| 710 encoding_map["iso_2022-kr"] = ISO_2022_KR; |
| 711 encoding_map["iso_8859-1"] = ISO_8859_1; |
| 712 encoding_map["iso_8859-10:1992"] = ISO_8859_10; |
| 713 encoding_map["iso_8859-11"] = ISO_8859_11; |
| 714 encoding_map["iso_8859-13"] = ISO_8859_13; |
| 715 encoding_map["iso_8859-15"] = ISO_8859_15; |
| 716 encoding_map["iso_8859-1:1987"] = ISO_8859_1; |
| 717 encoding_map["iso_8859-2"] = ISO_8859_2; |
| 718 encoding_map["iso_8859-2:1987"] = ISO_8859_2; |
| 719 encoding_map["iso_8859-3"] = ISO_8859_3; |
| 720 encoding_map["iso_8859-3:1988"] = ISO_8859_3; |
| 721 encoding_map["iso_8859-4"] = ISO_8859_4; |
| 722 encoding_map["iso_8859-4:1988"] = ISO_8859_4; |
| 723 encoding_map["iso_8859-5"] = ISO_8859_5; |
| 724 encoding_map["iso_8859-5:1988"] = ISO_8859_5; |
| 725 encoding_map["iso_8859-6"] = ISO_8859_6; |
| 726 encoding_map["iso_8859-6:1987"] = ISO_8859_6; |
| 727 encoding_map["iso_8859-7"] = ISO_8859_7; |
| 728 encoding_map["iso_8859-7:1987"] = ISO_8859_7; |
| 729 encoding_map["iso_8859-8"] = ISO_8859_8; |
| 730 encoding_map["iso_8859-8:1988:"] = ISO_8859_8; |
| 731 encoding_map["iso_8859-9"] = ISO_8859_9; |
| 732 encoding_map["iso_8859-9:1989"] = ISO_8859_9; |
| 733 encoding_map["jagran"] = JAGRAN; |
| 734 encoding_map["jis"] = JAPANESE_JIS; // not iana standard |
| 735 encoding_map["koi8-cs"] = CZECH_CSN_369103; |
| 736 encoding_map["koi8-r"] = RUSSIAN_KOI8_R; |
| 737 encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard |
| 738 encoding_map["koi8-u"] = RUSSIAN_KOI8_RU; |
| 739 encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard |
| 740 encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard |
| 741 encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant |
| 742 encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard |
| 743 encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard |
| 744 encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR; |
| 745 encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard |
| 746 encoding_map["l1"] = ISO_8859_1; |
| 747 encoding_map["l2"] = ISO_8859_2; |
| 748 encoding_map["l3"] = ISO_8859_3; |
| 749 encoding_map["l4"] = ISO_8859_4; |
| 750 encoding_map["l5"] = ISO_8859_9; |
| 751 encoding_map["l6"] = ISO_8859_10; |
| 752 encoding_map["latin-1"] = ISO_8859_1; // not iana standard |
| 753 encoding_map["latin1"] = ISO_8859_1; |
| 754 encoding_map["latin2"] = ISO_8859_2; |
| 755 encoding_map["latin3"] = ISO_8859_3; |
| 756 encoding_map["latin4"] = ISO_8859_4; |
| 757 encoding_map["latin5"] = ISO_8859_9; |
| 758 encoding_map["latin6"] = ISO_8859_10; |
| 759 encoding_map["mac"] = MACINTOSH_ROMAN; |
| 760 encoding_map["macintosh"] = MACINTOSH_ROMAN; |
| 761 encoding_map["macintosh-roman"] = MACINTOSH_ROMAN; |
| 762 encoding_map["ms932"] = JAPANESE_CP932; // not iana standard |
| 763 encoding_map["ms_kanji"] = JAPANESE_CP932; |
| 764 encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS; |
| 765 encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS; |
| 766 encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard |
| 767 encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard |
| 768 encoding_map["sun_eu_greek"] = ISO_8859_7; |
| 769 encoding_map["tab"] = TAMIL_BI; |
| 770 encoding_map["tam"] = TAMIL_MONO; |
| 771 encoding_map["tis-620"] = ISO_8859_11; |
| 772 encoding_map["tscii"] = TSCII; |
| 773 encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard |
| 774 encoding_map["unicode"] = UNICODE; // not iana standard |
| 775 encoding_map["unicode-1-1-utf-7"] = UTF7; |
| 776 encoding_map["unicode-1-1-utf-8"] = UTF8; |
| 777 encoding_map["unicode-2-0-utf-7"] = UTF7; |
| 778 encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard |
| 779 encoding_map["us"] = ISO_8859_1; |
| 780 encoding_map["us-ascii"] = ISO_8859_1; |
| 781 encoding_map["utf-16be"] = UTF16BE; |
| 782 encoding_map["utf-16le"] = UTF16LE; |
| 783 encoding_map["utf-32be"] = UTF32BE; |
| 784 encoding_map["utf-32le"] = UTF32LE; |
| 785 encoding_map["utf-7"] = UTF7; |
| 786 encoding_map["utf-8"] = UTF8; |
| 787 encoding_map["utf7"] = UTF7; |
| 788 encoding_map["utf8"] = UTF8; // not iana standard |
| 789 encoding_map["visual"] = HEBREW_VISUAL; |
| 790 encoding_map["win-1250"] = MSFT_CP1250; // not iana standard |
| 791 encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard |
| 792 encoding_map["window-874"] = MSFT_CP874; |
| 793 encoding_map["windows-1250"] = MSFT_CP1250; |
| 794 encoding_map["windows-1251"] = RUSSIAN_CP1251; |
| 795 encoding_map["windows-1252"] = MSFT_CP1252; |
| 796 encoding_map["windows-1253"] = MSFT_CP1253; |
| 797 encoding_map["windows-1254"] = MSFT_CP1254; |
| 798 encoding_map["windows-1255"] = MSFT_CP1255; |
| 799 encoding_map["windows-1256"] = MSFT_CP1256; |
| 800 encoding_map["windows-1257"] = MSFT_CP1257; |
| 801 encoding_map["windows-31j"] = JAPANESE_CP932; |
| 802 encoding_map["windows-874"] = MSFT_CP874; |
| 803 encoding_map["windows-936"] = GBK; |
| 804 encoding_map["x-big5"] = CHINESE_BIG5; |
| 805 encoding_map["x-binaryenc"] = BINARYENC; // not iana standard |
| 806 encoding_map["x-cp1250"] = MSFT_CP1250; |
| 807 encoding_map["x-cp1251"] = RUSSIAN_CP1251; |
| 808 encoding_map["x-cp1252"] = MSFT_CP1252; |
| 809 encoding_map["x-cp1253"] = MSFT_CP1253; |
| 810 encoding_map["x-cp1254"] = MSFT_CP1254; |
| 811 encoding_map["x-cp1255"] = MSFT_CP1255; |
| 812 encoding_map["x-cp1256"] = MSFT_CP1256; |
| 813 encoding_map["x-cp1257"] = MSFT_CP1257; |
| 814 encoding_map["x-euc-jp"] = JAPANESE_EUC_JP; |
| 815 encoding_map["x-euc-tw"] = CHINESE_CNS; |
| 816 encoding_map["x-gbk"] = GBK; |
| 817 encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE; |
| 818 encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE; |
| 819 encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE; |
| 820 encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE; |
| 821 encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard |
| 822 encoding_map["x-mac-roman"] = MACINTOSH_ROMAN; |
| 823 encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard |
| 824 encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS; |
| 825 encoding_map["x-unicode-2-0-utf-7"] = UTF7; |
| 826 encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard |
| 827 encoding_map["x-x-big5"] = CHINESE_BIG5; |
| 828 encoding_map["zh_cn.euc"] = CHINESE_GB; |
| 829 encoding_map["zh_tw-big5"] = CHINESE_BIG5; |
| 830 encoding_map["zh_tw-euc"] = CHINESE_CNS; |
| 831 |
| 832 // Remove the entry for the empty string, if any. |
| 833 encoding_map.erase(""); |
| 834 } |
| 835 |
// Hooks InitEncodings() into the module-initializer mechanism under the
// module name `encodings`.
// NOTE(review): when this initializer actually runs is decided by
// REGISTER_MODULE_INITIALIZER, which is defined outside this file. The
// lookup code in this file expects the map to be filled during
// InitGoogle() -- confirm against the macro definition.
| 836 REGISTER_MODULE_INITIALIZER(encodings, { |
| 837 InitEncodings(); |
| 838 }); |
| 839 |
| 840 // ---------------------------------------------------------------------- |
| 841 // EncodingNameAliasToEncoding() |
| 842 // |
| 843 // This function takes an encoding name/alias and returns the Encoding |
| 844 // enum. The input is case insensitive. It is the union of the common |
| 845 // IANA standard names, the charset names used in Netscape Navigator, |
| 846 // and some common names we have been using. |
| 847 // See: http://www.iana.org/assignments/character-sets |
| 848 // http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html |
| 849 // |
| 850 // UNKNOWN_ENCODING is returned if none matches. |
| 851 // |
| 852 // TODO: Check if it is possible to remove the non-standard, |
| 853 // non-netscape-use names. It is because this routine is used for |
| 854 // encoding detections from html meta info. Non-standard names may |
| 855 // introduce noise on encoding detection. |
| 856 // |
| 857 // TODO: Unify EncodingNameAliasToEncoding and EncodingFromName, |
| 858 // or determine why such a unification is not possible. |
| 859 // ---------------------------------------------------------------------- |
| 860 Encoding EncodingNameAliasToEncoding(const char *encoding_name) { |
// A NULL name is reported as "no match" instead of being passed to the map.
| 861 if (!encoding_name) { |
| 862 return UNKNOWN_ENCODING; |
| 863 } |
| 864 |
| 865 // The map is initialized during InitGoogle() in a thread-safe manner. |
// The CHECK below asserts that the map has at least one entry, i.e. that
// it has been populated before the first lookup.
| 866 CHECK(!encoding_map.empty()) << ": Must call InitGoogle()"; |
| 867 |
// Look the name up exactly as given. How keys are hashed and compared
// (e.g. any case folding) is determined by the EncodingMap type, which is
// declared outside this section.
| 868 EncodingMap::iterator emi = encoding_map.find(encoding_name); |
| 869 if (emi != encoding_map.end()) { |
// Found: the mapped value is the Encoding for this name/alias.
| 870 return emi->second; |
| 871 } else { |
// No entry for this name.
| 872 return UNKNOWN_ENCODING; |
| 873 } |
| 874 } |
| 875 #endif |
| 876 |
// Returns the encoding_name_ string (the standard name) stored in
// kEncodingInfoTable for LATIN1, which this function reports as the
// default encoding's name.
| 877 const char* default_encoding_name() { |
| 878 return kEncodingInfoTable[LATIN1].encoding_name_; |
| 879 } |
| 880 |
// Name string returned by invalid_encoding_name(); file-local (static).
| 881 static const char* const kInvalidEncodingName = "invalid_encoding"; |
| 882 |
// Returns the fixed string "invalid_encoding". The pointer refers to a
// string literal, so callers must not modify or free it.
| 883 const char *invalid_encoding_name() { |
| 884 return kInvalidEncodingName; |
| 885 } |
| 886 |
| 887 |
| 888 |
| 889 // ************************************************************* |
| 890 // Miscellany |
| 891 // ************************************************************* |
| 892 |
| 893 |
// Returns the preferred_web_output_encoding_ field of enc's row in
// kEncodingInfoTable. When IsValidEncoding(enc) is false the table is not
// indexed at all and UTF8 is returned instead.
| 894 Encoding PreferredWebOutputEncoding(Encoding enc) { |
| 895 return IsValidEncoding(enc) |
| 896 ? kEncodingInfoTable[enc].preferred_web_output_encoding_ |
| 897 : UTF8; |
| 898 } |
OLD | NEW |