Index: third_party/cld/encodings/internal/encodings.cc |
diff --git a/third_party/cld/encodings/internal/encodings.cc b/third_party/cld/encodings/internal/encodings.cc |
index c8bf82ece85cd8389ff97d3bf881e41e593f90a1..82ef5e243581c09adfc058d4ebdc76bbf644108c 100644 |
--- a/third_party/cld/encodings/internal/encodings.cc |
+++ b/third_party/cld/encodings/internal/encodings.cc |
@@ -1,12 +1,898 @@ |
-// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. |
-// Use of this source code is governed by a BSD-style license that can be |
-// found in the LICENSE file. |
+// Copyright 2008 Google Inc. All Rights Reserved. |
+// Author: jrm@google.com (Jim Meehan) |
#include "encodings/public/encodings.h" |
+#include <string.h> // for strcasecmp |
+//#include <hash_map> // for _Hashtable_iterator, etc |
+#include <utility> // for pair |
+ |
+//#include "base/googleinit.h" // for REGISTER_MODULE_INITIALIZER |
+//#include "base/logging.h" // for operator<<, Check_EQImpl, etc |
+//#include "base/macros.h" // for COMPILE_ASSERT, etc |
+//#include "base/mutex.h" // for Mutex, MutexLock |
+//#include "util/hash/case_insensitive_hash.h" |
+//#include "util/hash/hash.h" |
+#include "encodings/compact_lang_det/win/cld_basictypes.h" |
+#include "encodings/compact_lang_det/win/cld_logging.h" |
+#include "encodings/compact_lang_det/win/cld_macros.h" |
+ |
+// Static metadata for a single encoding: its internal name, its MIME |
+// name, and the encoding preferred in its place for Web output. |
+struct EncodingInfo { |
+ // The standard name for this encoding. |
+ // |
+ const char* encoding_name_; |
+ |
+ // The "preferred MIME name" of an encoding as specified by the IANA at: |
+ // http://www.iana.org/assignments/character-sets |
+ // |
+ // Note that the preferred MIME name may differ slightly from the |
+ // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987 |
+ // |
+ const char* mime_encoding_name_; |
+ |
+ // NOTE: As of January 2007, it is a Google requirement that if an |
+ // encoding has an IANA name, then encoding_name_ and |
+ // mime_encoding_name_ must be the same string. |
+ // |
+ // However, there can be exceptions if there are compelling reasons. |
+ // For example, Japanese mobile handsets require the name |
+ // "Shift_JIS" in the charset=... parameter of Content-Type headers |
+ // to process emoji (emoticons) in their private encodings. In that |
+ // case, mime_encoding_name_ should be "Shift_JIS", even though |
+ // encoding_name_ is actually "X-KDDI-Shift_JIS". |
+ |
+ // The encoding to emit for Web output in place of this one. |
+ // |
+ // Some multi-byte encodings use byte values that coincide with the |
+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE |
+ // can misinterpret these, as indicated in an external XSS report from |
+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We |
+ // also use UTF8 instead of encodings that we don't support in our |
+ // output, and we generally try to be conservative in what we send out. |
+ // Where the client asks for single- or double-byte encodings that are |
+ // not as common, we substitute a more common single- or double-byte |
+ // encoding, if there is one, thereby preserving the client's intent |
+ // to use less space than UTF-8. This also means that characters |
+ // outside the destination set will be converted to HTML NCRs (&#NNN;) |
+ // if requested. |
+ |
+ Encoding preferred_web_output_encoding_; |
+}; |
+ |
+// Per-encoding metadata, indexed directly by Encoding enum value (the |
+// lookup functions in this file use kEncodingInfoTable[enc]). Each row is |
+// { encoding_name_, mime_encoding_name_, preferred_web_output_encoding_ }. |
+// Rows must therefore stay in the same order as the Encoding enum. |
+static const EncodingInfo kEncodingInfoTable[] = { |
+ { "ASCII", "ISO-8859-1", ISO_8859_1}, |
+ { "Latin2", "ISO-8859-2", ISO_8859_2}, |
+ { "Latin3", "ISO-8859-3", UTF8}, |
+ // MSIE 6 does not support ISO-8859-3 (XSS issue) |
+ { "Latin4", "ISO-8859-4", ISO_8859_4}, |
+ { "ISO-8859-5", "ISO-8859-5", ISO_8859_5}, |
+ { "Arabic", "ISO-8859-6", ISO_8859_6}, |
+ { "Greek", "ISO-8859-7", ISO_8859_7}, |
+ { "Hebrew", "ISO-8859-8", MSFT_CP1255}, |
+ // we do not endorse the visual order |
+ { "Latin5", "ISO-8859-9", ISO_8859_9}, |
+ { "Latin6", "ISO-8859-10", UTF8}, |
+ // MSIE does not support ISO-8859-10 (XSS issue) |
+ { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP}, |
+ { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS}, |
+ { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, |
+ // due to potential confusion with HTML syntax chars |
+ { "BIG5", "Big5", CHINESE_BIG5}, |
+ { "GB", "GB2312", CHINESE_GB}, |
+ { "EUC-CN", |
+ "EUC-CN", |
+ // Misnamed. Should be EUC-TW. |
+ CHINESE_BIG5}, |
+ // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW, |
+ // and EUC-TW is rare, so we prefer Big5 for output. |
+ { "KSC", "EUC-KR", KOREAN_EUC_KR}, |
+ { "Unicode", |
+ "UTF-16LE", |
+ // Internet Explorer doesn't recognize "ISO-10646-UCS-2" |
+ UTF8 |
+ // due to potential confusion with HTML syntax chars |
+ }, |
+ { "EUC", |
+ "EUC", // Misnamed. Should be EUC-TW. |
+ CHINESE_BIG5 |
+ // MSIE does not recognize "EUC" (XSS issue), |
+ // and EUC-TW is rare, so we prefer Big5 for output. |
+ }, |
+ { "CNS", |
+ "CNS", // Misnamed. Should be EUC-TW. |
+ CHINESE_BIG5}, |
+ // MSIE does not recognize "CNS" (XSS issue), |
+ // and EUC-TW is rare, so we prefer Big5 for output. |
+ { "BIG5-CP950", |
+ "BIG5-CP950", // Not an IANA name |
+ CHINESE_BIG5 |
+ // MSIE does not recognize "BIG5-CP950" (XSS issue) |
+ }, |
+ { "CP932", "CP932", // Not an IANA name |
+ JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue) |
+ { "UTF8", "UTF-8", UTF8}, |
+ { "Unknown", |
+ "x-unknown", // Not an IANA name |
+ UTF8}, // UTF-8 is our default output encoding |
+ { "ASCII-7-bit", "US-ASCII", ASCII_7BIT}, |
+ { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R}, |
+ { "CP1251", "windows-1251", RUSSIAN_CP1251}, |
+ { "CP1252", "windows-1252", MSFT_CP1252}, |
+ { "KOI8U", |
+ "KOI8-U", |
+ ISO_8859_5}, // because koi8-u is not as common |
+ { "CP1250", "windows-1250", MSFT_CP1250}, |
+ { "ISO-8859-15", "ISO-8859-15", ISO_8859_15}, |
+ { "CP1254", "windows-1254", MSFT_CP1254}, |
+ { "CP1257", "windows-1257", MSFT_CP1257}, |
+ { "ISO-8859-11", "ISO-8859-11", ISO_8859_11}, |
+ { "CP874", "windows-874", MSFT_CP874}, |
+ { "CP1256", "windows-1256", MSFT_CP1256}, |
+ { "CP1255", "windows-1255", MSFT_CP1255}, |
+ { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255}, |
+ // Java does not support iso-8859-8-i |
+ { "VISUAL", "ISO-8859-8", MSFT_CP1255}, |
+ // we do not endorse the visual order |
+ { "CP852", "cp852", MSFT_CP1250}, |
+ // because cp852 is not as common |
+ { "CSN_369103", "csn_369103", MSFT_CP1250}, |
+ // MSIE does not recognize "csn_369103" (XSS issue) |
+ { "CP1253", "windows-1253", MSFT_CP1253}, |
+ { "CP866", "IBM866", RUSSIAN_CP1251}, |
+ // because cp866 is not as common |
+ { "ISO-8859-13", "ISO-8859-13", UTF8}, |
+ // because iso-8859-13 is not widely supported |
+ { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR}, |
+ // due to potential confusion with HTML syntax chars |
+ { "GBK", "GBK", GBK}, |
+ { "GB18030", "GB18030", GBK}, |
+ // because gb18030 is not widely supported |
+ { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5}, |
+ // because Big5-HKSCS is not widely supported |
+ { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB}, |
+ // due to potential confusion with HTML syntax chars |
+ { "TSCII", "tscii", UTF8}, |
+ // we do not have an output converter for this font encoding |
+ { "TAM", "tam", UTF8}, |
+ // we do not have an output converter for this font encoding |
+ { "TAB", "tab", UTF8}, |
+ // we do not have an output converter for this font encoding |
+ { "JAGRAN", "jagran", UTF8}, |
+ // we do not have an output converter for this font encoding |
+ { "MACINTOSH", "MACINTOSH", ISO_8859_1}, |
+ // because macintosh is relatively uncommon |
+ { "UTF7", "UTF-7", |
+ UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated |
+ { "BHASKAR", "bhaskar", |
+ UTF8}, // we do not have an output converter for this font encoding |
+ { "HTCHANAKYA", "htchanakya", // not an IANA charset name. |
+ UTF8}, // we do not have an output converter for this font encoding |
+ { "UTF-16BE", "UTF-16BE", |
+ UTF8}, // due to potential confusion with HTML syntax chars |
+ { "UTF-16LE", "UTF-16LE", |
+ UTF8}, // due to potential confusion with HTML syntax chars |
+ { "UTF-32BE", "UTF-32BE", |
+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web |
+ { "UTF-32LE", "UTF-32LE", |
+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web |
+ { "X-BINARYENC", "x-binaryenc", // Not an IANA name |
+ UTF8}, // because this one is not intended for output (just input) |
+ { "HZ-GB-2312", "HZ-GB-2312", |
+ CHINESE_GB}, // due to potential confusion with HTML syntax chars |
+ { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name |
+ UTF8}, // because this one is not intended for output (just input) |
+ { "X-TAM-ELANGO", "x-tam-elango", |
+ UTF8}, // we do not have an output converter for this font encoding |
+ { "X-TAM-LTTMBARANI", "x-tam-lttmbarani", |
+ UTF8}, // we do not have an output converter for this font encoding |
+ { "X-TAM-SHREE", "x-tam-shree", |
+ UTF8}, // we do not have an output converter for this font encoding |
+ { "X-TAM-TBOOMIS", "x-tam-tboomis", |
+ UTF8}, // we do not have an output converter for this font encoding |
+ { "X-TAM-TMNEWS", "x-tam-tmnews", |
+ UTF8}, // we do not have an output converter for this font encoding |
+ { "X-TAM-WEBTAMIL", "x-tam-webtamil", |
+ UTF8}, // we do not have an output converter for this font encoding |
+ |
+ { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, |
+ // KDDI version of Shift_JIS with Google Emoji PUA mappings. |
+ // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses |
+ // "Shift_JIS" in HTTP headers and email messages. |
+ |
+ { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, |
+ // DoCoMo version of Shift_JIS with Google Emoji PUA mappings. |
+ // See the comment at KDDI_SHIFT_JIS for other issues. |
+ |
+ { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS}, |
+ // SoftBank version of Shift_JIS with Google Emoji PUA mappings. |
+ // See the comment at KDDI_SHIFT_JIS for other issues. |
+ |
+ { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, |
+ // KDDI version of ISO-2022-JP with Google Emoji PUA mappings. |
+ // See the comment at KDDI_SHIFT_JIS for other issues. |
+ // The preferred Web encoding is due to potential confusion with |
+ // HTML syntax chars. |
+ |
+ { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS}, |
+ // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings. |
+ // See the comment at KDDI_SHIFT_JIS for other issues. |
+ // The preferred Web encoding is due to potential confusion with |
+ // HTML syntax chars. |
+ |
+ // Please refer to the NOTE: section in the comments in the definition |
+ // of "struct EncodingInfo" (above), before adding new encodings. |
+ |
+}; |
+ |
+ |
+ |
+// Compile-time check that kEncodingInfoTable has exactly NUM_ENCODINGS |
+// rows, so that indexing it with an Encoding value stays in bounds. |
+COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS, |
+ kEncodingInfoTable_has_incorrect_size); |
+ |
+// Returns the default encoding, LATIN1. |
+Encoding default_encoding() { |
+  return LATIN1; |
+} |
+ |
+// ************************************************************* |
+// Encoding predicates |
+// IsValidEncoding() |
+// IsEncEncCompatible |
+// IsEncodingWithSupportedLanguage |
+// IsSupersetOfAscii7Bit |
+// Is8BitEncoding |
+// IsCJKEncoding |
+// IsHebrewEncoding |
+// IsRightToLeftEncoding |
+// IsLogicalRightToLeftEncoding |
+// IsVisualRightToLeftEncoding |
+// IsIso2022Encoding |
+// IsIso2022JpOrVariant |
+// IsShiftJisOrVariant |
+// IsJapaneseCellPhoneCarrierSpecificEncoding |
+// ************************************************************* |
+ |
+// Returns true iff enc lies in the half-open range [0, kNumEncodings). |
+bool IsValidEncoding(Encoding enc) { |
+  if (enc < 0) { |
+    return false; |
+  } |
+  return enc < kNumEncodings; |
+} |
+ |
+// Tests compatibility between the "from" and "to" encodings. In the |
+// typical case -- both are valid, known encodings -- the result is true |
+// iff converting from the first to the second is a no-op. |
+bool IsEncEncCompatible(const Encoding from, const Encoding to) { |
+  // We only work with valid encodings. |
+  if (!(IsValidEncoding(from) && IsValidEncoding(to))) { |
+    return false; |
+  } |
+  // The trivial common case: identical encodings. |
+  if (from == to) { |
+    return true; |
+  } |
+  // Every valid encoding is compatible with the unknown encoding... |
+  if (to == UNKNOWN_ENCODING) { |
+    return true; |
+  } |
+  // ...but the unknown encoding is compatible with no known one. |
+  if (from == UNKNOWN_ENCODING) { |
+    return false; |
+  } |
+  // 7-bit ASCII is compatible with anything that is a superset of it. |
+  if (from == ASCII_7BIT) { |
+    return IsSupersetOfAscii7Bit(to); |
+  } |
+ |
+  // Remaining compatible pairs, keyed by the source encoding. |
+  switch (from) { |
+    case ISO_8859_1: |
+      return to == MSFT_CP1252; |
+    case ISO_8859_8: |
+      return to == HEBREW_VISUAL; |
+    case HEBREW_VISUAL: |
+      return to == ISO_8859_8; |
+    case ISO_8859_9: |
+      return to == MSFT_CP1254; |
+    case ISO_8859_11: |
+      return to == MSFT_CP874; |
+    case JAPANESE_SHIFT_JIS: |
+      return to == JAPANESE_CP932; |
+    case CHINESE_BIG5: |
+      return to == CHINESE_BIG5_CP950; |
+    case CHINESE_GB: |
+      return to == GBK || to == GB18030; |
+    case CHINESE_EUC_CN: |
+      return to == CHINESE_EUC_DEC || to == CHINESE_CNS; |
+    case CHINESE_EUC_DEC: |
+      return to == CHINESE_EUC_CN || to == CHINESE_CNS; |
+    case CHINESE_CNS: |
+      return to == CHINESE_EUC_CN || to == CHINESE_EUC_DEC; |
+    default: |
+      return false; |
+  } |
+} |
+ |
+// An encoding is a superset of 7-bit ASCII when bytes 0...127 in that |
+// encoding represent the same characters as they do in ISO_8859_1. |
+// |
+// TODO: This list could be expanded. Many other encodings are supersets |
+// of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two |
+// encodings that I know for a fact should *not* be in this list. |
+bool IsSupersetOfAscii7Bit(Encoding e) { |
+  switch (e) { |
+    // ASCII itself, UTF-8 and the unknown encoding. |
+    case ASCII_7BIT: |
+    case UTF8: |
+    case UNKNOWN_ENCODING: |
+    // ISO-8859 family (plus visual Hebrew). |
+    case ISO_8859_1: |
+    case ISO_8859_2: |
+    case ISO_8859_3: |
+    case ISO_8859_4: |
+    case ISO_8859_5: |
+    case ISO_8859_6: |
+    case ISO_8859_7: |
+    case ISO_8859_8: |
+    case ISO_8859_8_I: |
+    case ISO_8859_9: |
+    case ISO_8859_10: |
+    case ISO_8859_11: |
+    case ISO_8859_13: |
+    case ISO_8859_15: |
+    case HEBREW_VISUAL: |
+    // Microsoft code pages. |
+    case MSFT_CP874: |
+    case MSFT_CP1250: |
+    case MSFT_CP1252: |
+    case MSFT_CP1253: |
+    case MSFT_CP1254: |
+    case MSFT_CP1255: |
+    case MSFT_CP1256: |
+    case MSFT_CP1257: |
+    // Cyrillic. |
+    case RUSSIAN_KOI8_R: |
+    case RUSSIAN_KOI8_RU: |
+    case RUSSIAN_CP1251: |
+    case RUSSIAN_CP866: |
+    // Other single-byte encodings. |
+    case CZECH_CP852: |
+    case MACINTOSH_ROMAN: |
+    // Japanese. |
+    case JAPANESE_EUC_JP: |
+    case JAPANESE_SHIFT_JIS: |
+    case JAPANESE_CP932: |
+    // Chinese. |
+    case CHINESE_BIG5: |
+    case CHINESE_BIG5_CP950: |
+    case BIG5_HKSCS: |
+    case CHINESE_GB: |
+    case GBK: |
+    case GB18030: |
+    case CHINESE_EUC_CN: |
+    case CHINESE_EUC_DEC: |
+    case CHINESE_CNS: |
+    // Korean. |
+    case KOREAN_EUC_KR: |
+      return true; |
+    default: |
+      return false; |
+  } |
+} |
+ |
+// To be an 8-bit encoding means that there are fewer than 256 symbols: |
+// each byte determines a new character and there are no multi-byte |
+// sequences. |
+// |
+// TODO: This list could maybe be expanded. Other encodings may be 8-bit. |
+bool Is8BitEncoding(Encoding e) { |
+  bool single_byte = false; |
+  switch (e) { |
+    // 7-bit ASCII. |
+    case ASCII_7BIT: |
+    // ISO-8859 family. |
+    case ISO_8859_1: |
+    case ISO_8859_2: |
+    case ISO_8859_3: |
+    case ISO_8859_4: |
+    case ISO_8859_5: |
+    case ISO_8859_6: |
+    case ISO_8859_7: |
+    case ISO_8859_8: |
+    case ISO_8859_8_I: |
+    case ISO_8859_9: |
+    case ISO_8859_10: |
+    case ISO_8859_11: |
+    case ISO_8859_13: |
+    case ISO_8859_15: |
+    // Microsoft code pages. |
+    case MSFT_CP1252: |
+    case MSFT_CP1253: |
+    case MSFT_CP1254: |
+    case MSFT_CP1255: |
+    case MSFT_CP1256: |
+    case MSFT_CP1257: |
+    // Cyrillic. |
+    case RUSSIAN_KOI8_R: |
+    case RUSSIAN_KOI8_RU: |
+    case RUSSIAN_CP866: |
+      single_byte = true; |
+      break; |
+    default: |
+      break; |
+  } |
+  return single_byte; |
+} |
+ |
+// Returns true for the Chinese, Japanese and Korean encodings listed |
+// below, and false for everything else. |
+bool IsCJKEncoding(Encoding e) { |
+  switch (e) { |
+    // Japanese. |
+    case JAPANESE_EUC_JP: |
+    case JAPANESE_SHIFT_JIS: |
+    case JAPANESE_JIS: |
+    case JAPANESE_CP932: |
+    // Chinese. |
+    case CHINESE_BIG5: |
+    case CHINESE_BIG5_CP950: |
+    case BIG5_HKSCS: |
+    case CHINESE_GB: |
+    case GBK: |
+    case GB18030: |
+    case CHINESE_EUC_CN: |
+    case CHINESE_EUC_DEC: |
+    case CHINESE_CNS: |
+    case ISO_2022_CN: |
+    case HZ_GB_2312: |
+    // Korean. |
+    case KOREAN_EUC_KR: |
+    case ISO_2022_KR: |
+      return true; |
+    default: |
+      return false; |
+  } |
+} |
+ |
+// Returns true iff e is one of the four Hebrew encodings: ISO_8859_8, |
+// ISO_8859_8_I, MSFT_CP1255 or HEBREW_VISUAL. |
+bool IsHebrewEncoding(Encoding e) { |
+  switch (e) { |
+    case ISO_8859_8: |
+    case ISO_8859_8_I: |
+    case MSFT_CP1255: |
+    case HEBREW_VISUAL: |
+      return true; |
+    default: |
+      return false; |
+  } |
+} |
+ |
+ |
+ |
+// Returns true iff enc is one of the right-to-left encodings listed |
+// below, whether stored in logical or in visual order. |
+bool IsRightToLeftEncoding(Encoding enc) { |
+  const bool is_windows_rtl = (enc == MSFT_CP1255 || enc == MSFT_CP1256); |
+  const bool is_iso_rtl = (enc == ARABIC_ENCODING || |
+                           enc == HEBREW_ENCODING || |
+                           enc == ISO_8859_8_I); |
+  return is_windows_rtl || is_iso_rtl || enc == HEBREW_VISUAL; |
+} |
+ |
+// Returns true iff enc is a right-to-left encoding that is not a visual |
+// one. |
+bool IsLogicalRightToLeftEncoding(Encoding enc) { |
+  if (!IsRightToLeftEncoding(enc)) { |
+    return false; |
+  } |
+  return !IsVisualRightToLeftEncoding(enc); |
+} |
+ |
+// Returns true iff enc is HEBREW_ENCODING or HEBREW_VISUAL. |
+// |
+// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6) |
+// is NOT visual. |
+bool IsVisualRightToLeftEncoding(Encoding enc) { |
+  return enc == HEBREW_ENCODING || enc == HEBREW_VISUAL; |
+} |
+ |
+ |
+ |
+ |
+ |
+// Returns true iff enc is ISO-2022-JP (or a variant of it), ISO_2022_KR |
+// or ISO_2022_CN. |
+bool IsIso2022Encoding(Encoding enc) { |
+  if (IsIso2022JpOrVariant(enc)) { |
+    return true; |
+  } |
+  if (enc == ISO_2022_KR) { |
+    return true; |
+  } |
+  return enc == ISO_2022_CN; |
+} |
+ |
+// Returns true iff enc is JAPANESE_JIS or one of the carrier-specific |
+// ISO-2022-JP variants (KDDI, SoftBank). |
+bool IsIso2022JpOrVariant(Encoding enc) { |
+  if (enc == JAPANESE_JIS) { |
+    return true; |
+  } |
+  if (enc == KDDI_ISO_2022_JP) { |
+    return true; |
+  } |
+  return enc == SOFTBANK_ISO_2022_JP; |
+} |
+ |
+// Returns true iff enc is JAPANESE_SHIFT_JIS, JAPANESE_CP932 or one of |
+// the carrier-specific Shift_JIS variants (KDDI, DoCoMo, SoftBank). |
+bool IsShiftJisOrVariant(Encoding enc) { |
+  if (enc == JAPANESE_SHIFT_JIS || enc == JAPANESE_CP932) { |
+    return true; |
+  } |
+  if (enc == KDDI_SHIFT_JIS || enc == DOCOMO_SHIFT_JIS) { |
+    return true; |
+  } |
+  return enc == SOFTBANK_SHIFT_JIS; |
+} |
+ |
+// Returns true iff enc is one of the KDDI, DoCoMo or SoftBank variants |
+// of Shift_JIS or ISO-2022-JP. |
+bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) { |
+  const bool is_kddi = (enc == KDDI_ISO_2022_JP || enc == KDDI_SHIFT_JIS); |
+  const bool is_docomo = (enc == DOCOMO_SHIFT_JIS); |
+  const bool is_softbank = (enc == SOFTBANK_SHIFT_JIS || |
+                            enc == SOFTBANK_ISO_2022_JP); |
+  return is_kddi || is_docomo || is_softbank; |
+} |
+ |
+ |
+// ************************************************************* |
+// ENCODING NAMES |
+// EncodingName() [Encoding to name] |
+// MimeEncodingName() [Encoding to name] |
+// EncodingFromName() [name to Encoding] |
+// EncodingNameAliasToEncoding() [name to Encoding] |
+// default_encoding_name() |
+// invalid_encoding_name() |
+// ************************************************************* |
-// We do not use it, just to please a compiler and minimize ported |
-// code changes. |
+// Returns the internal name of enc, or invalid_encoding_name() when enc |
+// is outside the valid range. |
const char * EncodingName(const Encoding enc) { |
- return ""; |
+  if (!IsValidEncoding(enc)) { |
+    return invalid_encoding_name(); |
+  } |
+  return kEncodingInfoTable[enc].encoding_name_; |
+} |
+ |
+// TODO: Unify MimeEncodingName and EncodingName, or determine why |
+// such a unification is not possible. |
+ |
+// Returns the preferred MIME name of enc, or the empty string when enc |
+// is outside the valid range. |
+const char * MimeEncodingName(Encoding enc) { |
+  // TODO(jrm) Should this be invalid_encoding_name()? |
+  if (!IsValidEncoding(enc)) { |
+    return ""; |
+  } |
+  return kEncodingInfoTable[enc].mime_encoding_name_; |
+} |
+ |
+// Looks up an encoding by its internal name (the encoding_name_ column |
+// of kEncodingInfoTable), ignoring case. MIME names are not consulted. |
+// |
+// On a match, stores the encoding in *encoding and returns true. |
+// Otherwise stores UNKNOWN_ENCODING in *encoding and returns false; this |
+// includes the case where enc_name is NULL. If encoding itself is NULL |
+// there is nowhere to store a result, so the function returns false |
+// instead of dereferencing it. |
+bool EncodingFromName(const char* enc_name, Encoding *encoding) { |
+  if ( encoding == NULL ) return false; |
+  *encoding = UNKNOWN_ENCODING; |
+  if ( enc_name == NULL ) return false; |
+ |
+  // Linear scan: the row index is the Encoding value. |
+  for ( int i = 0; i < kNumEncodings; i++ ) { |
+    if ( !strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) { |
+      *encoding = static_cast<Encoding>(i); |
+      return true; |
+    } |
+  } |
+  return false; |
+} |
+ |
+#if 0 |
+// The encoding_map maps standard and non-standard encoding-names |
+// (strings) to Encoding enums. It is used only by |
+// EncodingNameAliasToEncoding. Note that the map uses |
+// case-insensitive hash and comparison functions. |
+ |
+typedef hash_map <const char *, Encoding, |
+ CStringAlnumCaseHash, |
+ CStringAlnumCaseEqual> EncodingMap; |
+ |
+static EncodingMap encoding_map; |
+ |
+// Mutex for locking the code that initializes encoding_map. |
+// static Mutex encodings_init_mutex(base::LINKER_INITIALIZED); |
+ |
+void InitEncodings() { |
+ // For thread safety, keep a mutex while initializing this map. |
+ // Also allow this function to be called more than once and |
+ // gracefully exiting if that occurs. |
+ // MutexLock lock(&encodings_init_mutex); |
+ if (!encoding_map.empty()) { |
+ // Already initialized |
+ return; |
+ } |
+ |
+ // Initialize the map with all the "standard" encoding names, |
+ // i.e., the ones returned by EncodingName and MimeEncodingName. |
+ // |
+ // First, add internal encoding names returned by EncodingName(). |
+ for (int i = 0; i < NUM_ENCODINGS; ++i) { |
+ Encoding e = static_cast<Encoding>(i); |
+ // Internal encoding names must be unique. |
+ // The internal names are guaranteed to be unique by the CHECK_EQ. |
+ const char *encoding_name = EncodingName(e); |
+ CHECK_EQ(0, encoding_map.count(encoding_name)) |
+ << "Duplicate found for " << encoding_name; |
+ encoding_map[encoding_name] = e; |
+ } |
+ // Then, add mime encoding names returned by MimeEncodingName(). |
+ // We don't override existing entries, to give precedence to entries |
+ // added earlier. |
+ for (int i = 0; i < NUM_ENCODINGS; ++i) { |
+ Encoding e = static_cast<Encoding>(i); |
+ // Note that MimeEncodingName() can return the same mime encoding |
+ // name for different encoding enums like JAPANESE_SHIFT_JIS and |
+ // KDDI_SHIFT_JIS. In that case, the encoding enum first seen |
+ // will be the value for the encoding name in the map. |
+ const char *mime_encoding_name = MimeEncodingName(e); |
+ if (encoding_map.count(mime_encoding_name) == 0) { |
+ encoding_map[mime_encoding_name] = e; |
+ } |
+ } |
+ |
+ // Add some non-standard names: alternate spellings, common typos, |
+ // etc. (It does no harm to add names already in the map.) Note |
+ // that although the map is case-insensitive, by convention the |
+ // keys are written here in lower case. For ease of maintenance, |
+ // they are listed in alphabetical order. |
+ encoding_map["5601"] = KOREAN_EUC_KR; |
+ encoding_map["646"] = ASCII_7BIT; |
+ encoding_map["852"] = CZECH_CP852; |
+ encoding_map["866"] = RUSSIAN_CP866; |
+ encoding_map["8859-1"] = ISO_8859_1; |
+ encoding_map["ansi-1251"] = RUSSIAN_CP1251; |
+ encoding_map["ansi_x3.4-1968"] = ASCII_7BIT; |
+ encoding_map["arabic"] = ISO_8859_6; |
+ encoding_map["ascii"] = ISO_8859_1; |
+ encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard |
+ encoding_map["asmo-708"] = ISO_8859_6; |
+ encoding_map["bhaskar"] = BHASKAR; |
+ encoding_map["big5"] = CHINESE_BIG5; |
+ encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard |
+ encoding_map["big5-hkscs"] = BIG5_HKSCS; |
+ encoding_map["chinese"] = CHINESE_GB; |
+ encoding_map["cns"] = CHINESE_CNS; // not iana standard |
+ encoding_map["cns11643"] = CHINESE_CNS; |
+ encoding_map["cp1250"] = MSFT_CP1250; // not iana standard |
+ encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard |
+ encoding_map["cp1252"] = MSFT_CP1252; // not iana standard |
+ encoding_map["cp1253"] = MSFT_CP1253; // not iana standard |
+ encoding_map["cp1254"] = MSFT_CP1254; // not iana standard |
+ encoding_map["cp1255"] = MSFT_CP1255; |
+ encoding_map["cp1256"] = MSFT_CP1256; |
+ encoding_map["cp1257"] = MSFT_CP1257; // not iana standard |
+ encoding_map["cp819"] = ISO_8859_1; |
+ encoding_map["cp852"] = CZECH_CP852; |
+ encoding_map["cp866"] = RUSSIAN_CP866; |
+ encoding_map["cp-866"] = RUSSIAN_CP866; |
+ encoding_map["cp874"] = MSFT_CP874; |
+ encoding_map["cp932"] = JAPANESE_CP932; // not iana standard |
+ encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard |
+ encoding_map["csbig5"] = CHINESE_BIG5; |
+ encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP; |
+ encoding_map["cseuckr"] = KOREAN_EUC_KR; |
+ encoding_map["csgb2312"] = CHINESE_GB; |
+ encoding_map["csibm852"] = CZECH_CP852; |
+ encoding_map["csibm866"] = RUSSIAN_CP866; |
+ encoding_map["csiso2022jp"] = JAPANESE_JIS; |
+ encoding_map["csiso2022kr"] = ISO_2022_KR; |
+ encoding_map["csiso58gb231280"] = CHINESE_GB; |
+ encoding_map["csiso88598i"] = ISO_8859_8_I; |
+ encoding_map["csisolatin1"] = ISO_8859_1; |
+ encoding_map["csisolatin2"] = ISO_8859_2; |
+ encoding_map["csisolatin3"] = ISO_8859_3; |
+ encoding_map["csisolatin4"] = ISO_8859_4; |
+ encoding_map["csisolatin5"] = ISO_8859_9; |
+ encoding_map["csisolatin6"] = ISO_8859_10; |
+ encoding_map["csisolatinarabic"] = ISO_8859_6; |
+ encoding_map["csisolatincyrillic"] = ISO_8859_5; |
+ encoding_map["csisolatingreek"] = ISO_8859_7; |
+ encoding_map["csisolatinhebrew"] = ISO_8859_8; |
+ encoding_map["csksc56011987"] = KOREAN_EUC_KR; |
+ encoding_map["csmacintosh"] = MACINTOSH_ROMAN; |
+ encoding_map["csn-369103"] = CZECH_CSN_369103; |
+ encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS; |
+ encoding_map["csunicode"] = UTF16BE; |
+ encoding_map["csunicode11"] = UTF16BE; |
+ encoding_map["csunicode11utf7"] = UTF7; |
+ encoding_map["csunicodeascii"] = UTF16BE; |
+ encoding_map["csunicodelatin1"] = UTF16BE; |
+ encoding_map["cyrillic"] = ISO_8859_5; |
+ encoding_map["ecma-114"] = ISO_8859_6; |
+ encoding_map["ecma-118"] = ISO_8859_7; |
+ encoding_map["elot_928"] = ISO_8859_7; |
+ encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard |
+ encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard |
+ encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard |
+ encoding_map["euc-jp"] = JAPANESE_EUC_JP; |
+ encoding_map["euc-kr"] = KOREAN_EUC_KR; |
+ encoding_map["eucgb2312_cn"] = CHINESE_GB; |
+ encoding_map["gb"] = CHINESE_GB; // not iana standard |
+ encoding_map["gb18030"] = GB18030; |
+ encoding_map["gb2132"] = CHINESE_GB; // common typo |
+ encoding_map["gb2312"] = CHINESE_GB; |
+ encoding_map["gb_2312-80"] = CHINESE_GB; |
+ encoding_map["gbk"] = GBK; |
+ encoding_map["greek"] = ISO_8859_7; |
+ encoding_map["greek8"] = ISO_8859_7; |
+ encoding_map["hebrew"] = ISO_8859_8; |
+ encoding_map["htchanakya"] = HTCHANAKYA; |
+ encoding_map["hz-gb-2312"] = HZ_GB_2312; |
+ encoding_map["ibm819"] = ISO_8859_1; |
+ encoding_map["ibm852"] = CZECH_CP852; |
+ encoding_map["ibm874"] = MSFT_CP874; |
+ encoding_map["iso-10646"] = UTF16BE; |
+ encoding_map["iso-10646-j-1"] = UTF16BE; |
+ encoding_map["iso-10646-ucs-2"] = UNICODE; |
+ encoding_map["iso-10646-ucs-4"] = UTF32BE; |
+ encoding_map["iso-10646-ucs-basic"] = UTF16BE; |
+ encoding_map["iso-10646-unicode-latin1"] = UTF16BE; |
+ encoding_map["iso-2022-cn"] = ISO_2022_CN; |
+ encoding_map["iso-2022-jp"] = JAPANESE_JIS; |
+ encoding_map["iso-2022-kr"] = ISO_2022_KR; |
+ encoding_map["iso-8559-1"] = ISO_8859_1; // common typo |
+ encoding_map["iso-874"] = MSFT_CP874; |
+ encoding_map["iso-8858-1"] = ISO_8859_1; // common typo |
+ // iso-8859-0 was a temporary name, eventually renamed iso-8859-15 |
+ encoding_map["iso-8859-0"] = ISO_8859_15; |
+ encoding_map["iso-8859-1"] = ISO_8859_1; |
+ encoding_map["iso-8859-10"] = ISO_8859_10; |
+ encoding_map["iso-8859-11"] = ISO_8859_11; |
+ encoding_map["iso-8859-13"] = ISO_8859_13; |
+ encoding_map["iso-8859-15"] = ISO_8859_15; |
+ encoding_map["iso-8859-2"] = ISO_8859_2; |
+ encoding_map["iso-8859-3"] = ISO_8859_3; |
+ encoding_map["iso-8859-4"] = ISO_8859_4; |
+ encoding_map["iso-8859-5"] = ISO_8859_5; |
+ encoding_map["iso-8859-6"] = ISO_8859_6; |
+ encoding_map["iso-8859-7"] = ISO_8859_7; |
+ encoding_map["iso-8859-8"] = ISO_8859_8; |
+ encoding_map["iso-8859-8-i"] = ISO_8859_8_I; |
+ encoding_map["iso-8859-9"] = ISO_8859_9; |
+ encoding_map["iso-9959-1"] = ISO_8859_1; // common typo |
+ encoding_map["iso-ir-100"] = ISO_8859_1; |
+ encoding_map["iso-ir-101"] = ISO_8859_2; |
+ encoding_map["iso-ir-109"] = ISO_8859_3; |
+ encoding_map["iso-ir-110"] = ISO_8859_4; |
+ encoding_map["iso-ir-126"] = ISO_8859_7; |
+ encoding_map["iso-ir-127"] = ISO_8859_6; |
+ encoding_map["iso-ir-138"] = ISO_8859_8; |
+ encoding_map["iso-ir-144"] = ISO_8859_5; |
+ encoding_map["iso-ir-148"] = ISO_8859_9; |
+ encoding_map["iso-ir-149"] = KOREAN_EUC_KR; |
+ encoding_map["iso-ir-157"] = ISO_8859_10; |
+ encoding_map["iso-ir-58"] = CHINESE_GB; |
+ encoding_map["iso-latin-1"] = ISO_8859_1; |
+ encoding_map["iso_2022-cn"] = ISO_2022_CN; |
+ encoding_map["iso_2022-kr"] = ISO_2022_KR; |
+ encoding_map["iso_8859-1"] = ISO_8859_1; |
+ encoding_map["iso_8859-10:1992"] = ISO_8859_10; |
+ encoding_map["iso_8859-11"] = ISO_8859_11; |
+ encoding_map["iso_8859-13"] = ISO_8859_13; |
+ encoding_map["iso_8859-15"] = ISO_8859_15; |
+ encoding_map["iso_8859-1:1987"] = ISO_8859_1; |
+ encoding_map["iso_8859-2"] = ISO_8859_2; |
+ encoding_map["iso_8859-2:1987"] = ISO_8859_2; |
+ encoding_map["iso_8859-3"] = ISO_8859_3; |
+ encoding_map["iso_8859-3:1988"] = ISO_8859_3; |
+ encoding_map["iso_8859-4"] = ISO_8859_4; |
+ encoding_map["iso_8859-4:1988"] = ISO_8859_4; |
+ encoding_map["iso_8859-5"] = ISO_8859_5; |
+ encoding_map["iso_8859-5:1988"] = ISO_8859_5; |
+ encoding_map["iso_8859-6"] = ISO_8859_6; |
+ encoding_map["iso_8859-6:1987"] = ISO_8859_6; |
+ encoding_map["iso_8859-7"] = ISO_8859_7; |
+ encoding_map["iso_8859-7:1987"] = ISO_8859_7; |
+ encoding_map["iso_8859-8"] = ISO_8859_8; |
+ encoding_map["iso_8859-8:1988:"] = ISO_8859_8; |
+ encoding_map["iso_8859-9"] = ISO_8859_9; |
+ encoding_map["iso_8859-9:1989"] = ISO_8859_9; |
+ encoding_map["jagran"] = JAGRAN; |
+ encoding_map["jis"] = JAPANESE_JIS; // not iana standard |
+ encoding_map["koi8-cs"] = CZECH_CSN_369103; |
+ encoding_map["koi8-r"] = RUSSIAN_KOI8_R; |
+ encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard |
+ encoding_map["koi8-u"] = RUSSIAN_KOI8_RU; |
+ encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard |
+ encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard |
+ encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant |
+ encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard |
+ encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard |
+ encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR; |
+ encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard |
+ encoding_map["l1"] = ISO_8859_1; |
+ encoding_map["l2"] = ISO_8859_2; |
+ encoding_map["l3"] = ISO_8859_3; |
+ encoding_map["l4"] = ISO_8859_4; |
+ encoding_map["l5"] = ISO_8859_9; |
+ encoding_map["l6"] = ISO_8859_10; |
+ encoding_map["latin-1"] = ISO_8859_1; // not iana standard |
+ encoding_map["latin1"] = ISO_8859_1; |
+ encoding_map["latin2"] = ISO_8859_2; |
+ encoding_map["latin3"] = ISO_8859_3; |
+ encoding_map["latin4"] = ISO_8859_4; |
+ encoding_map["latin5"] = ISO_8859_9; |
+ encoding_map["latin6"] = ISO_8859_10; |
+ encoding_map["mac"] = MACINTOSH_ROMAN; |
+ encoding_map["macintosh"] = MACINTOSH_ROMAN; |
+ encoding_map["macintosh-roman"] = MACINTOSH_ROMAN; |
+ encoding_map["ms932"] = JAPANESE_CP932; // not iana standard |
+ encoding_map["ms_kanji"] = JAPANESE_CP932; |
+ encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS; |
+ encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS; |
+ encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard |
+ encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard |
+ encoding_map["sun_eu_greek"] = ISO_8859_7; |
+ encoding_map["tab"] = TAMIL_BI; |
+ encoding_map["tam"] = TAMIL_MONO; |
+ encoding_map["tis-620"] = ISO_8859_11; |
+ encoding_map["tscii"] = TSCII; |
+ encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard |
+ encoding_map["unicode"] = UNICODE; // not iana standard |
+ encoding_map["unicode-1-1-utf-7"] = UTF7; |
+ encoding_map["unicode-1-1-utf-8"] = UTF8; |
+ encoding_map["unicode-2-0-utf-7"] = UTF7; |
+ encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard |
+ encoding_map["us"] = ISO_8859_1; |
+ encoding_map["us-ascii"] = ISO_8859_1; |
+ encoding_map["utf-16be"] = UTF16BE; |
+ encoding_map["utf-16le"] = UTF16LE; |
+ encoding_map["utf-32be"] = UTF32BE; |
+ encoding_map["utf-32le"] = UTF32LE; |
+ encoding_map["utf-7"] = UTF7; |
+ encoding_map["utf-8"] = UTF8; |
+ encoding_map["utf7"] = UTF7; |
+ encoding_map["utf8"] = UTF8; // not iana standard |
+ encoding_map["visual"] = HEBREW_VISUAL; |
+ encoding_map["win-1250"] = MSFT_CP1250; // not iana standard |
+ encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard |
+ encoding_map["window-874"] = MSFT_CP874; |
+ encoding_map["windows-1250"] = MSFT_CP1250; |
+ encoding_map["windows-1251"] = RUSSIAN_CP1251; |
+ encoding_map["windows-1252"] = MSFT_CP1252; |
+ encoding_map["windows-1253"] = MSFT_CP1253; |
+ encoding_map["windows-1254"] = MSFT_CP1254; |
+ encoding_map["windows-1255"] = MSFT_CP1255; |
+ encoding_map["windows-1256"] = MSFT_CP1256; |
+ encoding_map["windows-1257"] = MSFT_CP1257; |
+ encoding_map["windows-31j"] = JAPANESE_CP932; |
+ encoding_map["windows-874"] = MSFT_CP874; |
+ encoding_map["windows-936"] = GBK; |
+ encoding_map["x-big5"] = CHINESE_BIG5; |
+ encoding_map["x-binaryenc"] = BINARYENC; // not iana standard |
+ encoding_map["x-cp1250"] = MSFT_CP1250; |
+ encoding_map["x-cp1251"] = RUSSIAN_CP1251; |
+ encoding_map["x-cp1252"] = MSFT_CP1252; |
+ encoding_map["x-cp1253"] = MSFT_CP1253; |
+ encoding_map["x-cp1254"] = MSFT_CP1254; |
+ encoding_map["x-cp1255"] = MSFT_CP1255; |
+ encoding_map["x-cp1256"] = MSFT_CP1256; |
+ encoding_map["x-cp1257"] = MSFT_CP1257; |
+ encoding_map["x-euc-jp"] = JAPANESE_EUC_JP; |
+ encoding_map["x-euc-tw"] = CHINESE_CNS; |
+ encoding_map["x-gbk"] = GBK; |
+ encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE; |
+ encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE; |
+ encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE; |
+ encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE; |
+ encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard |
+ encoding_map["x-mac-roman"] = MACINTOSH_ROMAN; |
+ encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard |
+ encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS; |
+ encoding_map["x-unicode-2-0-utf-7"] = UTF7; |
+ encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard |
+ encoding_map["x-x-big5"] = CHINESE_BIG5; |
+ encoding_map["zh_cn.euc"] = CHINESE_GB; |
+ encoding_map["zh_tw-big5"] = CHINESE_BIG5; |
+ encoding_map["zh_tw-euc"] = CHINESE_CNS; |
+ |
+ // Remove the entry for the empty string, if any. |
+ encoding_map.erase(""); |
+} |
+ |
+REGISTER_MODULE_INITIALIZER(encodings, { |
+ InitEncodings(); |
+}); |
+ |
+// ---------------------------------------------------------------------- |
+// EncodingNameAliasToEncoding() |
+// |
+// This function takes an encoding name/alias and returns the Encoding |
+// enum. The input is case insensitive. It is the union of the common |
+// IANA standard names, the charset names used in Netscape Navigator, |
+// and some common names we have been using. |
+// See: http://www.iana.org/assignments/character-sets |
+// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html |
+// |
+// UNKNOWN_ENCODING is returned if none matches. |
+// |
+// TODO: Check if it is possible to remove the non-standard, |
+// non-netscape-use names. It is because this routine is used for |
+// encoding detections from html meta info. Non-standard names may |
+// introduce noise on encoding detection. |
+// |
+// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName, |
+// or determine why such a unification is not possible. |
+// ---------------------------------------------------------------------- |
+Encoding EncodingNameAliasToEncoding(const char *encoding_name) { |
+ if (!encoding_name) { |
+ return UNKNOWN_ENCODING; |
+ } |
+ |
+ // The map is initialized during InitGoogle() in a thread-safe manner. |
+ CHECK(!encoding_map.empty()) << ": Must call InitGoogle()"; |
+ |
+ EncodingMap::iterator emi = encoding_map.find(encoding_name); |
+ if (emi != encoding_map.end()) { |
+ return emi->second; |
+ } else { |
+ return UNKNOWN_ENCODING; |
+ } |
+} |
+#endif |
+ |
+// Returns the internal name of the LATIN1 encoding. |
+const char* default_encoding_name() { |
+  const EncodingInfo& latin1_info = kEncodingInfoTable[LATIN1]; |
+  return latin1_info.encoding_name_; |
+} |
+ |
+// Name reported for Encoding values outside the valid range. |
+static const char kInvalidEncodingName[] = "invalid_encoding"; |
+ |
+// Returns the placeholder name used for invalid encodings. |
+const char *invalid_encoding_name() { |
+  return kInvalidEncodingName; |
+} |
+ |
+ |
+ |
+// ************************************************************* |
+// Miscellany |
+// ************************************************************* |
+ |
+ |
+// Returns the encoding to use for Web output in place of enc, as |
+// recorded in kEncodingInfoTable. Invalid encodings map to UTF8. |
+Encoding PreferredWebOutputEncoding(Encoding enc) { |
+  if (!IsValidEncoding(enc)) { |
+    return UTF8; |
+  } |
+  return kEncodingInfoTable[enc].preferred_web_output_encoding_; |
} |