third_party/cld/encodings/internal/encodings.cc - Issue 1956183002: CL for perf tryjob on linux

Unified Diff: third_party/cld/encodings/internal/encodings.cc

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/cld/encodings/internal/encodings.cc

diff --git a/third_party/cld/encodings/internal/encodings.cc b/third_party/cld/encodings/internal/encodings.cc

index c8bf82ece85cd8389ff97d3bf881e41e593f90a1..82ef5e243581c09adfc058d4ebdc76bbf644108c 100644

--- a/third_party/cld/encodings/internal/encodings.cc

+++ b/third_party/cld/encodings/internal/encodings.cc

@@ -1,12 +1,898 @@

-// Use of this source code is governed by a BSD-style license that can be

-// found in the LICENSE file.

+// Author: jrm@google.com (Jim Meehan)

#include "encodings/public/encodings.h"

+#include <string.h> // for strcasecmp

+//#include <hash_map> // for _Hashtable_iterator, etc

+#include <utility> // for pair

+//#include "base/googleinit.h" // for REGISTER_MODULE_INITIALIZER

+//#include "base/logging.h" // for operator<<, Check_EQImpl, etc

+//#include "base/macros.h" // for COMPILE_ASSERT, etc

+//#include "base/mutex.h" // for Mutex, MutexLock

+//#include "util/hash/case_insensitive_hash.h"

+//#include "util/hash/hash.h"

+#include "encodings/compact_lang_det/win/cld_basictypes.h"

+#include "encodings/compact_lang_det/win/cld_logging.h"

+#include "encodings/compact_lang_det/win/cld_macros.h"

+struct EncodingInfo {

+ // The standard name for this encoding.

+ //

+ const char* encoding_name_;

+ // The "preferred MIME name" of an encoding as specified by the IANA at:

+ // http://www.iana.org/assignments/character-sets

+ //

+ // Note that the preferred MIME name may differ slightly from the

+ // official IANA name: i.e. ISO-8859-1 vs. ISO_8859-1:1987

+ //

+ const char* mime_encoding_name_;

+ // NOTE: As of January 2007, it is a Google requirement that if an

+ // encoding has an IANA name, then encoding_name_ and

+ // mime_encoding_name_ must be the same string.

+ //

+ // However, there can be exceptions if there are compelling reasons.

+ // For example, Japanese mobile handsets require the name

+ // "Shift_JIS" in charset=... parameter in Content-Type headers to

+ // process emoji (emoticons) in their private encodings. In that

+ // case, mime_encoding_name_ should be "Shift_JIS", despite

+ // encoding_name_ actually is "X-KDDI-Shift_JIS".

+ // Some multi-byte encodings use byte values that coincide with the

+ // ASCII codes for HTML syntax characters <>"&' and browsers like MSIE

+ // can misinterpret these, as indicated in an external XSS report from

+ // 2007-02-15. Here, we map these dangerous encodings to safer ones. We

+ // also use UTF8 instead of encodings that we don't support in our

+ // output, and we generally try to be conservative in what we send out.

+ // Where the client asks for single- or double-byte encodings that are

+ // not as common, we substitute a more common single- or double-byte

+ // encoding, if there is one, thereby preserving the client's intent

+ // to use less space than UTF-8. This also means that characters

+ // outside the destination set will be converted to HTML NCRs (&#NNN;)

+ // if requested.

+ Encoding preferred_web_output_encoding_;

+};

+static const EncodingInfo kEncodingInfoTable[] = {

+ { "ASCII", "ISO-8859-1", ISO_8859_1},

+ { "Latin2", "ISO-8859-2", ISO_8859_2},

+ { "Latin3", "ISO-8859-3", UTF8},

+ // MSIE 6 does not support ISO-8859-3 (XSS issue)

+ { "Latin4", "ISO-8859-4", ISO_8859_4},

+ { "ISO-8859-5", "ISO-8859-5", ISO_8859_5},

+ { "Arabic", "ISO-8859-6", ISO_8859_6},

+ { "Greek", "ISO-8859-7", ISO_8859_7},

+ { "Hebrew", "ISO-8859-8", MSFT_CP1255},

+ // we do not endorse the visual order

+ { "Latin5", "ISO-8859-9", ISO_8859_9},

+ { "Latin6", "ISO-8859-10", UTF8},

+ // MSIE does not support ISO-8859-10 (XSS issue)

+ { "EUC-JP", "EUC-JP", JAPANESE_EUC_JP},

+ { "SJS", "Shift_JIS", JAPANESE_SHIFT_JIS},

+ { "JIS", "ISO-2022-JP", JAPANESE_SHIFT_JIS},

+ // due to potential confusion with HTML syntax chars

+ { "BIG5", "Big5", CHINESE_BIG5},

+ { "GB", "GB2312", CHINESE_GB},

+ { "EUC-CN",

+ "EUC-CN",

+ // Misnamed. Should be EUC-TW.

+ CHINESE_BIG5},

+ // MSIE treats "EUC-CN" like GB2312, which is not EUC-TW,

+ // and EUC-TW is rare, so we prefer Big5 for output.

+ { "KSC", "EUC-KR", KOREAN_EUC_KR},

+ { "Unicode",

+ "UTF-16LE",

+ // Internet Explorer doesn't recognize "ISO-10646-UCS-2"

+ UTF8

+ // due to potential confusion with HTML syntax chars

+ },

+ { "EUC",

+ "EUC", // Misnamed. Should be EUC-TW.

+ CHINESE_BIG5

+ // MSIE does not recognize "EUC" (XSS issue),

+ // and EUC-TW is rare, so we prefer Big5 for output.

+ },

+ { "CNS",

+ "CNS", // Misnamed. Should be EUC-TW.

+ CHINESE_BIG5},

+ // MSIE does not recognize "CNS" (XSS issue),

+ // and EUC-TW is rare, so we prefer Big5 for output.

+ { "BIG5-CP950",

+ "BIG5-CP950", // Not an IANA name

+ CHINESE_BIG5

+ // MSIE does not recognize "BIG5-CP950" (XSS issue)

+ },

+ { "CP932", "CP932", // Not an IANA name

+ JAPANESE_SHIFT_JIS}, // MSIE does not recognize "CP932" (XSS issue)

+ { "UTF8", "UTF-8", UTF8},

+ { "Unknown",

+ "x-unknown", // Not an IANA name

+ UTF8}, // UTF-8 is our default output encoding

+ { "ASCII-7-bit", "US-ASCII", ASCII_7BIT},

+ { "KOI8R", "KOI8-R", RUSSIAN_KOI8_R},

+ { "CP1251", "windows-1251", RUSSIAN_CP1251},

+ { "CP1252", "windows-1252", MSFT_CP1252},

+ { "KOI8U",

+ "KOI8-U",

+ ISO_8859_5}, // because koi8-u is not as common

+ { "CP1250", "windows-1250", MSFT_CP1250},

+ { "ISO-8859-15", "ISO-8859-15", ISO_8859_15},

+ { "CP1254", "windows-1254", MSFT_CP1254},

+ { "CP1257", "windows-1257", MSFT_CP1257},

+ { "ISO-8859-11", "ISO-8859-11", ISO_8859_11},

+ { "CP874", "windows-874", MSFT_CP874},

+ { "CP1256", "windows-1256", MSFT_CP1256},

+ { "CP1255", "windows-1255", MSFT_CP1255},

+ { "ISO-8859-8-I", "ISO-8859-8-I", MSFT_CP1255},

+ // Java does not support iso-8859-8-i

+ { "VISUAL", "ISO-8859-8", MSFT_CP1255},

+ // we do not endorse the visual order

+ { "CP852", "cp852", MSFT_CP1250},

+ // because cp852 is not as common

+ { "CSN_369103", "csn_369103", MSFT_CP1250},

+ // MSIE does not recognize "csn_369103" (XSS issue)

+ { "CP1253", "windows-1253", MSFT_CP1253},

+ { "CP866", "IBM866", RUSSIAN_CP1251},

+ // because cp866 is not as common

+ { "ISO-8859-13", "ISO-8859-13", UTF8},

+ // because iso-8859-13 is not widely supported

+ { "ISO-2022-KR", "ISO-2022-KR", KOREAN_EUC_KR},

+ // due to potential confusion with HTML syntax chars

+ { "GBK", "GBK", GBK},

+ { "GB18030", "GB18030", GBK},

+ // because gb18030 is not widely supported

+ { "BIG5_HKSCS", "BIG5-HKSCS", CHINESE_BIG5},

+ // because Big5-HKSCS is not widely supported

+ { "ISO_2022_CN", "ISO-2022-CN", CHINESE_GB},

+ // due to potential confusion with HTML syntax chars

+ { "TSCII", "tscii", UTF8},

+ // we do not have an output converter for this font encoding

+ { "TAM", "tam", UTF8},

+ // we do not have an output converter for this font encoding

+ { "TAB", "tab", UTF8},

+ // we do not have an output converter for this font encoding

+ { "JAGRAN", "jagran", UTF8},

+ // we do not have an output converter for this font encoding

+ { "MACINTOSH", "MACINTOSH", ISO_8859_1},

+ // because macintosh is relatively uncommon

+ { "UTF7", "UTF-7",

+ UTF8}, // UTF-7 has been the subject of XSS attacks and is deprecated

+ { "BHASKAR", "bhaskar",

+ UTF8}, // we do not have an output converter for this font encoding

+ { "HTCHANAKYA", "htchanakya", // not an IANA charset name.

+ UTF8}, // we do not have an output converter for this font encoding

+ { "UTF-16BE", "UTF-16BE",

+ UTF8}, // due to potential confusion with HTML syntax chars

+ { "UTF-16LE", "UTF-16LE",

+ UTF8}, // due to potential confusion with HTML syntax chars

+ { "UTF-32BE", "UTF-32BE",

+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web

+ { "UTF-32LE", "UTF-32LE",

+ UTF8}, // unlikely to cause XSS bugs, but very uncommon on Web

+ { "X-BINARYENC", "x-binaryenc", // Not an IANA name

+ UTF8}, // because this one is not intended for output (just input)

+ { "HZ-GB-2312", "HZ-GB-2312",

+ CHINESE_GB}, // due to potential confusion with HTML syntax chars

+ { "X-UTF8UTF8", "x-utf8utf8", // Not an IANA name

+ UTF8}, // because this one is not intended for output (just input)

+ { "X-TAM-ELANGO", "x-tam-elango",

+ UTF8}, // we do not have an output converter for this font encoding

+ { "X-TAM-LTTMBARANI", "x-tam-lttmbarani",

+ UTF8}, // we do not have an output converter for this font encoding

+ { "X-TAM-SHREE", "x-tam-shree",

+ UTF8}, // we do not have an output converter for this font encoding

+ { "X-TAM-TBOOMIS", "x-tam-tboomis",

+ UTF8}, // we do not have an output converter for this font encoding

+ { "X-TAM-TMNEWS", "x-tam-tmnews",

+ UTF8}, // we do not have an output converter for this font encoding

+ { "X-TAM-WEBTAMIL", "x-tam-webtamil",

+ UTF8}, // we do not have an output converter for this font encoding

+ { "X-KDDI-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

+ // KDDI version of Shift_JIS with Google Emoji PUA mappings.

+ // Note that MimeEncodingName() returns "Shift_JIS", since KDDI uses

+ // "Shift_JIS" in HTTP headers and email messages.

+ { "X-DoCoMo-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

+ // DoCoMo version of Shift_JIS with Google Emoji PUA mappings.

+ // See the comment at KDDI_SHIFT_JIS for other issues.

+ { "X-SoftBank-Shift_JIS", "Shift_JIS", JAPANESE_SHIFT_JIS},

+ // SoftBank version of Shift_JIS with Google Emoji PUA mappings.

+ // See the comment at KDDI_SHIFT_JIS for other issues.

+ { "X-KDDI-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},

+ // KDDI version of ISO-2022-JP with Google Emoji PUA mappings.

+ // See the comment at KDDI_SHIFT_JIS for other issues.

+ // The preferred Web encoding is due to potential confusion with

+ // HTML syntax chars.

+ { "X-SoftBank-ISO-2022-JP", "ISO-2022-JP", JAPANESE_SHIFT_JIS},

+ // SoftBank version of ISO-2022-JP with Google Emoji PUA mappings.

+ // See the comment at KDDI_SHIFT_JIS for other issues.

+ // The preferred Web encoding is due to potential confusion with

+ // HTML syntax chars.

+ // Please refer to NOTE: section in the comments in the definition

+ // of "struct I18NInfoByEncoding", before adding new encodings.

+};

+COMPILE_ASSERT(arraysize(kEncodingInfoTable) == NUM_ENCODINGS,

+ kEncodingInfoTable_has_incorrect_size);

+Encoding default_encoding() {return LATIN1;}

+// *************************************************************

+// Encoding predicates

+// IsValidEncoding()

+// IsEncEncCompatible

+// IsEncodingWithSupportedLanguage

+// IsSupersetOfAscii7Bit

+// Is8BitEncoding

+// IsCJKEncoding

+// IsHebrewEncoding

+// IsRightToLeftEncoding

+// IsLogicalRightToLeftEncoding

+// IsVisualRightToLeftEncoding

+// IsIso2022Encoding

+// IsIso2022JpOrVariant

+// IsShiftJisOrVariant

+// IsJapaneseCellPhoneCarrierSpecificEncoding

+// *************************************************************

+bool IsValidEncoding(Encoding enc) {

+ return ((enc >= 0) && (enc < kNumEncodings));

+bool IsEncEncCompatible(const Encoding from, const Encoding to) {

+ // Tests compatibility between the "from" and "to" encodings; in

+ // the typical case -- when both are valid known encodings -- this

+ // returns true iff converting from first to second is a no-op.

+ if (!IsValidEncoding(from) || !IsValidEncoding(to)) {

+ return false; // we only work with valid encodings...

+ } else if (to == from) {

+ return true; // the trivial common case

+ }

+ if (to == UNKNOWN_ENCODING) {

+ return true; // all valid encodings are compatible with the unknown

+ }

+ if (from == UNKNOWN_ENCODING) {

+ return false; // no unknown encoding is compatible with one that is

+ }

+ if (from == ASCII_7BIT) {

+ return IsSupersetOfAscii7Bit(to);

+ }

+ return (from == ISO_8859_1 && to == MSFT_CP1252) ||

+ (from == ISO_8859_8 && to == HEBREW_VISUAL) ||

+ (from == HEBREW_VISUAL && to == ISO_8859_8) ||

+ (from == ISO_8859_9 && to == MSFT_CP1254) ||

+ (from == ISO_8859_11 && to == MSFT_CP874) ||

+ (from == JAPANESE_SHIFT_JIS && to == JAPANESE_CP932) ||

+ (from == CHINESE_BIG5 && to == CHINESE_BIG5_CP950) ||

+ (from == CHINESE_GB && to == GBK) ||

+ (from == CHINESE_GB && to == GB18030) ||

+ (from == CHINESE_EUC_CN && to == CHINESE_EUC_DEC) ||

+ (from == CHINESE_EUC_CN && to == CHINESE_CNS) ||

+ (from == CHINESE_EUC_DEC && to == CHINESE_EUC_CN) ||

+ (from == CHINESE_EUC_DEC && to == CHINESE_CNS) ||

+ (from == CHINESE_CNS && to == CHINESE_EUC_CN) ||

+ (from == CHINESE_CNS && to == CHINESE_EUC_DEC);

+// To be a superset of 7-bit Ascii means that bytes 0...127 in the given

+// encoding represent the same characters as they do in ISO_8859_1.

+// TODO: This list could be expanded. Many other encodings are supersets

+// of 7-bit Ascii. In fact, Japanese JIS and Unicode are the only two

+// encodings that I know for a fact should *not* be in this list.

+bool IsSupersetOfAscii7Bit(Encoding e) {

+ switch (e) {

+ case ISO_8859_1:

+ case ISO_8859_2:

+ case ISO_8859_3:

+ case ISO_8859_4:

+ case ISO_8859_5:

+ case ISO_8859_6:

+ case ISO_8859_7:

+ case ISO_8859_8:

+ case ISO_8859_9:

+ case ISO_8859_10:

+ case JAPANESE_EUC_JP:

+ case JAPANESE_SHIFT_JIS:

+ case CHINESE_BIG5:

+ case CHINESE_GB:

+ case CHINESE_EUC_CN:

+ case KOREAN_EUC_KR:

+ case CHINESE_EUC_DEC:

+ case CHINESE_CNS:

+ case CHINESE_BIG5_CP950:

+ case JAPANESE_CP932:

+ case UTF8:

+ case UNKNOWN_ENCODING:

+ case ASCII_7BIT:

+ case RUSSIAN_KOI8_R:

+ case RUSSIAN_CP1251:

+ case MSFT_CP1252:

+ case RUSSIAN_KOI8_RU:

+ case MSFT_CP1250:

+ case ISO_8859_15:

+ case MSFT_CP1254:

+ case MSFT_CP1257:

+ case ISO_8859_11:

+ case MSFT_CP874:

+ case MSFT_CP1256:

+ case MSFT_CP1255:

+ case ISO_8859_8_I:

+ case HEBREW_VISUAL:

+ case CZECH_CP852:

+ case MSFT_CP1253:

+ case RUSSIAN_CP866:

+ case ISO_8859_13:

+ case GBK:

+ case GB18030:

+ case BIG5_HKSCS:

+ case MACINTOSH_ROMAN:

+ return true;

+ default:

+ return false;

+ }

+// To be an 8-bit encoding means that there are fewer than 256 symbols.

+// Each byte determines a new character; there are no multi-byte sequences.

+// TODO: This list could maybe be expanded. Other encodings may be 8-bit.

+bool Is8BitEncoding(Encoding e) {

+ switch (e) {

+ case ASCII_7BIT:

+ case ISO_8859_1:

+ case ISO_8859_2:

+ case ISO_8859_3:

+ case ISO_8859_4:

+ case ISO_8859_5:

+ case ISO_8859_6:

+ case ISO_8859_7:

+ case ISO_8859_8:

+ case ISO_8859_8_I:

+ case ISO_8859_9:

+ case ISO_8859_10:

+ case ISO_8859_11:

+ case ISO_8859_13:

+ case ISO_8859_15:

+ case MSFT_CP1252:

+ case MSFT_CP1253:

+ case MSFT_CP1254:

+ case MSFT_CP1255:

+ case MSFT_CP1256:

+ case MSFT_CP1257:

+ case RUSSIAN_KOI8_R:

+ case RUSSIAN_KOI8_RU:

+ case RUSSIAN_CP866:

+ return true;

+ default:

+ return false;

+ }

+bool IsCJKEncoding(Encoding e) {

+ switch (e) {

+ case JAPANESE_EUC_JP:

+ case JAPANESE_SHIFT_JIS:

+ case JAPANESE_JIS:

+ case CHINESE_BIG5:

+ case CHINESE_GB:

+ case CHINESE_EUC_CN:

+ case KOREAN_EUC_KR:

+ case CHINESE_EUC_DEC:

+ case CHINESE_CNS:

+ case CHINESE_BIG5_CP950:

+ case JAPANESE_CP932:

+ case ISO_2022_KR:

+ case GBK:

+ case GB18030:

+ case BIG5_HKSCS:

+ case ISO_2022_CN:

+ case HZ_GB_2312:

+ return true;

+ default:

+ return false;

+ }

+bool IsHebrewEncoding(Encoding e) {

+ return (e == ISO_8859_8 ||

+ e == ISO_8859_8_I ||

+ e == MSFT_CP1255 ||

+ e == HEBREW_VISUAL);

+bool IsRightToLeftEncoding(Encoding enc) {

+ switch (enc) {

+ case MSFT_CP1255:

+ case MSFT_CP1256:

+ case ARABIC_ENCODING:

+ case HEBREW_ENCODING:

+ case ISO_8859_8_I:

+ case HEBREW_VISUAL:

+ return true;

+ default:

+ return false;

+ }

+bool IsLogicalRightToLeftEncoding(Encoding enc) {

+ return IsRightToLeftEncoding(enc) && !IsVisualRightToLeftEncoding(enc);

+// Note that despite an RFC to the contrary, ARABIC_ENCODING (ISO-8859-6)

+// is NOT visual.

+bool IsVisualRightToLeftEncoding(Encoding enc) {

+ switch (enc) {

+ case HEBREW_ENCODING:

+ case HEBREW_VISUAL:

+ return true;

+ default:

+ return false;

+ }

+bool IsIso2022Encoding(Encoding enc) {

+ return (IsIso2022JpOrVariant(enc) ||

+ enc == ISO_2022_KR ||

+ enc == ISO_2022_CN);

+bool IsIso2022JpOrVariant(Encoding enc) {

+ return (enc == JAPANESE_JIS ||

+ enc == KDDI_ISO_2022_JP ||

+ enc == SOFTBANK_ISO_2022_JP);

+bool IsShiftJisOrVariant(Encoding enc) {

+ return (enc == JAPANESE_SHIFT_JIS ||

+ enc == JAPANESE_CP932 ||

+ enc == KDDI_SHIFT_JIS ||

+ enc == DOCOMO_SHIFT_JIS ||

+ enc == SOFTBANK_SHIFT_JIS);

+bool IsJapaneseCellPhoneCarrierSpecificEncoding(Encoding enc) {

+ return (enc == KDDI_ISO_2022_JP ||

+ enc == KDDI_SHIFT_JIS ||

+ enc == DOCOMO_SHIFT_JIS ||

+ enc == SOFTBANK_SHIFT_JIS ||

+ enc == SOFTBANK_ISO_2022_JP);

+// *************************************************************

+// ENCODING NAMES

+// EncodingName() [Encoding to name]

+// MimeEncodingName() [Encoding to name]

+// EncodingFromName() [name to Encoding]

+// EncodingNameAliasToEncoding() [name to Encoding]

+// default_encoding_name()

+// invalid_encoding_name()

+// *************************************************************

-// We do not use it, just to please a compiler and minimize ported

-// code changes.

const char * EncodingName(const Encoding enc) {

- return "";

+ if ( (enc < 0) || (enc >= kNumEncodings) )

+ return invalid_encoding_name();

+ return kEncodingInfoTable[enc].encoding_name_;

+// TODO: Unify MimeEncodingName and EncodingName, or determine why

+// such a unification is not possible.

+const char * MimeEncodingName(Encoding enc) {

+ if ( (enc < 0) || (enc >= kNumEncodings) )

+ return ""; // TODO(jrm) Should this be invalid_encoding_name()?

+ return kEncodingInfoTable[enc].mime_encoding_name_;

+bool EncodingFromName(const char* enc_name, Encoding *encoding) {

+ *encoding = UNKNOWN_ENCODING;

+ if ( enc_name == NULL ) return false;

+ for ( int i = 0; i < kNumEncodings; i++ ) {

+ if ( !strcasecmp(enc_name, kEncodingInfoTable[i].encoding_name_) ) {

+ *encoding = static_cast<Encoding>(i);

+ return true;

+ }

+ return false;

+#if 0

+// The encoding_map maps standard and non-standard encoding-names

+// (strings) to Encoding enums. It is used only by

+// EncodingNameAliasToEncoding. Note that the map uses

+// case-insensitive hash and comparison functions.

+typedef hash_map <const char *, Encoding,

+ CStringAlnumCaseHash,

+ CStringAlnumCaseEqual> EncodingMap;

+static EncodingMap encoding_map;

+// Mutex for locking the code that initializes encoding_map.

+// static Mutex encodings_init_mutex(base::LINKER_INITIALIZED);

+void InitEncodings() {

+ // For thread safety, keep a mutex while initializing this map.

+ // Also allow this function to be called more than once and

+ // gracefully exiting if that occurs.

+ // MutexLock lock(&encodings_init_mutex);

+ if (!encoding_map.empty()) {

+ // Already initialized

+ return;

+ }

+ // Initialize the map with all the "standard" encoding names,

+ // i.e., the ones returned by EncodingName and MimeEncodingName.

+ //

+ // First, add internal encoding names returned by EncodingName().

+ for (int i = 0; i < NUM_ENCODINGS; ++i) {

+ Encoding e = static_cast<Encoding>(i);

+ // Internal encoding names must be unique.

+ // The internal names are guaranteed to be unique by the CHECK_EQ.

+ const char *encoding_name = EncodingName(e);

+ CHECK_EQ(0, encoding_map.count(encoding_name))

+ << "Duplicate found for " << encoding_name;

+ encoding_map[encoding_name] = e;

+ }

+ // Then, add mime encoding names returned by MimeEncodingName().

+ // We don't override existing entries, to give precedence to entries

+ // added earlier.

+ for (int i = 0; i < NUM_ENCODINGS; ++i) {

+ Encoding e = static_cast<Encoding>(i);

+ // Note that MimeEncodingName() can return the same mime encoding

+ // name for different encoding enums like JAPANESE_SHIFT_JIS and

+ // KDDI_SHIFT_JIS. In that case, the encoding enum first seen

+ // will be the value for the encoding name in the map.

+ const char *mime_encoding_name = MimeEncodingName(e);

+ if (encoding_map.count(mime_encoding_name) == 0) {

+ encoding_map[mime_encoding_name] = e;

+ }

+ // Add some non-standard names: alternate spellings, common typos,

+ // etc. (It does no harm to add names already in the map.) Note

+ // that although the map is case-insensitive, by convention the

+ // keys are written here in lower case. For ease of maintenance,

+ // they are listed in alphabetical order.

+ encoding_map["5601"] = KOREAN_EUC_KR;

+ encoding_map["646"] = ASCII_7BIT;

+ encoding_map["852"] = CZECH_CP852;

+ encoding_map["866"] = RUSSIAN_CP866;

+ encoding_map["8859-1"] = ISO_8859_1;

+ encoding_map["ansi-1251"] = RUSSIAN_CP1251;

+ encoding_map["ansi_x3.4-1968"] = ASCII_7BIT;

+ encoding_map["arabic"] = ISO_8859_6;

+ encoding_map["ascii"] = ISO_8859_1;

+ encoding_map["ascii-7-bit"] = ASCII_7BIT; // not iana standard

+ encoding_map["asmo-708"] = ISO_8859_6;

+ encoding_map["bhaskar"] = BHASKAR;

+ encoding_map["big5"] = CHINESE_BIG5;

+ encoding_map["big5-cp950"] = CHINESE_BIG5_CP950; // not iana standard

+ encoding_map["big5-hkscs"] = BIG5_HKSCS;

+ encoding_map["chinese"] = CHINESE_GB;

+ encoding_map["cns"] = CHINESE_CNS; // not iana standard

+ encoding_map["cns11643"] = CHINESE_CNS;

+ encoding_map["cp1250"] = MSFT_CP1250; // not iana standard

+ encoding_map["cp1251"] = RUSSIAN_CP1251; // not iana standard

+ encoding_map["cp1252"] = MSFT_CP1252; // not iana standard

+ encoding_map["cp1253"] = MSFT_CP1253; // not iana standard

+ encoding_map["cp1254"] = MSFT_CP1254; // not iana standard

+ encoding_map["cp1255"] = MSFT_CP1255;

+ encoding_map["cp1256"] = MSFT_CP1256;

+ encoding_map["cp1257"] = MSFT_CP1257; // not iana standard

+ encoding_map["cp819"] = ISO_8859_1;

+ encoding_map["cp852"] = CZECH_CP852;

+ encoding_map["cp866"] = RUSSIAN_CP866;

+ encoding_map["cp-866"] = RUSSIAN_CP866;

+ encoding_map["cp874"] = MSFT_CP874;

+ encoding_map["cp932"] = JAPANESE_CP932; // not iana standard

+ encoding_map["cp950"] = CHINESE_BIG5_CP950; // not iana standard

+ encoding_map["csbig5"] = CHINESE_BIG5;

+ encoding_map["cseucjpkdfmtjapanese"] = JAPANESE_EUC_JP;

+ encoding_map["cseuckr"] = KOREAN_EUC_KR;

+ encoding_map["csgb2312"] = CHINESE_GB;

+ encoding_map["csibm852"] = CZECH_CP852;

+ encoding_map["csibm866"] = RUSSIAN_CP866;

+ encoding_map["csiso2022jp"] = JAPANESE_JIS;

+ encoding_map["csiso2022kr"] = ISO_2022_KR;

+ encoding_map["csiso58gb231280"] = CHINESE_GB;

+ encoding_map["csiso88598i"] = ISO_8859_8_I;

+ encoding_map["csisolatin1"] = ISO_8859_1;

+ encoding_map["csisolatin2"] = ISO_8859_2;

+ encoding_map["csisolatin3"] = ISO_8859_3;

+ encoding_map["csisolatin4"] = ISO_8859_4;

+ encoding_map["csisolatin5"] = ISO_8859_9;

+ encoding_map["csisolatin6"] = ISO_8859_10;

+ encoding_map["csisolatinarabic"] = ISO_8859_6;

+ encoding_map["csisolatincyrillic"] = ISO_8859_5;

+ encoding_map["csisolatingreek"] = ISO_8859_7;

+ encoding_map["csisolatinhebrew"] = ISO_8859_8;

+ encoding_map["csksc56011987"] = KOREAN_EUC_KR;

+ encoding_map["csmacintosh"] = MACINTOSH_ROMAN;

+ encoding_map["csn-369103"] = CZECH_CSN_369103;

+ encoding_map["csshiftjis"] = JAPANESE_SHIFT_JIS;

+ encoding_map["csunicode"] = UTF16BE;

+ encoding_map["csunicode11"] = UTF16BE;

+ encoding_map["csunicode11utf7"] = UTF7;

+ encoding_map["csunicodeascii"] = UTF16BE;

+ encoding_map["csunicodelatin1"] = UTF16BE;

+ encoding_map["cyrillic"] = ISO_8859_5;

+ encoding_map["ecma-114"] = ISO_8859_6;

+ encoding_map["ecma-118"] = ISO_8859_7;

+ encoding_map["elot_928"] = ISO_8859_7;

+ encoding_map["euc"] = CHINESE_EUC_DEC; // not iana standard

+ encoding_map["euc-cn"] = CHINESE_EUC_CN; // not iana standard

+ encoding_map["euc-dec"] = CHINESE_EUC_DEC; // not iana standard

+ encoding_map["euc-jp"] = JAPANESE_EUC_JP;

+ encoding_map["euc-kr"] = KOREAN_EUC_KR;

+ encoding_map["eucgb2312_cn"] = CHINESE_GB;

+ encoding_map["gb"] = CHINESE_GB; // not iana standard

+ encoding_map["gb18030"] = GB18030;

+ encoding_map["gb2132"] = CHINESE_GB; // common typo

+ encoding_map["gb2312"] = CHINESE_GB;

+ encoding_map["gb_2312-80"] = CHINESE_GB;

+ encoding_map["gbk"] = GBK;

+ encoding_map["greek"] = ISO_8859_7;

+ encoding_map["greek8"] = ISO_8859_7;

+ encoding_map["hebrew"] = ISO_8859_8;

+ encoding_map["htchanakya"] = HTCHANAKYA;

+ encoding_map["hz-gb-2312"] = HZ_GB_2312;

+ encoding_map["ibm819"] = ISO_8859_1;

+ encoding_map["ibm852"] = CZECH_CP852;

+ encoding_map["ibm874"] = MSFT_CP874;

+ encoding_map["iso-10646"] = UTF16BE;

+ encoding_map["iso-10646-j-1"] = UTF16BE;

+ encoding_map["iso-10646-ucs-2"] = UNICODE;

+ encoding_map["iso-10646-ucs-4"] = UTF32BE;

+ encoding_map["iso-10646-ucs-basic"] = UTF16BE;

+ encoding_map["iso-10646-unicode-latin1"] = UTF16BE;

+ encoding_map["iso-2022-cn"] = ISO_2022_CN;

+ encoding_map["iso-2022-jp"] = JAPANESE_JIS;

+ encoding_map["iso-2022-kr"] = ISO_2022_KR;

+ encoding_map["iso-8559-1"] = ISO_8859_1; // common typo

+ encoding_map["iso-874"] = MSFT_CP874;

+ encoding_map["iso-8858-1"] = ISO_8859_1; // common typo

+ // iso-8859-0 was a temporary name, eventually renamed iso-8859-15

+ encoding_map["iso-8859-0"] = ISO_8859_15;

+ encoding_map["iso-8859-1"] = ISO_8859_1;

+ encoding_map["iso-8859-10"] = ISO_8859_10;

+ encoding_map["iso-8859-11"] = ISO_8859_11;

+ encoding_map["iso-8859-13"] = ISO_8859_13;

+ encoding_map["iso-8859-15"] = ISO_8859_15;

+ encoding_map["iso-8859-2"] = ISO_8859_2;

+ encoding_map["iso-8859-3"] = ISO_8859_3;

+ encoding_map["iso-8859-4"] = ISO_8859_4;

+ encoding_map["iso-8859-5"] = ISO_8859_5;

+ encoding_map["iso-8859-6"] = ISO_8859_6;

+ encoding_map["iso-8859-7"] = ISO_8859_7;

+ encoding_map["iso-8859-8"] = ISO_8859_8;

+ encoding_map["iso-8859-8-i"] = ISO_8859_8_I;

+ encoding_map["iso-8859-9"] = ISO_8859_9;

+ encoding_map["iso-9959-1"] = ISO_8859_1; // common typo

+ encoding_map["iso-ir-100"] = ISO_8859_1;

+ encoding_map["iso-ir-101"] = ISO_8859_2;

+ encoding_map["iso-ir-109"] = ISO_8859_3;

+ encoding_map["iso-ir-110"] = ISO_8859_4;

+ encoding_map["iso-ir-126"] = ISO_8859_7;

+ encoding_map["iso-ir-127"] = ISO_8859_6;

+ encoding_map["iso-ir-138"] = ISO_8859_8;

+ encoding_map["iso-ir-144"] = ISO_8859_5;

+ encoding_map["iso-ir-148"] = ISO_8859_9;

+ encoding_map["iso-ir-149"] = KOREAN_EUC_KR;

+ encoding_map["iso-ir-157"] = ISO_8859_10;

+ encoding_map["iso-ir-58"] = CHINESE_GB;

+ encoding_map["iso-latin-1"] = ISO_8859_1;

+ encoding_map["iso_2022-cn"] = ISO_2022_CN;

+ encoding_map["iso_2022-kr"] = ISO_2022_KR;

+ encoding_map["iso_8859-1"] = ISO_8859_1;

+ encoding_map["iso_8859-10:1992"] = ISO_8859_10;

+ encoding_map["iso_8859-11"] = ISO_8859_11;

+ encoding_map["iso_8859-13"] = ISO_8859_13;

+ encoding_map["iso_8859-15"] = ISO_8859_15;

+ encoding_map["iso_8859-1:1987"] = ISO_8859_1;

+ encoding_map["iso_8859-2"] = ISO_8859_2;

+ encoding_map["iso_8859-2:1987"] = ISO_8859_2;

+ encoding_map["iso_8859-3"] = ISO_8859_3;

+ encoding_map["iso_8859-3:1988"] = ISO_8859_3;

+ encoding_map["iso_8859-4"] = ISO_8859_4;

+ encoding_map["iso_8859-4:1988"] = ISO_8859_4;

+ encoding_map["iso_8859-5"] = ISO_8859_5;

+ encoding_map["iso_8859-5:1988"] = ISO_8859_5;

+ encoding_map["iso_8859-6"] = ISO_8859_6;

+ encoding_map["iso_8859-6:1987"] = ISO_8859_6;

+ encoding_map["iso_8859-7"] = ISO_8859_7;

+ encoding_map["iso_8859-7:1987"] = ISO_8859_7;

+ encoding_map["iso_8859-8"] = ISO_8859_8;

+ encoding_map["iso_8859-8:1988:"] = ISO_8859_8;

+ encoding_map["iso_8859-9"] = ISO_8859_9;

+ encoding_map["iso_8859-9:1989"] = ISO_8859_9;

+ encoding_map["jagran"] = JAGRAN;

+ encoding_map["jis"] = JAPANESE_JIS; // not iana standard

+ encoding_map["koi8-cs"] = CZECH_CSN_369103;

+ encoding_map["koi8-r"] = RUSSIAN_KOI8_R;

+ encoding_map["koi8-ru"] = RUSSIAN_KOI8_RU; // not iana standard

+ encoding_map["koi8-u"] = RUSSIAN_KOI8_RU;

+ encoding_map["koi8r"] = RUSSIAN_KOI8_R; // not iana standard

+ encoding_map["koi8u"] = RUSSIAN_KOI8_RU; // not iana standard

+ encoding_map["korean"] = KOREAN_EUC_KR; // i assume this is what is meant

+ encoding_map["ks-c-5601"] = KOREAN_EUC_KR; // not iana standard

+ encoding_map["ks-c-5601-1987"] = KOREAN_EUC_KR; // not iana standard

+ encoding_map["ks_c_5601-1989"] = KOREAN_EUC_KR;

+ encoding_map["ksc"] = KOREAN_EUC_KR; // not iana standard

+ encoding_map["l1"] = ISO_8859_1;

+ encoding_map["l2"] = ISO_8859_2;

+ encoding_map["l3"] = ISO_8859_3;

+ encoding_map["l4"] = ISO_8859_4;

+ encoding_map["l5"] = ISO_8859_9;

+ encoding_map["l6"] = ISO_8859_10;

+ encoding_map["latin-1"] = ISO_8859_1; // not iana standard

+ encoding_map["latin1"] = ISO_8859_1;

+ encoding_map["latin2"] = ISO_8859_2;

+ encoding_map["latin3"] = ISO_8859_3;

+ encoding_map["latin4"] = ISO_8859_4;

+ encoding_map["latin5"] = ISO_8859_9;

+ encoding_map["latin6"] = ISO_8859_10;

+ encoding_map["mac"] = MACINTOSH_ROMAN;

+ encoding_map["macintosh"] = MACINTOSH_ROMAN;

+ encoding_map["macintosh-roman"] = MACINTOSH_ROMAN;

+ encoding_map["ms932"] = JAPANESE_CP932; // not iana standard

+ encoding_map["ms_kanji"] = JAPANESE_CP932;

+ encoding_map["shift-jis"] = JAPANESE_SHIFT_JIS;

+ encoding_map["shift_jis"] = JAPANESE_SHIFT_JIS;

+ encoding_map["sjis"] = JAPANESE_SHIFT_JIS; // not iana standard

+ encoding_map["sjs"] = JAPANESE_SHIFT_JIS; // not iana standard

+ encoding_map["sun_eu_greek"] = ISO_8859_7;

+ encoding_map["tab"] = TAMIL_BI;

+ encoding_map["tam"] = TAMIL_MONO;

+ encoding_map["tis-620"] = ISO_8859_11;

+ encoding_map["tscii"] = TSCII;

+ encoding_map["un"] = UNKNOWN_ENCODING; // not iana standard

+ encoding_map["unicode"] = UNICODE; // not iana standard

+ encoding_map["unicode-1-1-utf-7"] = UTF7;

+ encoding_map["unicode-1-1-utf-8"] = UTF8;

+ encoding_map["unicode-2-0-utf-7"] = UTF7;

+ encoding_map["unknown"] = UNKNOWN_ENCODING; // not iana standard

+ encoding_map["us"] = ISO_8859_1;

+ encoding_map["us-ascii"] = ISO_8859_1;

+ encoding_map["utf-16be"] = UTF16BE;

+ encoding_map["utf-16le"] = UTF16LE;

+ encoding_map["utf-32be"] = UTF32BE;

+ encoding_map["utf-32le"] = UTF32LE;

+ encoding_map["utf-7"] = UTF7;

+ encoding_map["utf-8"] = UTF8;

+ encoding_map["utf7"] = UTF7;

+ encoding_map["utf8"] = UTF8; // not iana standard

+ encoding_map["visual"] = HEBREW_VISUAL;

+ encoding_map["win-1250"] = MSFT_CP1250; // not iana standard

+ encoding_map["win-1251"] = RUSSIAN_CP1251; // not iana standard

+ encoding_map["window-874"] = MSFT_CP874;

+ encoding_map["windows-1250"] = MSFT_CP1250;

+ encoding_map["windows-1251"] = RUSSIAN_CP1251;

+ encoding_map["windows-1252"] = MSFT_CP1252;

+ encoding_map["windows-1253"] = MSFT_CP1253;

+ encoding_map["windows-1254"] = MSFT_CP1254;

+ encoding_map["windows-1255"] = MSFT_CP1255;

+ encoding_map["windows-1256"] = MSFT_CP1256;

+ encoding_map["windows-1257"] = MSFT_CP1257;

+ encoding_map["windows-31j"] = JAPANESE_CP932;

+ encoding_map["windows-874"] = MSFT_CP874;

+ encoding_map["windows-936"] = GBK;

+ encoding_map["x-big5"] = CHINESE_BIG5;

+ encoding_map["x-binaryenc"] = BINARYENC; // not iana standard

+ encoding_map["x-cp1250"] = MSFT_CP1250;

+ encoding_map["x-cp1251"] = RUSSIAN_CP1251;

+ encoding_map["x-cp1252"] = MSFT_CP1252;

+ encoding_map["x-cp1253"] = MSFT_CP1253;

+ encoding_map["x-cp1254"] = MSFT_CP1254;

+ encoding_map["x-cp1255"] = MSFT_CP1255;

+ encoding_map["x-cp1256"] = MSFT_CP1256;

+ encoding_map["x-cp1257"] = MSFT_CP1257;

+ encoding_map["x-euc-jp"] = JAPANESE_EUC_JP;

+ encoding_map["x-euc-tw"] = CHINESE_CNS;

+ encoding_map["x-gbk"] = GBK;

+ encoding_map["x-iso-10646-ucs-2-be"] = UTF16BE;

+ encoding_map["x-iso-10646-ucs-2-le"] = UTF16LE;

+ encoding_map["x-iso-10646-ucs-4-be"] = UTF32BE;

+ encoding_map["x-iso-10646-ucs-4-le"] = UTF32LE;

+ encoding_map["x-jis"] = JAPANESE_JIS; // not iana standard

+ encoding_map["x-mac-roman"] = MACINTOSH_ROMAN;

+ encoding_map["x-shift_jis"] = JAPANESE_SHIFT_JIS; // not iana standard

+ encoding_map["x-sjis"] = JAPANESE_SHIFT_JIS;

+ encoding_map["x-unicode-2-0-utf-7"] = UTF7;

+ encoding_map["x-utf8utf8"] = UTF8UTF8; // not iana standard

+ encoding_map["x-x-big5"] = CHINESE_BIG5;

+ encoding_map["zh_cn.euc"] = CHINESE_GB;

+ encoding_map["zh_tw-big5"] = CHINESE_BIG5;

+ encoding_map["zh_tw-euc"] = CHINESE_CNS;

+ // Remove they entry for the empty string, if any.

+ encoding_map.erase("");

+REGISTER_MODULE_INITIALIZER(encodings, {

+ InitEncodings();

+});

+// ----------------------------------------------------------------------

+// EncodingNameAliasToEncoding()

+//

+// This function takes an encoding name/alias and returns the Encoding

+// enum. The input is case insensitive. It is the union of the common

+// IANA standard names, the charset names used in Netscape Navigator,

+// and some common names we have been using.

+// See: http://www.iana.org/assignments/character-sets

+// http://physics.hallym.ac.kr/resource/relnotes/windows-2.0.html

+//

+// UNKNOWN_ENCODING is returned if none matches.

+//

+// TODO: Check if it is possible to remove the non-standard,

+// non-netscape-use names. It is because this routine is used for

+// encoding detections from html meta info. Non-standard names may

+// introduce noise on encoding detection.

+//

+// TODO: Unify EncodingNameAliasToEncoding and EncodingFromName,

+// or determine why such a unification is not possible.

+// ----------------------------------------------------------------------

+Encoding EncodingNameAliasToEncoding(const char *encoding_name) {

+ if (!encoding_name) {

+ return UNKNOWN_ENCODING;

+ }

+ // The map is initialized during InitGoogle() in a thread-safe manner.

+ CHECK(!encoding_map.empty()) << ": Must call InitGoogle()";

+ EncodingMap::iterator emi = encoding_map.find(encoding_name);

+ if (emi != encoding_map.end()) {

+ return emi->second;

+ } else {

+ return UNKNOWN_ENCODING;

+ }

+#endif

+const char* default_encoding_name() {

+ return kEncodingInfoTable[LATIN1].encoding_name_;

+static const char* const kInvalidEncodingName = "invalid_encoding";

+const char *invalid_encoding_name() {

+ return kInvalidEncodingName;

+// *************************************************************

+// Miscellany

+// *************************************************************

+Encoding PreferredWebOutputEncoding(Encoding enc) {

+ return IsValidEncoding(enc)

+ ? kEncodingInfoTable[enc].preferred_web_output_encoding_

+ : UTF8;

}

« no previous file with comments | « third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc ('k') | tools/run-perf-test.cfg » ('j') | no next file with comments »