base/lang_enc.h - Issue 624713003: Keep only base/extractor.[cc|h].

Unified Diff: base/lang_enc.h

Issue 624713003: Keep only base/extractor.[cc|h]. (Closed) Base URL: https://chromium.googlesource.com/external/omaha.git@master

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: base/lang_enc.h

diff --git a/base/lang_enc.h b/base/lang_enc.h

deleted file mode 100644

index 9da8c0351f129a05b90b474da404d7972d4795eb..0000000000000000000000000000000000000000

--- a/base/lang_enc.h

+++ /dev/null

@@ -1,299 +0,0 @@

-//

-// Licensed under the Apache License, Version 2.0 (the "License");

-// you may not use this file except in compliance with the License.

-// You may obtain a copy of the License at

-//

-// http://www.apache.org/licenses/LICENSE-2.0

-//

-// Unless required by applicable law or agreed to in writing, software

-// distributed under the License is distributed on an "AS IS" BASIS,

-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

-// See the License for the specific language governing permissions and

-// limitations under the License.

-// ========================================================================

-//

-// This file is for i18n. It contains two enums, namely Language and

-// Encoding, where Language is the linguistic convention, and Encoding

-// contains information on both language encoding and character set.

-//

-// The language and encoding are both based on Teragram's conventions,

-// except for some common ISO-8859 encodings that are not detected by

-// Teragram but might be in the future.

-//

-// This file also includes functions that do mappings among

-// Language/Encoding enums, language/encoding string names (typically

-// the output from Language Encoding identifier), and language codes

-// (ISO 639), and two-letter country codes (ISO 3166).

-//

-// NOTE: Both Language and Encoding enums should always start from

-// zero value. This assumption has been made and used.

-#ifndef OMAHA_BASE_LANG_ENC_H_

-#define OMAHA_BASE_LANG_ENC_H_

-#include <windows.h>

-// some of the popular encoding aliases

-#define LATIN1 ISO_8859_1

-#define LATIN2 ISO_8859_2

-#define LATIN3 ISO_8859_3

-#define LATIN4 ISO_8859_4

-#define CYRILLIC ISO_8859_5

-#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language

-#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language

-#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language

-#define LATIN5 ISO_8859_9

-#define LATIN6 ISO_8859_10

-#define KOREAN_HANGUL KOREAN_EUC_KR

-// NOTE: Only add new languages to the end of this list (but before

-// NUM_LANGUAGES).

-enum Language {

- ENGLISH = 0, /* 0 */

- DANISH, /* 1 */

- DUTCH, /* 2 */

- FINNISH, /* 3 */

- FRENCH, /* 4 */

- GERMAN, /* 5 */

- HEBREW, /* 6 */

- ITALIAN, /* 7 */

- JAPANESE, /* 8 */

- KOREAN, /* 9 */

- NORWEGIAN, /* 10 */

- POLISH, /* 11 */

- PORTUGUESE, /* 12 */

- RUSSIAN, /* 13 */

- SPANISH, /* 14 */

- SWEDISH, /* 15 */

- CHINESE, /* 16 */

- CZECH, /* 17 */

- GREEK, /* 18 */

- ICELANDIC, /* 19 */

- LATVIAN, /* 20 */

- LITHUANIAN, /* 21 */

- ROMANIAN, /* 22 */

- HUNGARIAN, /* 23 */

- ESTONIAN, /* 24 */

- TG_UNKNOWN_LANGUAGE, /* 25 */

- UNKNOWN_LANGUAGE, /* 26 */

- BULGARIAN, /* 27 */

- CROATIAN, /* 28 */

- SERBIAN, /* 29 */

- IRISH, /* 30 */

- GALICIAN, /* 31 */

- TAGALOG, /* 32 */

- TURKISH, /* 33 */

- UKRAINIAN, /* 34 */

- HINDI, /* 35 */

- MACEDONIAN, /* 36 */

- BENGALI, /* 37 */

- INDONESIAN, /* 38 */

- LATIN, /* 39 */

- MALAY, /* 40 */

- MALAYALAM, /* 41 */

- WELSH, /* 42 */

- NEPALI, /* 43 */

- TELUGU, /* 44 */

- ALBANIAN, /* 45 */

- TAMIL, /* 46 */

- BELARUSIAN, /* 47 */

- JAVANESE, /* 48 */

- OCCITAN, /* 49 */

- URDU, /* 50 */

- BIHARI, /* 51 */

- GUJARATI, /* 52 */

- THAI, /* 53 */

- ARABIC, /* 54 */

- CATALAN, /* 55 */

- ESPERANTO, /* 56 */

- BASQUE, /* 57 */

- INTERLINGUA, /* 58 */

- KANNADA, /* 59 */

- PUNJABI, /* 60 */

- SCOTS_GAELIC, /* 61 */

- SWAHILI, /* 62 */

- SLOVENIAN, /* 63 */

- MARATHI, /* 64 */

- MALTESE, /* 65 */

- VIETNAMESE, /* 66 */

- FRISIAN, /* 67 */

- SLOVAK, /* 68 */

- CHINESE_T, /* 69 */ // This is added to solve the problem of

- // distinguishing Traditional and Simplified

- // Chinese when the encoding is UTF8.

- FAROESE, /* 70 */

- SUNDANESE, /* 71 */

- UZBEK, /* 72 */

- AMHARIC, /* 73 */

- AZERBAIJANI, /* 74 */

- GEORGIAN, /* 75 */

- TIGRINYA, /* 76 */

- PERSIAN, /* 77 */

- BOSNIAN, /* 78 */

- SINHALESE, /* 79 */

- NORWEGIAN_N, /* 80 */

- PORTUGUESE_P, /* 81 */

- PORTUGUESE_B, /* 82 */

- XHOSA, /* 83 */

- ZULU, /* 84 */

- GUARANI, /* 85 */

- SESOTHO, /* 86 */

- TURKMEN, /* 87 */

- KYRGYZ, /* 88 */

- BRETON, /* 89 */

- TWI, /* 90 */

- YIDDISH, /* 91 */

- ORIYA, /* 92 */

- SERBO_CROATIAN, /* 93 */

- SOMALI, /* 94 */

- UIGHUR, /* 95 */

- KURDISH, /* 96 */

- MONGOLIAN, /* 97 */

- ARMENIAN, /* 98 */

- LAOTHIAN, /* 99 */

- SINDHI, /* 100! */

- RHAETO_ROMANCE, /* 101 */

- CHINESE_JAPANESE_KOREAN, /* 102 */ // Not really a language

- PSEUDOTRANSLATION, /* 103 */ // Not really a language

- NUM_LANGUAGES, // Always keep this at the end. It is not a

- // valid Language enum, it is only used to

- // indicate the total number of Languages.

-};

-// Language codes for those languages we support, used to map to IDs from

-// the Language enumeration. We could have used the Rfc1766ToLcid from the

-// Win32 system's mlang.dll to map these to LCIDs, but a) we don't want to

-// have to load mlang.dll and b) we are using our own language IDs.

-const TCHAR* const kLangCodeChinesePrc = _T("zh_cn");

-const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw");

-const TCHAR* const kLangCodeCjk = _T("cjk");

-const TCHAR* const kLangCodeDutch = _T("nl");

-const TCHAR* const kLangCodeEnglish = _T("en");

-const TCHAR* const kLangCodeFrench = _T("fr");

-const TCHAR* const kLangCodeGerman = _T("de");

-const TCHAR* const kLangCodeItalian = _T("it");

-const TCHAR* const kLangCodeJapanese = _T("ja");

-const TCHAR* const kLangCodeKorean = _T("ko");

-const TCHAR* const kLangCodePseudo = _T("x");

-const TCHAR* const kLangCodeSpanish = _T("es");

-// Maps language codes to languages. Terminated by a { NULL, UNKNOWN_LANGUAGE }

-// item.

-struct CodeToLanguage {

- const TCHAR* code;

- Language language;

-};

-SELECTANY CodeToLanguage codes_to_languages[] = {

- { kLangCodeChinesePrc, CHINESE },

- { kLangCodeChineseTaiwan, CHINESE_T },

- { kLangCodeCjk, CHINESE_JAPANESE_KOREAN },

- { kLangCodeDutch, DUTCH },

- { kLangCodeEnglish, ENGLISH },

- { kLangCodeFrench, FRENCH },

- { kLangCodeGerman, GERMAN },

- { kLangCodeItalian, ITALIAN },

- { kLangCodeJapanese, JAPANESE },

- { kLangCodeKorean, KOREAN },

- { kLangCodePseudo, PSEUDOTRANSLATION },

- { kLangCodeSpanish, SPANISH },

- { NULL, UNKNOWN_LANGUAGE }

-};

-// Macro to wrap the notion of "unknown language".

-#define IS_LANGUAGE_UNKNOWN(l) \

- ((l) == TG_UNKNOWN_LANGUAGE || (l) == UNKNOWN_LANGUAGE)

-// NOTE: Only add new encodings to the end of this list (but before

-// NUM_ENCODINGS).

-// NOTE: If you add an encoding here, you must also modify basistech_encoding()

-// and google2/com/google/i18n/Encoding.java

-enum Encoding {

- ISO_8859_1 = 0, // 0: Teragram ASCII

- ISO_8859_2, // 1: Teragram Latin2

- ISO_8859_3, // 2: in BasisTech but not in Teragram

- ISO_8859_4, // 3: Teragram Latin4

- ISO_8859_5, // 4: Teragram ISO-8859-5

- ISO_8859_6, // 5: Teragram Arabic

- ISO_8859_7, // 6: Teragram Greek

- ISO_8859_8, // 7: Teragram Hebrew

- ISO_8859_9, // 8: in BasisTech but not in Teragram

- ISO_8859_10, // 9: in BasisTech but not in Teragram

- JAPANESE_EUC_JP, // 10: Teragram EUC_JP

- JAPANESE_SHIFT_JIS, // 11: Teragram SJS

- JAPANESE_JIS, // 12: Teragram JIS

- CHINESE_BIG5, // 13: Teragram BIG5

- CHINESE_GB, // 14: Teragram GB

- CHINESE_EUC_CN, // 15: Teragram EUC-CN

- KOREAN_EUC_KR, // 16: Teragram KSC

- UNICODE_ENCODING, // 17: Teragram Unicode, changed to UNICODE_ENCODING

- // from UNICODE, which is predefined by Windows

- CHINESE_EUC_DEC, // 18: Teragram EUC

- CHINESE_CNS, // 19: Teragram CNS

- CHINESE_BIG5_CP950, // 20: Teragram BIG5_CP950

- JAPANESE_CP932, // 21: Teragram CP932

- UTF8, // 22

- UNKNOWN_ENCODING, // 23

- ASCII_7BIT, // 24: ISO_8859_1 with all characters <= 127.

- // Should be present only in the crawler

- // and in the repository,

- // *never* as a result of Document::encoding().

- RUSSIAN_KOI8_R, // 25: Teragram KOI8R

- RUSSIAN_CP1251, // 26: Teragram CP1251

- //----------------------------------------------------------

- // These are _not_ output from teragram. Instead, they are as

- // detected in the headers of usenet articles.

- MSFT_CP1252, // 27: CP1252 aka MSFT euro ascii

- RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian

- MSFT_CP1250, // 29: CP1250 aka MSFT eastern european

- ISO_8859_15, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized

- //----------------------------------------------------------

- // These are in BasisTech but not in Teragram. They are

- // needed for new interface languages. Now detected by

- // research langid

- MSFT_CP1254, // 31: used for Turkish

- MSFT_CP1257, // 32: used in Baltic countries

- //----------------------------------------------------------

- // New encodings detected by Teragram

- ISO_8859_11, // 33: aka TIS-620, used for Thai

- MSFT_CP874, // 34: used for Thai

- MSFT_CP1256, // 35: used for Arabic

- //----------------------------------------------------------

- // Detected as ISO_8859_8 by Teragram, but can be found in META tags

- MSFT_CP1255, // 36: Logical Hebrew Microsoft

- ISO_8859_8_I, // 37: Iso Hebrew Logical

- HEBREW_VISUAL, // 38: Iso Hebrew Visual

- //----------------------------------------------------------

- // Detected by research langid

- CZECH_CP852, // 39

- CZECH_CSN_369103, // 40: aka ISO_IR_139 aka KOI8_CS

- MSFT_CP1253, // 41: used for Greek

- RUSSIAN_CP866, // 42

- //----------------------------------------------------------

- HZ_ENCODING, // 43

- ISO2022_CN, // 44

- ISO2022_KR, // 45

- NUM_ENCODINGS // Always keep this at the end. It is not a

- // valid Encoding enum, it is only used to

- // indicate the total number of Encodings.

-};

-const int kNumLanguages = NUM_LANGUAGES;

-const int kNumEncodings = NUM_ENCODINGS;

-#endif // OMAHA_BASE_LANG_ENC_H_

« no previous file with comments | « base/highres_timer_unittest.cc ('k') | base/localization.h » ('j') | no next file with comments »