Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1908)

Unified Diff: base/lang_enc.h

Issue 624713003: Keep only base/extractor.[cc|h]. (Closed) Base URL: https://chromium.googlesource.com/external/omaha.git@master
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « base/highres_timer_unittest.cc ('k') | base/localization.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: base/lang_enc.h
diff --git a/base/lang_enc.h b/base/lang_enc.h
deleted file mode 100644
index 9da8c0351f129a05b90b474da404d7972d4795eb..0000000000000000000000000000000000000000
--- a/base/lang_enc.h
+++ /dev/null
@@ -1,299 +0,0 @@
-// Copyright 2004-2009 Google Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-// ========================================================================
-//
-// This file is for i18n. It contains two enums, namely Language and
-// Encoding, where Language is the linguistic convention, and Encoding
-// contains information on both language encoding and character set.
-//
-// The language and encoding are both based on Teragram's conventions,
-// except for some common ISO-8859 encodings that are not detected by
-// Teragram but might be in the future.
-//
-// This file also includes functions that do mappings among
-// Language/Encoding enums, language/encoding string names (typically
-// the output from Language Encoding identifier), language codes
-// (ISO 639), and two-letter country codes (ISO 3166).
-//
-// NOTE: Both Language and Encoding enums should always start from
-// zero value. This assumption has been made and used.
-
-#ifndef OMAHA_BASE_LANG_ENC_H_
-#define OMAHA_BASE_LANG_ENC_H_
-
-#include <windows.h>
-
-// Popular alias names for some of the Encoding enum values.
-#define LATIN1 ISO_8859_1
-#define LATIN2 ISO_8859_2
-#define LATIN3 ISO_8859_3
-#define LATIN4 ISO_8859_4
-#define CYRILLIC ISO_8859_5
-#define ARABIC_ENCODING ISO_8859_6 // _ENCODING suffix: ARABIC is a Language
-#define GREEK_ENCODING ISO_8859_7 // _ENCODING suffix: GREEK is a Language
-#define HEBREW_ENCODING ISO_8859_8 // _ENCODING suffix: HEBREW is a Language
-#define LATIN5 ISO_8859_9
-#define LATIN6 ISO_8859_10
-#define KOREAN_HANGUL KOREAN_EUC_KR
-
-// NOTE: Only add new languages to the end of this list (but before
-// NUM_LANGUAGES).
-enum Language {
- ENGLISH = 0, /* 0 */
- DANISH, /* 1 */
- DUTCH, /* 2 */
- FINNISH, /* 3 */
- FRENCH, /* 4 */
- GERMAN, /* 5 */
- HEBREW, /* 6 */
- ITALIAN, /* 7 */
- JAPANESE, /* 8 */
- KOREAN, /* 9 */
- NORWEGIAN, /* 10 */
- POLISH, /* 11 */
- PORTUGUESE, /* 12 */
- RUSSIAN, /* 13 */
- SPANISH, /* 14 */
- SWEDISH, /* 15 */
- CHINESE, /* 16 */
- CZECH, /* 17 */
- GREEK, /* 18 */
- ICELANDIC, /* 19 */
- LATVIAN, /* 20 */
- LITHUANIAN, /* 21 */
- ROMANIAN, /* 22 */
- HUNGARIAN, /* 23 */
- ESTONIAN, /* 24 */
- TG_UNKNOWN_LANGUAGE, /* 25 */
- UNKNOWN_LANGUAGE, /* 26 */
- BULGARIAN, /* 27 */
- CROATIAN, /* 28 */
- SERBIAN, /* 29 */
- IRISH, /* 30 */
- GALICIAN, /* 31 */
- TAGALOG, /* 32 */
- TURKISH, /* 33 */
- UKRAINIAN, /* 34 */
- HINDI, /* 35 */
- MACEDONIAN, /* 36 */
- BENGALI, /* 37 */
- INDONESIAN, /* 38 */
- LATIN, /* 39 */
- MALAY, /* 40 */
- MALAYALAM, /* 41 */
- WELSH, /* 42 */
- NEPALI, /* 43 */
- TELUGU, /* 44 */
- ALBANIAN, /* 45 */
- TAMIL, /* 46 */
- BELARUSIAN, /* 47 */
- JAVANESE, /* 48 */
- OCCITAN, /* 49 */
- URDU, /* 50 */
- BIHARI, /* 51 */
- GUJARATI, /* 52 */
- THAI, /* 53 */
- ARABIC, /* 54 */
- CATALAN, /* 55 */
- ESPERANTO, /* 56 */
- BASQUE, /* 57 */
- INTERLINGUA, /* 58 */
- KANNADA, /* 59 */
- PUNJABI, /* 60 */
- SCOTS_GAELIC, /* 61 */
- SWAHILI, /* 62 */
- SLOVENIAN, /* 63 */
- MARATHI, /* 64 */
- MALTESE, /* 65 */
- VIETNAMESE, /* 66 */
- FRISIAN, /* 67 */
- SLOVAK, /* 68 */
- CHINESE_T, /* 69 */ // This is added to solve the problem of
- // distinguishing Traditional and Simplified
- // Chinese when the encoding is UTF8.
- FAROESE, /* 70 */
- SUNDANESE, /* 71 */
- UZBEK, /* 72 */
- AMHARIC, /* 73 */
- AZERBAIJANI, /* 74 */
- GEORGIAN, /* 75 */
- TIGRINYA, /* 76 */
- PERSIAN, /* 77 */
- BOSNIAN, /* 78 */
- SINHALESE, /* 79 */
- NORWEGIAN_N, /* 80 */
- PORTUGUESE_P, /* 81 */
- PORTUGUESE_B, /* 82 */
- XHOSA, /* 83 */
- ZULU, /* 84 */
- GUARANI, /* 85 */
- SESOTHO, /* 86 */
- TURKMEN, /* 87 */
- KYRGYZ, /* 88 */
- BRETON, /* 89 */
- TWI, /* 90 */
- YIDDISH, /* 91 */
- ORIYA, /* 92 */
- SERBO_CROATIAN, /* 93 */
- SOMALI, /* 94 */
- UIGHUR, /* 95 */
- KURDISH, /* 96 */
- MONGOLIAN, /* 97 */
- ARMENIAN, /* 98 */
- LAOTHIAN, /* 99 */
- SINDHI, /* 100 */
- RHAETO_ROMANCE, /* 101 */
- CHINESE_JAPANESE_KOREAN, /* 102 */ // Not really a language
- PSEUDOTRANSLATION, /* 103 */ // Not really a language
- NUM_LANGUAGES, // Always keep this at the end. It is not a
- // valid Language enum, it is only used to
- // indicate the total number of Languages.
-};
-
-
-// Language codes for those languages we support, used to map to IDs from
-// the Language enumeration. We could have used the Rfc1766ToLcid from the
-// Win32 system's mlang.dll to map these to LCIDs, but a) we don't want to
-// have to load mlang.dll and b) we are using our own language IDs.
-const TCHAR* const kLangCodeChinesePrc = _T("zh_cn"); // mapped to CHINESE
-const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw"); // mapped to CHINESE_T
-const TCHAR* const kLangCodeCjk = _T("cjk"); // mapped to CHINESE_JAPANESE_KOREAN
-const TCHAR* const kLangCodeDutch = _T("nl");
-const TCHAR* const kLangCodeEnglish = _T("en");
-const TCHAR* const kLangCodeFrench = _T("fr");
-const TCHAR* const kLangCodeGerman = _T("de");
-const TCHAR* const kLangCodeItalian = _T("it");
-const TCHAR* const kLangCodeJapanese = _T("ja");
-const TCHAR* const kLangCodeKorean = _T("ko");
-const TCHAR* const kLangCodePseudo = _T("x"); // mapped to PSEUDOTRANSLATION
-const TCHAR* const kLangCodeSpanish = _T("es");
-
-
-// One entry of a language-code lookup table. A table of these is terminated
-// by a { NULL, UNKNOWN_LANGUAGE } item.
-struct CodeToLanguage {
- const TCHAR* code; // language code string, e.g. kLangCodeEnglish
- Language language; // Language enum value paired with the code
-};
-
-SELECTANY CodeToLanguage codes_to_languages[] = { // NOTE(review): SELECTANY is not defined in this header; presumably __declspec(selectany) - confirm
- { kLangCodeChinesePrc, CHINESE },
- { kLangCodeChineseTaiwan, CHINESE_T },
- { kLangCodeCjk, CHINESE_JAPANESE_KOREAN },
- { kLangCodeDutch, DUTCH },
- { kLangCodeEnglish, ENGLISH },
- { kLangCodeFrench, FRENCH },
- { kLangCodeGerman, GERMAN },
- { kLangCodeItalian, ITALIAN },
- { kLangCodeJapanese, JAPANESE },
- { kLangCodeKorean, KOREAN },
- { kLangCodePseudo, PSEUDOTRANSLATION },
- { kLangCodeSpanish, SPANISH },
- { NULL, UNKNOWN_LANGUAGE } // end-of-table sentinel; must stay last
-};
-
-
-
-// True when l is TG_UNKNOWN_LANGUAGE or UNKNOWN_LANGUAGE. Evaluates l twice.
-#define IS_LANGUAGE_UNKNOWN(l) \
- ((l) == TG_UNKNOWN_LANGUAGE || (l) == UNKNOWN_LANGUAGE)
-
-// NOTE: Only add new encodings to the end of this list (but before
-// NUM_ENCODINGS).
-// NOTE: If you add an encoding here, you must also modify basistech_encoding()
-// and google2/com/google/i18n/Encoding.java
-enum Encoding {
- ISO_8859_1 = 0, // 0: Teragram ASCII
- ISO_8859_2, // 1: Teragram Latin2
- ISO_8859_3, // 2: in BasisTech but not in Teragram
- ISO_8859_4, // 3: Teragram Latin4
- ISO_8859_5, // 4: Teragram ISO-8859-5
- ISO_8859_6, // 5: Teragram Arabic
- ISO_8859_7, // 6: Teragram Greek
- ISO_8859_8, // 7: Teragram Hebrew
- ISO_8859_9, // 8: in BasisTech but not in Teragram
- ISO_8859_10, // 9: in BasisTech but not in Teragram
- JAPANESE_EUC_JP, // 10: Teragram EUC_JP
- JAPANESE_SHIFT_JIS, // 11: Teragram SJS
- JAPANESE_JIS, // 12: Teragram JIS
- CHINESE_BIG5, // 13: Teragram BIG5
- CHINESE_GB, // 14: Teragram GB
- CHINESE_EUC_CN, // 15: Teragram EUC-CN
- KOREAN_EUC_KR, // 16: Teragram KSC
- UNICODE_ENCODING, // 17: Teragram Unicode, changed to UNICODE_ENCODING
- // from UNICODE, which is predefined by Windows
- CHINESE_EUC_DEC, // 18: Teragram EUC
- CHINESE_CNS, // 19: Teragram CNS
- CHINESE_BIG5_CP950, // 20: Teragram BIG5_CP950
- JAPANESE_CP932, // 21: Teragram CP932
- UTF8, // 22
- UNKNOWN_ENCODING, // 23
- ASCII_7BIT, // 24: ISO_8859_1 with all characters <= 127.
- // Should be present only in the crawler
- // and in the repository,
- // *never* as a result of Document::encoding().
- RUSSIAN_KOI8_R, // 25: Teragram KOI8R
- RUSSIAN_CP1251, // 26: Teragram CP1251
-
- //----------------------------------------------------------
- // These are _not_ output from teragram. Instead, they are as
- // detected in the headers of usenet articles.
- MSFT_CP1252, // 27: CP1252 aka MSFT euro ascii
- RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian
- MSFT_CP1250, // 29: CP1250 aka MSFT eastern european
- ISO_8859_15, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized
- //----------------------------------------------------------
-
- //----------------------------------------------------------
- // These are in BasisTech but not in Teragram. They are
- // needed for new interface languages. Now detected by
- // research langid
- MSFT_CP1254, // 31: used for Turkish
- MSFT_CP1257, // 32: used in Baltic countries
- //----------------------------------------------------------
-
- //----------------------------------------------------------
- //----------------------------------------------------------
- // New encodings detected by Teragram
- ISO_8859_11, // 33: aka TIS-620, used for Thai
- MSFT_CP874, // 34: used for Thai
- MSFT_CP1256, // 35: used for Arabic
-
- //----------------------------------------------------------
- // Detected as ISO_8859_8 by Teragram, but can be found in META tags
- MSFT_CP1255, // 36: Logical Hebrew Microsoft
- ISO_8859_8_I, // 37: Iso Hebrew Logical
- HEBREW_VISUAL, // 38: Iso Hebrew Visual
- //----------------------------------------------------------
-
- //----------------------------------------------------------
- // Detected by research langid
- CZECH_CP852, // 39
- CZECH_CSN_369103, // 40: aka ISO_IR_139 aka KOI8_CS
- MSFT_CP1253, // 41: used for Greek
- RUSSIAN_CP866, // 42
- //----------------------------------------------------------
- HZ_ENCODING, // 43
- ISO2022_CN, // 44
- ISO2022_KR, // 45
-
- NUM_ENCODINGS // 46: Always keep this at the end. It is not a
- // valid Encoding enum, it is only used to
- // indicate the total number of Encodings.
-};
-
-const int kNumLanguages = NUM_LANGUAGES; // number of Language enumerators before NUM_LANGUAGES
-const int kNumEncodings = NUM_ENCODINGS; // number of Encoding enumerators before NUM_ENCODINGS
-
-#endif // OMAHA_BASE_LANG_ENC_H_
« no previous file with comments | « base/highres_timer_unittest.cc ('k') | base/localization.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698