| Index: base/lang_enc.h
|
| diff --git a/base/lang_enc.h b/base/lang_enc.h
|
| deleted file mode 100644
|
| index 9da8c0351f129a05b90b474da404d7972d4795eb..0000000000000000000000000000000000000000
|
| --- a/base/lang_enc.h
|
| +++ /dev/null
|
| @@ -1,299 +0,0 @@
|
| -// Copyright 2004-2009 Google Inc.
|
| -//
|
| -// Licensed under the Apache License, Version 2.0 (the "License");
|
| -// you may not use this file except in compliance with the License.
|
| -// You may obtain a copy of the License at
|
| -//
|
| -// http://www.apache.org/licenses/LICENSE-2.0
|
| -//
|
| -// Unless required by applicable law or agreed to in writing, software
|
| -// distributed under the License is distributed on an "AS IS" BASIS,
|
| -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| -// See the License for the specific language governing permissions and
|
| -// limitations under the License.
|
| -// ========================================================================
|
| -//
|
| -// This file is for i18n. It contains two enums, namely Language and
|
| -// Encoding, where Language is the linguistic convention, and Encoding
|
| -// contains information on both language encoding and character set.
|
| -//
|
| -// The language and encoding are both based on Teragram's conventions,
|
| -// except for some common ISO-8859 encodings that are not detected by
|
| -// Teragram but might be in the future.
|
| -//
|
| -// This file also includes functions that do mappings among
|
| -// Language/Encoding enums, language/encoding string names (typically
|
| -// the output from Language Encoding identifier), and language codes
|
| -// (iso 639), and two-letter country codes (iso 3166)
|
| -//
|
| -// NOTE: Both Language and Encoding enums should always start from
|
| -// zero value. This assumption has been made and used.
|
| -
|
| -#ifndef OMAHA_BASE_LANG_ENC_H_
|
| -#define OMAHA_BASE_LANG_ENC_H_
|
| -
|
| -#include <windows.h>
|
| -
|
| -// some of the popular encoding aliases
|
| -#define LATIN1 ISO_8859_1
|
| -#define LATIN2 ISO_8859_2
|
| -#define LATIN3 ISO_8859_3
|
| -#define LATIN4 ISO_8859_4
|
| -#define CYRILLIC ISO_8859_5
|
| -#define ARABIC_ENCODING ISO_8859_6 // avoiding the same name as language
|
| -#define GREEK_ENCODING ISO_8859_7 // avoiding the same name as language
|
| -#define HEBREW_ENCODING ISO_8859_8 // avoiding the same name as language
|
| -#define LATIN5 ISO_8859_9
|
| -#define LATIN6 ISO_8859_10
|
| -#define KOREAN_HANGUL KOREAN_EUC_KR
|
| -
|
| -// NOTE: Only add new languages to the end of this list (but before
|
| -// NUM_LANGUAGES).
|
| -enum Language {
|
| - ENGLISH = 0, /* 0 */
|
| - DANISH, /* 1 */
|
| - DUTCH, /* 2 */
|
| - FINNISH, /* 3 */
|
| - FRENCH, /* 4 */
|
| - GERMAN, /* 5 */
|
| - HEBREW, /* 6 */
|
| - ITALIAN, /* 7 */
|
| - JAPANESE, /* 8 */
|
| - KOREAN, /* 9 */
|
| - NORWEGIAN, /* 10 */
|
| - POLISH, /* 11 */
|
| - PORTUGUESE, /* 12 */
|
| - RUSSIAN, /* 13 */
|
| - SPANISH, /* 14 */
|
| - SWEDISH, /* 15 */
|
| - CHINESE, /* 16 */
|
| - CZECH, /* 17 */
|
| - GREEK, /* 18 */
|
| - ICELANDIC, /* 19 */
|
| - LATVIAN, /* 20 */
|
| - LITHUANIAN, /* 21 */
|
| - ROMANIAN, /* 22 */
|
| - HUNGARIAN, /* 23 */
|
| - ESTONIAN, /* 24 */
|
| - TG_UNKNOWN_LANGUAGE, /* 25 */
|
| - UNKNOWN_LANGUAGE, /* 26 */
|
| - BULGARIAN, /* 27 */
|
| - CROATIAN, /* 28 */
|
| - SERBIAN, /* 29 */
|
| - IRISH, /* 30 */
|
| - GALICIAN, /* 31 */
|
| - TAGALOG, /* 32 */
|
| - TURKISH, /* 33 */
|
| - UKRAINIAN, /* 34 */
|
| - HINDI, /* 35 */
|
| - MACEDONIAN, /* 36 */
|
| - BENGALI, /* 37 */
|
| - INDONESIAN, /* 38 */
|
| - LATIN, /* 39 */
|
| - MALAY, /* 40 */
|
| - MALAYALAM, /* 41 */
|
| - WELSH, /* 42 */
|
| - NEPALI, /* 43 */
|
| - TELUGU, /* 44 */
|
| - ALBANIAN, /* 45 */
|
| - TAMIL, /* 46 */
|
| - BELARUSIAN, /* 47 */
|
| - JAVANESE, /* 48 */
|
| - OCCITAN, /* 49 */
|
| - URDU, /* 50 */
|
| - BIHARI, /* 51 */
|
| - GUJARATI, /* 52 */
|
| - THAI, /* 53 */
|
| - ARABIC, /* 54 */
|
| - CATALAN, /* 55 */
|
| - ESPERANTO, /* 56 */
|
| - BASQUE, /* 57 */
|
| - INTERLINGUA, /* 58 */
|
| - KANNADA, /* 59 */
|
| - PUNJABI, /* 60 */
|
| - SCOTS_GAELIC, /* 61 */
|
| - SWAHILI, /* 62 */
|
| - SLOVENIAN, /* 63 */
|
| - MARATHI, /* 64 */
|
| - MALTESE, /* 65 */
|
| - VIETNAMESE, /* 66 */
|
| - FRISIAN, /* 67 */
|
| - SLOVAK, /* 68 */
|
| - CHINESE_T, /* 69 */ // This is added to solve the problem of
|
| - // distinguishing Traditional and Simplified
|
| - // Chinese when the encoding is UTF8.
|
| - FAROESE, /* 70 */
|
| - SUNDANESE, /* 71 */
|
| - UZBEK, /* 72 */
|
| - AMHARIC, /* 73 */
|
| - AZERBAIJANI, /* 74 */
|
| - GEORGIAN, /* 75 */
|
| - TIGRINYA, /* 76 */
|
| - PERSIAN, /* 77 */
|
| - BOSNIAN, /* 78 */
|
| - SINHALESE, /* 79 */
|
| - NORWEGIAN_N, /* 80 */
|
| - PORTUGUESE_P, /* 81 */
|
| - PORTUGUESE_B, /* 82 */
|
| - XHOSA, /* 83 */
|
| - ZULU, /* 84 */
|
| - GUARANI, /* 85 */
|
| - SESOTHO, /* 86 */
|
| - TURKMEN, /* 87 */
|
| - KYRGYZ, /* 88 */
|
| - BRETON, /* 89 */
|
| - TWI, /* 90 */
|
| - YIDDISH, /* 91 */
|
| - ORIYA, /* 92 */
|
| - SERBO_CROATIAN, /* 93 */
|
| - SOMALI, /* 94 */
|
| - UIGHUR, /* 95 */
|
| - KURDISH, /* 96 */
|
| - MONGOLIAN, /* 97 */
|
| - ARMENIAN, /* 98 */
|
| - LAOTHIAN, /* 99 */
|
| - SINDHI, /* 100! */
|
| - RHAETO_ROMANCE, /* 101 */
|
| - CHINESE_JAPANESE_KOREAN, /* 103 */ // Not really a language
|
| - PSEUDOTRANSLATION, /* 104 */ // Not really a language
|
| - NUM_LANGUAGES, // Always keep this at the end. It is not a
|
| - // valid Language enum, it is only used to
|
| - // indicate the total number of Languages.
|
| -};
|
| -
|
| -
|
| -// Language codes for those languages we support, used to map to IDs from
|
| -// the Language enumeration. We could have used the Rfc1766ToLcid from the
|
| -// Win32 system's mlang.dll to map these to LCIDs, but a) we don't want to
|
| -// have to load mlang.dll and b) we are using our own language IDs.
|
| -const TCHAR* const kLangCodeChinesePrc = _T("zh_cn");
|
| -const TCHAR* const kLangCodeChineseTaiwan = _T("zh_tw");
|
| -const TCHAR* const kLangCodeCjk = _T("cjk");
|
| -const TCHAR* const kLangCodeDutch = _T("nl");
|
| -const TCHAR* const kLangCodeEnglish = _T("en");
|
| -const TCHAR* const kLangCodeFrench = _T("fr");
|
| -const TCHAR* const kLangCodeGerman = _T("de");
|
| -const TCHAR* const kLangCodeItalian = _T("it");
|
| -const TCHAR* const kLangCodeJapanese = _T("ja");
|
| -const TCHAR* const kLangCodeKorean = _T("ko");
|
| -const TCHAR* const kLangCodePseudo = _T("x");
|
| -const TCHAR* const kLangCodeSpanish = _T("es");
|
| -
|
| -
|
| -// Maps language codes to languages. Terminated by a { NULL, UNKNOWN_LANGUAGE }
|
| -// item.
|
| -struct CodeToLanguage {
|
| - const TCHAR* code;
|
| - Language language;
|
| -};
|
| -
|
| -SELECTANY CodeToLanguage codes_to_languages[] = {
|
| - { kLangCodeChinesePrc, CHINESE },
|
| - { kLangCodeChineseTaiwan, CHINESE_T },
|
| - { kLangCodeCjk, CHINESE_JAPANESE_KOREAN },
|
| - { kLangCodeDutch, DUTCH },
|
| - { kLangCodeEnglish, ENGLISH },
|
| - { kLangCodeFrench, FRENCH },
|
| - { kLangCodeGerman, GERMAN },
|
| - { kLangCodeItalian, ITALIAN },
|
| - { kLangCodeJapanese, JAPANESE },
|
| - { kLangCodeKorean, KOREAN },
|
| - { kLangCodePseudo, PSEUDOTRANSLATION },
|
| - { kLangCodeSpanish, SPANISH },
|
| - { NULL, UNKNOWN_LANGUAGE }
|
| -};
|
| -
|
| -
|
| -
|
| -// Macro to wrap the notion of "unknown language".
|
| -#define IS_LANGUAGE_UNKNOWN(l) \
|
| - ((l) == TG_UNKNOWN_LANGUAGE || (l) == UNKNOWN_LANGUAGE)
|
| -
|
| -// NOTE: Only add new encodings to the end of this list (but before
|
| -// NUM_ENCODINGS).
|
| -// NOTE: If you add an encoding here, you must also modify basistech_encoding()
|
| -// and google2/com/google/i18n/Encoding.java
|
| -enum Encoding {
|
| - ISO_8859_1 = 0, // 0: Teragram ASCII
|
| - ISO_8859_2, // 1: Teragram Latin2
|
| - ISO_8859_3, // 2: in BasisTech but not in Teragram
|
| - ISO_8859_4, // 3: Teragram Latin4
|
| - ISO_8859_5, // 4: Teragram ISO-8859-5
|
| - ISO_8859_6, // 5: Teragram Arabic
|
| - ISO_8859_7, // 6: Teragram Greek
|
| - ISO_8859_8, // 7: Teragram Hebrew
|
| - ISO_8859_9, // 8: in BasisTech but not in Teragram
|
| - ISO_8859_10, // 9: in BasisTech but not in Teragram
|
| - JAPANESE_EUC_JP, // 10: Teragram EUC_JP
|
| - JAPANESE_SHIFT_JIS, // 11: Teragram SJS
|
| - JAPANESE_JIS, // 12: Teragram JIS
|
| - CHINESE_BIG5, // 13: Teragram BIG5
|
| - CHINESE_GB, // 14: Teragram GB
|
| - CHINESE_EUC_CN, // 15: Teragram EUC-CN
|
| - KOREAN_EUC_KR, // 16: Teragram KSC
|
| - UNICODE_ENCODING, // 17: Teragram Unicode, changed to UNICODE_ENCODING
|
| - // from UNICODE, which is predefined by WINDOW
|
| - CHINESE_EUC_DEC, // 18: Teragram EUC
|
| - CHINESE_CNS, // 19: Teragram CNS
|
| - CHINESE_BIG5_CP950, // 20: Teragram BIG5_CP950
|
| - JAPANESE_CP932, // 21: Teragram CP932
|
| - UTF8, // 22
|
| - UNKNOWN_ENCODING, // 23
|
| - ASCII_7BIT, // 24: ISO_8859_1 with all characters <= 127.
|
| - // Should be present only in the crawler
|
| - // and in the repository,
|
| - // *never* as a result of Document::encoding().
|
| - RUSSIAN_KOI8_R, // 25: Teragram KOI8R
|
| - RUSSIAN_CP1251, // 26: Teragram CP1251
|
| -
|
| - //----------------------------------------------------------
|
| - // These are _not_ output from teragram. Instead, they are as
|
| - // detected in the headers of usenet articles.
|
| - MSFT_CP1252, // 27: CP1252 aka MSFT euro ascii
|
| - RUSSIAN_KOI8_RU, // 28: CP21866 aka KOI8_RU, used for Ukrainian
|
| - MSFT_CP1250, // 29: CP1250 aka MSFT eastern european
|
| - ISO_8859_15, // 30: aka ISO_8859_0 aka ISO_8859_1 euroized
|
| - //----------------------------------------------------------
|
| -
|
| - //----------------------------------------------------------
|
| - // These are in BasisTech but not in Teragram. They are
|
| - // needed for new interface languages. Now detected by
|
| - // research langid
|
| - MSFT_CP1254, // 31: used for Turkish
|
| - MSFT_CP1257, // 32: used in Baltic countries
|
| - //----------------------------------------------------------
|
| -
|
| - //----------------------------------------------------------
|
| - //----------------------------------------------------------
|
| - // New encodings detected by Teragram
|
| - ISO_8859_11, // 33: aka TIS-620, used for Thai
|
| - MSFT_CP874, // 34: used for Thai
|
| - MSFT_CP1256, // 35: used for Arabic
|
| -
|
| - //----------------------------------------------------------
|
| - // Detected as ISO_8859_8 by Teragram, but can be found in META tags
|
| - MSFT_CP1255, // 36: Logical Hebrew Microsoft
|
| - ISO_8859_8_I, // 37: Iso Hebrew Logical
|
| - HEBREW_VISUAL, // 38: Iso Hebrew Visual
|
| - //----------------------------------------------------------
|
| -
|
| - //----------------------------------------------------------
|
| - // Detected by research langid
|
| - CZECH_CP852, // 39
|
| - CZECH_CSN_369103, // 40: aka ISO_IR_139 aka KOI8_CS
|
| - MSFT_CP1253, // 41: used for Greek
|
| - RUSSIAN_CP866, // 42
|
| - //----------------------------------------------------------
|
| - HZ_ENCODING,
|
| - ISO2022_CN,
|
| - ISO2022_KR,
|
| -
|
| - NUM_ENCODINGS // Always keep this at the end. It is not a
|
| - // valid Encoding enum, it is only used to
|
| - // indicate the total number of Encodings.
|
| -};
|
| -
|
| -const int kNumLanguages = NUM_LANGUAGES;
|
| -const int kNumEncodings = NUM_ENCODINGS;
|
| -
|
| -#endif // OMAHA_BASE_LANG_ENC_H_
|
|
|