Index: third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h |
=================================================================== |
--- third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h (revision 0) |
+++ third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h (revision 0) |
@@ -0,0 +1,255 @@ |
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+// This file is for i18n. It works with two enums, namely Language and |
+// Encoding (see the headers included below), where Language is the linguistic |
+// convention, and Encoding covers both language encoding and character set. |
+// |
+// The language and encoding are both based on Teragram's conventions, |
+// except for some common ISO-8859 encodings that are not detected by |
+// Teragram but might be in the future. |
+// |
+// This file also declares functions that do mappings among |
+// Language/Encoding enums, language/encoding string names (typically |
+// the output from Language Encoding identifier), language codes |
+// (ISO 639), and two-letter country codes (ISO 3166). |
+// |
+// NOTE: Both Language and Encoding enums should always start from |
+// zero value. This assumption has been made and used. |
+// |
+ |
+#ifndef I18N_ENCODINGS_LANG_ENC_H__ |
+#define I18N_ENCODINGS_LANG_ENC_H__ |
+ |
+#include "third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/public/encodings.h" |
+ |
+ |
+// EncodingsForLanguage |
+// -------------------- |
+// |
+// Given the language, returns a pointer to an array of encodings this |
+// language supports. Typically, the encs array has at least one |
+// element: UNKNOWN_ENCODING, which is always the last element of the |
+// array. The first encoding is the default encoding of the language. |
+// Return NULL if the input is invalid. |
+// |
+// Note: The output encoding array does not include ASCII_7BIT, UTF8 |
+// or UNICODE which are good for all languages. TODO: Find out whether |
+// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them |
+// as special cases. |
+// |
+const Encoding* EncodingsForLanguage(Language lang); |
+ |
+ |
+// DefaultEncodingForLanguage |
+// -------------------------- |
+// |
+// Given the language, returns the default encoding for the language |
+// via the argument encoding. |
+// |
+// The function returns true if the input lang is valid. Otherwise, |
+// false is returned, and encoding is set to UNKNOWN_ENCODING. |
+// |
+bool DefaultEncodingForLanguage(Language lang, |
+ Encoding *encoding); |
+ |
+// LanguagesForEncoding |
+// -------------------- |
+// |
+// Given the encoding, returns a pointer to an array of languages this |
+// encoding supports. Typically, the langs array has at least one |
+// element: UNKNOWN_LANGUAGE, which is always the last element of the |
+// array. The first language in the array is the most popular |
+// language for that encoding. NULL is returned if the input is |
+// invalid. |
+// |
+// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and |
+// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all |
+// the languages or to treat these three encodings as special cases. |
+// |
+// For other known encodings, ENGLISH is always included. This is |
+// because English (Latin) characters are included in each encoding. |
+// |
+const Language* LanguagesForEncoding(Encoding enc); |
+ |
+// DefaultLanguageForEncoding |
+// -------------------------- |
+// |
+// Given the encoding, returns the default language for that encoding |
+// via the argument language. |
+// |
+// The function returns true if the input enc is valid. Otherwise, |
+// false is returned, and language is set to UNKNOWN_LANGUAGE. |
+// |
+// Note, this function is more useful for the encodings that have only |
+// one corresponding language, e.g. shift_jis => Japanese. There are |
+// cases where multiple languages share the same encoding, for which the |
+// default language is an arbitrary choice among them. |
+// |
+bool DefaultLanguageForEncoding(Encoding enc, Language* language); |
+ |
+// |
+// IsLangEncCompatible |
+// ------------------- |
+// |
+// This function is to determine whether the input language and |
+// encoding are compatible. For example, FRENCH and LATIN1 are |
+// compatible, but FRENCH and GB are not. |
+// |
+// If either lang or enc is invalid, return false. |
+// If lang is unknown, return true. |
+// (e.g. we can detect a page's encoding as latin1 from metatag info, but |
+// cannot derive its language since more than one language is |
+// encoded in Latin1) |
+// If language is known, but encoding is unknown, return false. |
+// (returning true will do us no good since we cannot convert to UTF8 anyway) |
+// If enc is unicode or utf8, return true. |
+// Otherwise check if lang is supported by enc and enc supported by |
+// lang. |
+// |
+bool IsLangEncCompatible(Language lang, Encoding enc); |
+ |
+// |
+// DominantLanguageFromEncoding |
+// ---------------------------- |
+// |
+// This function determines whether there exists a dominant language for the |
+// input encoding. For example, the encoding GB has a dominant |
+// language (Chinese), but Latin1 does not. |
+// |
+// The word "dominant" is used here because English characters are |
+// included in each encoding. |
+// |
+// If there is no dominant language for the encoding, such as Latin1, |
+// UNKNOWN_LANGUAGE is returned. |
+// |
+Language DominantLanguageFromEncoding(Encoding enc); |
+ |
+// LanguageCode |
+// ------------------------ |
+// Given the Language and Encoding, return language code with dialects |
+// (>= 2 letters). Encoding is necessary to disambiguate between |
+// Simplified and Traditional Chinese. |
+// |
+// See the note on Chinese Language Codes in |
+// i18n/languages/public/languages.h |
+// for the details. |
+ |
+const char* LanguageCode(Language lang, Encoding enc); |
+ |
+// |
+// IsEncodingWithSupportedLanguage() |
+// --------------------------------- |
+// |
+// There are some encodings listed here just because they are commonly |
+// used. There is no interface language for them yet. They are not |
+// detected by Teragram, but can be detected from the meta info of the |
+// HTML page. |
+// |
+// For example, we have listed ARABIC_ENCODING but there is no Arabic in |
+// the Language enum. If the user inputs an Arabic query from the Google |
+// main page, Netscape will just send the raw bytes to GWS, and GWS |
+// will treat them as Latin1. Therefore, there is no use in detecting |
+// ARABIC_ENCODING for indexing, since such pages will never match the |
+// queries which are treated as Latin1 by GWS. On the contrary, if we |
+// treat a page with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will |
+// let it fall through as Latin1 at indexing time. And there might be a |
+// match for some Arabic queries which are also treated as Latin1 by |
+// GWS. In fact, some people are relying on this feature to do Arabic |
+// searches. |
+// |
+// Thus for these types of encodings, before we have the UI support for |
+// their language and have a pretty comprehensive language/encoding |
+// identification quality, it is better to revert them to |
+// UNKNOWN_ENCODING. |
+// |
+// This function checks whether the input encoding is one with |
+// an interface language. |
+bool IsEncodingWithSupportedLanguage(Encoding enc); |
+ |
+ |
+// |
+// LangsFromCountryCode and EncFromCountryCode |
+// ------------------------------------------- |
+// |
+// These two functions return the possible languages and encodings, |
+// respectively, according to the input country code, which is a |
+// 2-letter string. The country code is usually specified in the url |
+// of a document. |
+// |
+// |
+ |
+// LangsFromCountryCode |
+// -------------------- |
+// |
+// This function takes a string of arbitrary length. It treats the |
+// first 2 bytes of the string as the country code, as defined in iso |
+// 3166-1993 (E). It returns, via arguments, an array of the |
+// languages that are popular in that country, roughly in order of |
+// popularity, together with the size of the array. |
+// |
+// This function returns true if we have language information for |
+// country_code. Otherwise, it returns false. |
+// |
+bool LangsFromCountryCode(const char* country_code, |
+ const Language** lang_arry, |
+ int* num_langs); |
+ |
+ |
+// |
+// EncFromCountryCode |
+// ------------------ |
+// |
+// This function takes a string of arbitrary length. It treats the |
+// first 2 bytes of that string as the country code, as defined in iso |
+// 3166-1993 (E). It sets *enc to the encoding that is |
+// most often used for the languages spoken in that country. |
+// |
+// This function returns true if we have encoding information for |
+// country_code. Otherwise, it returns false, and *enc is set to |
+// UNKNOWN_ENCODING. |
+// |
+bool EncFromCountryCode(const char* country_code, Encoding* enc); |
+ |
+ |
+ |
+// VisualType |
+// ---------- |
+// |
+// Right-to-left documents may be in logical or visual order. When they |
+// are in visual order we convert them to logical order before processing. |
+// This enum lists the kinds of visual document we can encounter. |
+// Some, but not all, documents in Hebrew/Arabic/Persian etc. are visual. |
+// All other documents in those languages, and every document in a non-RTL |
+// language, are NOT_VISUAL_DOCUMENT (value 0). |
+// See http://wiki/Main/RtlLanguages for details. |
+enum VisualType { |
+ NOT_VISUAL_DOCUMENT = 0, |
+ VISUAL_HEBREW_HTML, // HTML documents in the legacy visual order. |
+ CONVERTED_RTL_PDF, // Converted RTL PDFs, which are always visual. |
+}; |
+ |
+VisualType default_visualtype(); |
+ |
+// VisualTypeName |
+// -------------- |
+// |
+// Given the visual type, returns a string name useful for debug output. |
+const char* VisualTypeName(VisualType visualtype); |
+ |
+ |
+ |
+// InitLangEnc |
+// ----------- |
+// |
+// Ensures the LangEnc module has been initialized. Normally this |
+// happens during InitGoogle, but this allows access for scripts that |
+// don't support InitGoogle. InitLangEnc calls InitEncodings (see |
+// i18n/encodings/public/encodings.h) and also initializes data |
+// structures used in lang_enc.cc. |
+// |
+void InitLangEnc(); |
+ |
+#endif // I18N_ENCODINGS_LANG_ENC_H__ |
Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\lang_enc.h |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |