| Index: third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h
 | 
| ===================================================================
 | 
| --- third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h	(revision 0)
 | 
| +++ third_party/cld/bar/toolbar/cld/i18n/encodings/lang_enc.h	(revision 0)
 | 
| @@ -0,0 +1,255 @@
 | 
| +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
 | 
| +// Use of this source code is governed by a BSD-style license that can be
 | 
| +// found in the LICENSE file.
 | 
| +
 | 
| +// This file is for i18n. It contains two enums, namely Language and
 | 
| +// Encoding, where Language is the linguistic convention, and Encoding
 | 
| +// contains information on both language encoding and character set.
 | 
| +//
 | 
| +// The language and encoding are both based on Teragram's conventions,
 | 
| +// except for some common ISO-8859 encodings that are not detected by
 | 
| +// Teragram but might be in the future.
 | 
| +//
 | 
| +// This file also includes functions that do mappings among
 | 
| +// Language/Encoding enums, language/encoding string names (typically
 | 
| +// the output from Language Encoding identifier), and language codes
 | 
| +// (ISO 639), and two-letter country codes (ISO 3166).
 | 
| +//
 | 
| +// NOTE: Both Language and Encoding enums should always start from
 | 
| +// zero value. This assumption has been made and used.
 | 
| +//
 | 
| +
 | 
| +#ifndef I18N_ENCODINGS_LANG_ENC_H__
 | 
| +#define I18N_ENCODINGS_LANG_ENC_H__
 | 
| +
 | 
| +#include "third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h"
 | 
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/public/encodings.h"
 | 
| +
 | 
| +
 | 
| +// EncodingsForLanguage
 | 
| +// --------------------
 | 
| +//
 | 
| +// Given the language, returns a pointer to an array of encodings this
 | 
| +// language supports. Typically, the encs array has at least one
 | 
| +// element: UNKNOWN_ENCODING, which is always the last element of the
 | 
| +// array. The first encoding is the default encoding of the language.
 | 
| +// Return NULL if the input is invalid.
 | 
| +//
 | 
| +// Note: The output encoding array does not include ASCII_7BIT, UTF8
 | 
| +// or UNICODE which are good for all languages. TODO: Find out whether
 | 
| +// it is better to include ASCII_7BIT, UTF8 and UNICODE or leave them
 | 
| +// as special cases.
 | 
| +//
 | 
| +const Encoding* EncodingsForLanguage(Language lang);
 | 
| +
 | 
| +
 | 
| +// DefaultEncodingForLanguage
 | 
| +// --------------------------
 | 
| +//
 | 
| +// Given the language, returns the default encoding for the language
 | 
| +// via the argument encoding.
 | 
| +//
 | 
| +// The function returns true if the input lang is valid. Otherwise,
 | 
| +// false is returned, and encoding is set to UNKNOWN_ENCODING.
 | 
| +//
 | 
| +bool DefaultEncodingForLanguage(Language lang,
 | 
| +                                Encoding *encoding);
 | 
| +
 | 
| +// LanguagesForEncoding
 | 
| +// --------------------
 | 
| +//
 | 
| +// Given the encoding, returns a pointer to an array of languages this
 | 
| +// encoding supports. Typically, the langs array has at least one
 | 
| +// element: UNKNOWN_LANGUAGE, which is always the last element of the
 | 
| +// array. The first language in the array is the most popular
 | 
| +// language for that encoding. NULL is returned if the input is
 | 
| +// invalid.
 | 
| +//
 | 
| +// Note: For ASCII_7BIT, UNICODE and UTF8, only ENGLISH and
 | 
| +// UNKNOWN_LANGUAGE are returned. TODO: Find out whether to return all
 | 
| +// the languages or to treat these encodings as special cases.
 | 
| +//
 | 
| +// For other known encodings, ENGLISH is always included. This is
 | 
| +// because English (Latin) characters are included in each encoding.
 | 
| +//
 | 
| +const Language* LanguagesForEncoding(Encoding enc);
 | 
| +
 | 
| +// DefaultLanguageForEncoding
 | 
| +// --------------------------
 | 
| +//
 | 
| +// Given the encoding, returns the default language for that encoding
 | 
| +// via the argument language.
 | 
| +//
 | 
| +// The function returns true if the input enc is valid. Otherwise,
 | 
| +// false is returned, and language is set to UNKNOWN_LANGUAGE.
 | 
| +//
 | 
| +// Note, this function is more useful for the encodings that have only
 | 
| +// one corresponding language, e.g. shift_jis => Japanese. There are
 | 
| +// cases where multiple languages have the same encoding, for which the
 | 
| +// default language is an arbitrary choice from them.
 | 
| +//
 | 
| +bool DefaultLanguageForEncoding(Encoding enc, Language* language);
 | 
| +
 | 
| +//
 | 
| +// IsLangEncCompatible
 | 
| +// -------------------
 | 
| +//
 | 
| +// This function is to determine whether the input language and
 | 
| +// encoding are compatible. For example, FRENCH and LATIN1 are
 | 
| +// compatible, but FRENCH and GB are not.
 | 
| +//
 | 
| +// If either lang or enc is invalid return false.
 | 
| +// If lang is unknown, return true.
 | 
| +//    (e.g. we can detect a page's encoding as latin1 from metatag info, but
 | 
| +//     cannot derive its language since there is more than one
 | 
| +//     language encoded in Latin1)
 | 
| +// If language is known, but encoding is unknown, return false.
 | 
| +//    (return true will do us no good since we cannot convert to UTF8 anyway)
 | 
| +// If enc is unicode or utf8, return true.
 | 
| +// Otherwise check if lang is supported by enc and enc supported by
 | 
| +// lang.
 | 
| +//
 | 
| +bool IsLangEncCompatible(Language lang, Encoding enc);
 | 
| +
 | 
| +//
 | 
| +// DominantLanguageFromEncoding
 | 
| +// ----------------------------
 | 
| +//
 | 
| +// This function determines if there exists a dominant language for the
 | 
| +// input encoding. For example, the encoding GB has a dominant
 | 
| +// language (Chinese), but Latin1 does not.
 | 
| +//
 | 
| +// The word "dominant" is used here because English characters are
 | 
| +// included in each encoding.
 | 
| +//
 | 
| +// If there is no dominant language for the encoding, such as Latin1,
 | 
| +// UNKNOWN_LANGUAGE is returned.
 | 
| +//
 | 
| +Language DominantLanguageFromEncoding(Encoding enc);
 | 
| +
 | 
| +// LanguageCode
 | 
| +// ------------------------
 | 
| +// Given the Language and Encoding, return language code with dialects
 | 
| +// (>= 2 letters).  Encoding is necessary to disambiguate between
 | 
| +// Simplified and Traditional Chinese.
 | 
| +//
 | 
| +// See the note on Chinese Language Codes in
 | 
| +// i18n/languages/public/languages.h
 | 
| +// for the details.
 | 
| +
 | 
| +const char* LanguageCode(Language lang, Encoding enc);
 | 
| +
 | 
| +//
 | 
| +// IsEncodingWithSupportedLanguage()
 | 
| +// ---------------------------------
 | 
| +//
 | 
| +// There are some encodings listed here just because they are commonly
 | 
| +// used.  There is no interface language for them yet. They are not
 | 
| +// detected by Teragram, but can be detected from the meta info of the
 | 
| +// HTML page.
 | 
| +//
 | 
| +// For example, we have listed ARABIC_ENCODING but there is no Arabic in
 | 
| +// the Language enum. If the user inputs an Arabic query from Google
 | 
| +// main page, Netscape will just send the raw bytes to GWS, and GWS
 | 
| +// will treat them as Latin1.  Therefore, there is no use to detect
 | 
| +// ARABIC_ENCODING for indexing, since they will never match the
 | 
| +// queries which are treated as Latin1 by GWS. On the contrary, if we
 | 
| +// treat pages with ARABIC_ENCODING as UNKNOWN_ENCODING, Google will
 | 
| +// let them fall through as Latin1 at indexing time. And there might be a
 | 
| +// match for some ARABIC queries which are also treated as Latin1 by
 | 
| +// GWS. In fact, some people are relying on this feature to do Arabic
 | 
| +// searches.
 | 
| +//
 | 
| +// Thus for these types of encodings, before we have the UI support for
 | 
| +// their language and have a pretty comprehensive language/encoding
 | 
| +// identification quality, it is better to revert them as
 | 
| +// UNKNOWN_ENCODING.
 | 
| +//
 | 
| +// This function checks whether the input encoding is one with
 | 
| +// an interface language.
 | 
| +bool IsEncodingWithSupportedLanguage(Encoding enc);
 | 
| +
 | 
| +
 | 
| +//
 | 
| +// LangsFromCountryCode and EncFromCountryCode
 | 
| +// -------------------------------------------
 | 
| +//
 | 
| +// These two functions return the possible languages and encodings,
 | 
| +// respectively, according to the input country code, which is a
 | 
| +// 2-letter string. The country code is usually specified in the url
 | 
| +// of a document.
 | 
| +//
 | 
| +//
 | 
| +
 | 
| +// LangsFromCountryCode
 | 
| +// --------------------
 | 
| +//
 | 
| +// This function takes a string of arbitrary length. It treats the
 | 
| +// first 2 bytes of the string as the country code, as defined in iso
 | 
| +// 3166-1993 (E).  It returns, via arguments, an array of the
 | 
| +// languages that are popular in that country, roughly in order of
 | 
| +// popularity, together with the size of the array.
 | 
| +//
 | 
| +// This function returns true if we have language information for
 | 
| +// country_code.  Otherwise, it returns false.
 | 
| +//
 | 
| +bool LangsFromCountryCode(const char* country_code,
 | 
| +                          const Language** lang_arry,
 | 
| +                          int* num_langs);
 | 
| +
 | 
| +
 | 
| +//
 | 
| +// EncFromCountryCode
 | 
| +// ------------------
 | 
| +//
 | 
| +// This function takes a string of arbitrary length. It treats the
 | 
| +// first 2 bytes of that string as the country code, as defined in iso
 | 
| +// 3166-1993 (E). It sets *enc to the encoding that is
 | 
| +// most often used for the languages spoken in that country.
 | 
| +//
 | 
| +// This function returns true if we have encoding information for
 | 
| +// country_code.  Otherwise, it returns false, and *enc is set to
 | 
| +// UNKNOWN_ENCODING.
 | 
| +//
 | 
| +bool EncFromCountryCode(const char* country_code, Encoding* enc);
 | 
| +
 | 
| +
 | 
| +
 | 
| +// VisualType
 | 
| +// ----------
 | 
| +//
 | 
| +// Right-to-left documents may be in logical or visual order. When they
 | 
| +// are in visual order we convert them to logical order before processing.
 | 
| +// This enum lists the types of visual document we can encounter.
 | 
| +// Some, but not all, documents in Hebrew/Arabic/Persian etc. will be visual.
 | 
| +// The other documents in those languages, and all documents in non-RTL
 | 
| +// languages, will be NOT_VISUAL_DOCUMENT.
 | 
| +// See http://wiki/Main/RtlLanguages for details.
 | 
| +enum VisualType {
 | 
| +  NOT_VISUAL_DOCUMENT = 0,  // Logical-order RTL docs and all non-RTL docs.
 | 
| +  VISUAL_HEBREW_HTML,  // HTML documents in the legacy visual order.
 | 
| +  CONVERTED_RTL_PDF,   // Converted RTL PDFs, which are always visual.
 | 
| +};
 | 
| +
 | 
| +VisualType default_visualtype();
 | 
| +
 | 
| +// VisualTypeName
 | 
| +// --------------
 | 
| +//
 | 
| +// Given the visual type, returns a string name useful for debug output.
 | 
| +const char* VisualTypeName(VisualType visualtype);
 | 
| +
 | 
| +
 | 
| +
 | 
| +// InitLangEnc
 | 
| +// -----------
 | 
| +//
 | 
| +// Ensures the LangEnc module has been initialized.  Normally this
 | 
| +// happens during InitGoogle, but this allows access for scripts that
 | 
| +// don't support InitGoogle. InitLangEnc calls InitEncodings (see
 | 
| +// i18n/encodings/public/encodings.h) and also initializes data
 | 
| +// structures used in lang_enc.cc.
 | 
| +//
 | 
| +void InitLangEnc();
 | 
| +
 | 
| +#endif  // I18N_ENCODINGS_LANG_ENC_H__
 | 
| 
 | 
| Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\lang_enc.h
 | 
| ___________________________________________________________________
 | 
| Added: svn:eol-style
 | 
|    + LF
 | 
| 
 | 
| 
 |