third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h - Issue 122007: [chromium-reviews] Add Compact Language Detection (CLD) library to Chrome. This works in Windows...

Unified Diff: third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h

Issue 122007: [chromium-reviews] Add Compact Language Detection (CLD) library to Chrome. This works in Windows... (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 11 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h

===================================================================

--- third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h (revision 0)

+++ third_party/cld/bar/toolbar/cld/i18n/languages/public/languages.h (revision 0)

@@ -0,0 +1,373 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#ifndef I18N_LANGUAGES_PUBLIC_LANGUAGES_H_

+#define I18N_LANGUAGES_PUBLIC_LANGUAGES_H_

+// This interface defines the Language enum and functions that depend

+// only on Language values.

+#ifndef SWIG

+// Language enum defined in languages.proto

+// Also description on how to add languages.

+#include "third_party/cld/bar/toolbar/cld/i18n/languages/proto/languages.pb.h"

+// We need this for compatibility:

+// - The Language enum in the default namespace.

+#else

+// And we must have a swig-compatible enum.

+// This one is a simple cleaned up version of language.proto, making the enum

+// compatible with C++.

+#include "i18n/languages/internal/languages_proto_wrapper.h"

+#endif

+const int kNumLanguages = NUM_LANGUAGES;

+// Return the default language (ENGLISH).

+Language default_language();

+// *******************************************

+// Language predicates

+// IsValidLanguage()

+// IS_LANGUAGE_UNKNOWN()

+// IsCJKLanguage()

+// IsChineseLanguage()

+// IsNorwegianLanguage()

+// IsPortugueseLanguage()

+// IsRightToLeftLanguage()

+// IsMaybeRightToLeftLanguage()

+// IsSameLanguage()

+// IsScriptRequiringLongerSnippets()

+// *******************************************

+// IsValidLanguage

+// ===============

+//

+// Function to check if the input is within range of the Language enum. If

+// IsValidLanguage(lang) returns true, it is safe to call

+// static_cast<Language>(lang).

+//

+inline bool IsValidLanguage(int lang) {

+ return ((lang >= 0) && (lang < kNumLanguages));

+// Return true if the language is "unknown". (This function was

+// previously a macro, hence the spelling in all caps.)

+//

+inline bool IS_LANGUAGE_UNKNOWN(Language lang) {

+ return lang == TG_UNKNOWN_LANGUAGE || lang == UNKNOWN_LANGUAGE;

+// IsCJKLanguage

+// -------------

+//

+// This function returns true if the language is either Chinese

+// (simplified or traditional), Japanese, or Korean.

+bool IsCJKLanguage(Language lang);

+// IsChineseLanguage

+// -----------------

+//

+// This function returns true if the language is either Chinese

+// (simplified or traditional)

+bool IsChineseLanguage(Language lang);

+// IsNorwegianLanguage

+// --------------------

+//

+// This function returns true if the language is any of the Norwegian

+// (regular or Nynorsk).

+bool IsNorwegianLanguage(Language lang);

+// IsPortugueseLanguage

+// --------------------

+//

+// This function returns true if the language is any of the Portuguese

+// languages (regular, Portugal or Brazil)

+bool IsPortugueseLanguage(Language lang);

+// IsSameLanguage

+// --------------

+//

+// WARNING: This function provides only a simple test on the values of

+// the two Language arguments. It returns false if either language is

+// invalid. It returns true if the language arguments are equal, or

+// if they are both Chinese languages, both Norwegian languages, or

+// both Portuguese languages, as defined by IsChineseLanguage,

+// IsNorwegianLanguage, and IsPortugueseLanguage. Otherwise it returns

+// false.

+bool IsSameLanguage(Language lang1, Language lang2);

+// IsRightToLeftLanguage

+// ---------------------

+//

+// This function returns true if the language is only written right-to-left

+// (E.g., Hebrew, Arabic, Persian etc.)

+//

+// IMPORTANT NOTE: Technically we're talking about scripts, not languages.

+// There are languages that can be written in more than one script.

+// Examples:

+// - Kurdish and Azeri ('AZERBAIJANI') can be written left-to-right in

+// Latin or Cyrillic script, and right-to-left in Arabic script.

+// - Sindhi and Punjabi are written in different scripts, depending on

+// region and dialect.

+// - Turkmen used an Arabic script historically, but not any more.

+// - Pashto and Uyghur can use Arabic script, but use a Roman script

+// on the Internet.

+// - Kashmiri and Urdu are written either with Arabic or Devanagari script.

+//

+// This function only returns true for languages that are always, unequivocally

+// written in right-to-left script.

+//

+// TODO : If we want to do anything special with multi-script languages

+// we should create new 'languages' for each language+script, as we do for

+// traditional vs. simplified Chinese. However most such languages are rare in

+// use and even rarer on the web, so this is unlikely to be something we'll

+// be concerned with for a while.

+bool IsRightToLeftLanguage(Language lang);

+// IsMaybeRightToLeftLanguage

+// --------------------------

+//

+// This function returns true if the language may appear on the web in a

+// right-to-left script (E.g., Hebrew, Arabic, Persian, Urdu, Kurdish, etc.)

+//

+// NOTE: See important notes under IsRightToLeftLanguage(...).

+//

+// This function returns true for languages that *may* appear on the web in a

+// right-to-left script, even if they may also appear in a left-to-right

+// script.

+//

+// This function should typically be used in cases where doing some work on

+// left-to-right text would be OK (usually a no-op), and this function is used

+// just to cut down on unnecessary work on regular, LTR text.

+bool IsMaybeRightToLeftLanguage(Language lang);

+// IsScriptRequiringLongerSnippets

+// --------------------

+//

+// This function returns true if the script chracteristics require longer

+// snippet length (Devanagari, Bengali, Gurmukhi,

+// Gujarati, Oriya, Tamil, Telugu, Kannada, Malayalam).

+// COMMENTED OUT TO REDUCE DEPENDENCIES ON GOOGLE3 CODE

+// bool IsScriptRequiringLongerSnippets(UnicodeScript script);

+// *******************************************

+// LANGUAGE NAMES

+//

+// This interface defines a standard name for each valid Language,

+// and a standard name for invalid languages. Some language names use all

+// uppercase letters, but others use mixed case.

+// LanguageName() [Language to name]

+// LanguageEnumName() [language to enum name]

+// LanguageFromName() [name to Language]

+// default_language_name()

+// invalid_language_name()

+// *******************************************

+// Given a Language, returns its standard name.

+// Return invalid_language_name() if the language is invalid.

+const char* LanguageName(Language lang);

+// Given a Language, return the name of the enum constant for that

+// language. In all but a few cases, this is the same as its standard

+// name. For example, LanguageName(CHINESE) returns "Chinese", but

+// LanguageEnumName(CHINESE) returns "CHINESE". This is intended for

+// code that is generating C++ code, where the enum constant is more

+// useful than its integer value. Return "NUM_LANGUAGES" if

+// the language is invalid.

+const char* LanguageEnumName(Language lang);

+// The maximum length of a standard language name.

+const int kMaxLanguageNameSize = 50;

+// The standard name for the default language.

+const char* default_language_name();

+// The standard name for all invalid languages.

+const char* invalid_language_name();

+// If lang_name matches the standard name of a Language, using a

+// case-insensitive comparison, set *language to that Language and

+// return true.

+// Otherwise, set *language to UNKNOWN_LANGUAGE and return false.

+//

+// For backwards compatibility, "HATIAN_CREOLE" is allowed as a name

+// for HAITIAN_CREOLE, and "QUECHAU" is allowed as a name for QUECHUA.

+// For compatibility with LanguageEnumName, "UNKNOWN_LANGUAGE" is allowed

+// as a name for UNKNOWN_LANGUAGE (the return value is true in this case,

+// as it is for "Unknown"), and "CHINESE_T" is allowed as a name for

+// CHINESE_T (i.e., a synonym for "ChineseT").

+//

+// REQUIRES: language must not be NULL.

+//

+bool LanguageFromName(const char* lang_name, Language *language);

+// *******************************************

+// LANGUAGE CODES

+//

+// This interface defines a standard code for each valid language, and

+// a standard code for invalid languages. These are derived from ISO codes,

+// with some Google additions.

+// LanguageCode()

+// default_language_code()

+// invalid_language_code()

+// LanguageCodeWithDialects()

+// LanguageCodeISO639_1()

+// LanguageCodeISO639_2()

+// *******************************************

+// Given a Language, return its standard code. There are Google-specific codes:

+// For CHINESE_T, return "zh-TW".

+// For TG_UNKNOWN_LANGUAGE, return "ut".

+// For UNKNOWN_LANGUAGE, return "un".

+// For PORTUGUESE_P, return "pt-PT".

+// For PORTUGUESE_B, return "pt-BR".

+// For LIMBU, return "sit-NP".

+// For CHEROKEE, return "chr".

+// For SYRIAC, return "syr".

+// Otherwise return the ISO 639-1 two-letter language code for lang.

+// If lang is invalid, return invalid_language_code().

+//

+// NOTE: See the note below about the codes for Chinese languages.

+//

+const char* LanguageCode(Language lang);

+// The maximum length of a language code.

+const int kMaxLanguageCodeSize = 50;

+// The standard code for the default language.

+const char* default_language_code();

+// The standard code for all invalid languages.

+const char* invalid_language_code();

+// --------------------------------------------

+// NOTE: CHINESE LANGUAGE CODES

+//

+// There are three functions that return codes for Chinese languages.

+// LanguageCode(lang) and LanguageCodeWithDialects(lang) are defined here.

+// LanguageCode(lang, encoding) is defined in i18n/encodings.lang_enc.h.

+// The following list shows the different results.

+//

+// LanguageCode(CHINESE) returns "zh"

+// LanguageCode(CHINESE_T) returns "zh-TW".

+//

+// LanguageCodeWithDialects(CHINESE) returns "zh-CN".

+// LanguageCodeWithDialects(CHINESE_T) returns "zh-TW".

+//

+// LanguageCode(CHINESE_T, <any encoding>) returns "zh-TW".

+// LanguageCode(CHINESE, CHINESE_BIG5) returns "zh-TW".

+// LanguageCode(CHINESE, <any other encoding>) returns "zh-CN".

+//

+// --------------------------------------------

+// LanguageCodeWithDialects

+// ------------------------

+//

+// If lang is CHINESE, return "zh-CN". Otherwise return LanguageCode(lang).

+const char* LanguageCodeWithDialects(Language lang);

+// LanguageCodeISO639_1

+// --------------------

+//

+// Return the ISO 639-1 two-letter language code for lang.

+// Return invalid_language_code() if lang is invalid or does not have

+// an ISO 639-1 two-letter language code.

+const char* LanguageCodeISO639_1(Language lang);

+// LanguageCodeISO639_2

+// --------------------

+//

+// Return the ISO 639-2 three-letter language for lang.

+// Return invalid_language_code() if lang is invalid or does not have

+// an ISO 639-2 three-letter language code.

+const char* LanguageCodeISO639_2(Language lang);

+// LanguageFromCode

+// ----------------

+//

+// If lang_code matches the code for a Language, using a case-insensitive

+// comparison, set *lang to that Language and return true.

+// Otherwise, set *lang to UNKNOWN_LANGUAGE and return false.

+//

+// lang_code can be an ISO 639-1 (two-letter) code, an ISO 639-2

+// (three-letter) code, or a Google-specific code (see LanguageCode).

+//

+// Certain language-code aliases are also allowed:

+// For "zh-cn" and "zh_cn", set *lang to CHINESE.

+// For "zh-tw" and "zh_tw", set *lang to CHINESE_T.

+// For "he", set *lang to HEBREW.

+// For "in", set *lang to INDONESIAN.

+// For "ji", set *lang to YIDDISH.

+// For "fil", set *lang to TAGALOG.

+//

+// REQUIRES: 'lang' must not be NULL.

+bool LanguageFromCode(const char* lang_code, Language *language);

+// LanguageFromCodeOrName

+// ----------------------

+//

+// If lang_code_or_name is a language code or a language name.

+// set *language to the corresponding Language and return true.

+// Otherwise set *language to UNKNOWN_LANGUAGE and return false.

+//

+bool LanguageFromCodeOrName(const char* lang_code_or_name,

+ Language* language);

+// LanguageNameFromCode

+// --------------------

+//

+// If language_code is the code for a Language (see LanguageFromCode),

+// return the standard name of that language (see LanguageName).

+// Otherwise return invalid_language_name().

+//

+const char* LanguageNameFromCode(const char* language_code);

+// Miscellany

+// LanguageCodeToUnderscoreForm

+// ----------------------------

+//

+// Given a language code, convert the dash "-" to underscore "_".

+// This is because some module of google3 use the underscore form

+// ISO 639 specification.

+//

+// Specifically, if result_length <= strlen(lang_code), set result[0]

+// to '\0' and return false. Otherwise, copy lang_code to result,

+// converting every dash to an underscore, converting every character

+// before the first dash or underscore to lower case, and converting

+// every character after the first dash or underscore to upper

+// case. If there is no dash or underscore, convert the entire string

+// to lower case.

+//

+// REQUIRES: 'lang_code' must not be NULL. 'result' must not be NULL.

+bool LanguageCodeToUnderscoreForm(const char* lang_code,

+ char* result,

+ int result_length);

+//

+// AlwaysPutInExpectedRestrict

+// ---------------------------

+//

+// For Web pages in certain top-level domains, Web Search always

+// applies a "country restrict". If 'tld' matches one of those, using

+// a case-SENSITIVE comparison, set *expected_language to the Language

+// most commonly found in that top-level domain and return true.

+// Otherwise, set *expected_language to UNKNOWN_LANGUAGE and return false.

+bool AlwaysPutInExpectedRestrict(const char *tld, Language *expected_language);

+#endif // I18N_LANGUAGES_PUBLIC_LANGUAGES_H_

Property changes on: third_party\cld\bar\toolbar\cld\i18n\languages\public\languages.h

___________________________________________________________________

Added: svn:eol-style

+ LF

« no previous file with comments | « third_party/cld/bar/toolbar/cld/i18n/languages/proto/languages.pb.h ('k') | third_party/cld/base/casts.h » ('j') | no next file with comments »