| Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.cc
|
| ===================================================================
|
| --- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.cc (revision 0)
|
| +++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.cc (revision 0)
|
| @@ -0,0 +1,258 @@
|
| +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_impl.h"
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_basictypes.h"
|
| +
|
| +// Returned by DetectLanguageVersion(): "code_version - data_scrape_date"
|
| +static const char* kDetectLanguageVersion = "V1.6 - 20081121";
|
| +
|
| +// Large-table version for all ~160 languages (all Tiers)
|
| +
|
| +// Scan interchange-valid UTF-8 bytes and return the most likely language.
|
| +Language CompactLangDet::DetectLanguage(
|
| + const char* buffer,  // text to scan; forwarded to the impl unchanged
|
| + int buffer_length,  // forwarded with buffer (presumably its byte length)
|
| + bool is_plain_text,  // forwarded to the impl unchanged
|
| + bool* is_reliable) {  // forwarded to the impl; not touched afterwards
|
| + bool allow_extended_lang = false;  // fixed for this entry point
|
| + Language language3[3];  // scratch for the impl; discarded on return
|
| + int percent3[3];  // scratch for the impl; discarded on return
|
| + double normalized_score3[3];  // scratch for the impl; discarded on return
|
| + int text_bytes;  // scratch for the impl; discarded on return
|
| + int flags = 0;  // no flags set
|
| + Language plus_one = UNKNOWN_LANGUAGE;
|
| + const char* tld_hint = "";  // no hints are supplied by this entry point
|
| + int encoding_hint = UNKNOWN_ENCODING;
|
| + Language language_hint = UNKNOWN_LANGUAGE;
|
| +
|
| + Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
|
| + buffer,
|
| + buffer_length,
|
| + is_plain_text,
|
| + tld_hint, // empty here; e.g. "id" boosts Indonesian
|
| + encoding_hint, // unknown here; e.g. SJS boosts Japanese
|
| + language_hint, // unknown here; e.g. ITALIAN boosts it
|
| + allow_extended_lang,
|
| + flags,
|
| + plus_one,
|
| + language3,
|
| + percent3,
|
| + normalized_score3,
|
| + &text_bytes,
|
| + is_reliable);
|
| + // UNKNOWN_LANGUAGE is reported as ENGLISH; *is_reliable is not adjusted.
|
| + if (lang == UNKNOWN_LANGUAGE) {
|
| + lang = ENGLISH;
|
| + }
|
| + return lang;
|
| +}
|
| +
|
| +// Scan interchange-valid UTF-8 bytes and detect the list of top 3 languages.
|
| +Language CompactLangDet::DetectLanguageSummary(
|
| + const char* buffer,  // text to scan; forwarded to the impl unchanged
|
| + int buffer_length,  // forwarded with buffer (presumably its byte length)
|
| + bool is_plain_text,  // forwarded to the impl unchanged
|
| + Language* language3,  // caller-owned output, forwarded to the impl
|
| + int* percent3,  // caller-owned output, forwarded to the impl
|
| + int* text_bytes,  // caller-owned output, forwarded to the impl
|
| + bool* is_reliable) {  // forwarded to the impl; not touched afterwards
|
| + double normalized_score3[3];  // scratch for the impl; discarded on return
|
| + bool allow_extended_lang = false;  // fixed for this entry point
|
| + int flags = 0;  // no flags set
|
| + Language plus_one = UNKNOWN_LANGUAGE;
|
| + const char* tld_hint = "";  // no hints are supplied by this overload
|
| + int encoding_hint = UNKNOWN_ENCODING;
|
| + Language language_hint = UNKNOWN_LANGUAGE;
|
| +
|
| + Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
|
| + buffer,
|
| + buffer_length,
|
| + is_plain_text,
|
| + tld_hint, // empty here; e.g. "id" boosts Indonesian
|
| + encoding_hint, // unknown here; e.g. SJS boosts Japanese
|
| + language_hint, // unknown here; e.g. ITALIAN boosts it
|
| + allow_extended_lang,
|
| + flags,
|
| + plus_one,
|
| + language3,
|
| + percent3,
|
| + normalized_score3,
|
| + text_bytes,
|
| + is_reliable);
|
| + // Only the return value defaults to ENGLISH; the outputs are not adjusted.
|
| + if (lang == UNKNOWN_LANGUAGE) {
|
| + lang = ENGLISH;
|
| + }
|
| + return lang;
|
| +}
|
| +
|
| +// Same as the hint-free DetectLanguageSummary() overload, with hints supplied
|
| +// Scan interchange-valid UTF-8 bytes and detect the list of top 3 languages.
|
| +Language CompactLangDet::DetectLanguageSummary(
|
| + const char* buffer,  // text to scan; forwarded to the impl unchanged
|
| + int buffer_length,  // forwarded with buffer (presumably its byte length)
|
| + bool is_plain_text,  // forwarded to the impl unchanged
|
| + const char* tld_hint, // "id" boosts Indonesian
|
| + int encoding_hint, // SJS boosts Japanese
|
| + Language language_hint, // ITALIAN boosts it
|
| + Language* language3,  // caller-owned output, forwarded to the impl
|
| + int* percent3,  // caller-owned output, forwarded to the impl
|
| + int* text_bytes,  // caller-owned output, forwarded to the impl
|
| + bool* is_reliable) {  // forwarded to the impl; not touched afterwards
|
| + double normalized_score3[3];  // scratch for the impl; discarded on return
|
| + bool allow_extended_lang = false;  // fixed for this entry point
|
| + int flags = 0;  // no flags set
|
| + Language plus_one = UNKNOWN_LANGUAGE;
|
| +
|
| + Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
|
| + buffer,
|
| + buffer_length,
|
| + is_plain_text,
|
| + tld_hint, // "id" boosts Indonesian
|
| + encoding_hint, // SJS boosts Japanese
|
| + language_hint, // ITALIAN boosts it
|
| + allow_extended_lang,
|
| + flags,
|
| + plus_one,
|
| + language3,
|
| + percent3,
|
| + normalized_score3,
|
| + text_bytes,
|
| + is_reliable);
|
| + // Only the return value defaults to ENGLISH; the outputs are not adjusted.
|
| + if (lang == UNKNOWN_LANGUAGE) {
|
| + lang = ENGLISH;
|
| + }
|
| + return lang;
|
| +}
|
| +
|
| +
|
| +// Scan interchange-valid UTF-8 bytes and detect the list of top 3 extended
|
| +// languages. No hints are supplied; UNKNOWN_LANGUAGE is returned unchanged.
|
| +// Extended languages are additional Google interface languages and Unicode
|
| +// single-language scripts, from ext_lang_enc.h
|
| +Language CompactLangDet::ExtDetectLanguageSummary(
|
| + const char* buffer,  // text to scan; forwarded to the impl unchanged
|
| + int buffer_length,  // forwarded with buffer (presumably its byte length)
|
| + bool is_plain_text,  // forwarded to the impl unchanged
|
| + Language* language3,  // caller-owned output, forwarded to the impl
|
| + int* percent3,  // caller-owned output, forwarded to the impl
|
| + int* text_bytes,  // caller-owned output, forwarded to the impl
|
| + bool* is_reliable) {  // forwarded to the impl; not touched afterwards
|
| + double normalized_score3[3];  // scratch for the impl; discarded on return
|
| + bool allow_extended_lang = true;  // what makes this the "Ext" variant
|
| + int flags = 0;  // no flags set
|
| + Language plus_one = UNKNOWN_LANGUAGE;
|
| + const char* tld_hint = "";  // no hints are supplied by this overload
|
| + int encoding_hint = UNKNOWN_ENCODING;
|
| + Language language_hint = UNKNOWN_LANGUAGE;
|
| +
|
| + Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
|
| + buffer,
|
| + buffer_length,
|
| + is_plain_text,
|
| + tld_hint, // empty here; e.g. "id" boosts Indonesian
|
| + encoding_hint, // unknown here; e.g. SJS boosts Japanese
|
| + language_hint, // unknown here; e.g. ITALIAN boosts it
|
| + allow_extended_lang,
|
| + flags,
|
| + plus_one,
|
| + language3,
|
| + percent3,
|
| + normalized_score3,
|
| + text_bytes,
|
| + is_reliable);
|
| + // Do not default to English: the impl's result is returned as-is.
|
| + return lang;
|
| +}
|
| +
|
| +// Same as the hint-free ExtDetectLanguageSummary() overload, with hints
|
| +// supplied. Scan interchange-valid UTF-8 bytes and detect the list of top 3
|
| +// extended languages. UNKNOWN_LANGUAGE is returned unchanged.
|
| +// Extended languages are additional Google interface languages and Unicode
|
| +// single-language scripts, from ext_lang_enc.h
|
| +Language CompactLangDet::ExtDetectLanguageSummary(
|
| + const char* buffer,  // text to scan; forwarded to the impl unchanged
|
| + int buffer_length,  // forwarded with buffer (presumably its byte length)
|
| + bool is_plain_text,  // forwarded to the impl unchanged
|
| + const char* tld_hint, // "id" boosts Indonesian
|
| + int encoding_hint, // SJS boosts Japanese
|
| + Language language_hint, // ITALIAN boosts it
|
| + Language* language3,  // caller-owned output, forwarded to the impl
|
| + int* percent3,  // caller-owned output, forwarded to the impl
|
| + int* text_bytes,  // caller-owned output, forwarded to the impl
|
| + bool* is_reliable) {  // forwarded to the impl; not touched afterwards
|
| + double normalized_score3[3];  // scratch for the impl; discarded on return
|
| + bool allow_extended_lang = true;  // what makes this the "Ext" variant
|
| + int flags = 0;  // no flags set
|
| + Language plus_one = UNKNOWN_LANGUAGE;
|
| +
|
| + Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
|
| + buffer,
|
| + buffer_length,
|
| + is_plain_text,
|
| + tld_hint, // "id" boosts Indonesian
|
| + encoding_hint, // SJS boosts Japanese
|
| + language_hint, // ITALIAN boosts it
|
| + allow_extended_lang,
|
| + flags,
|
| + plus_one,
|
| + language3,
|
| + percent3,
|
| + normalized_score3,
|
| + text_bytes,
|
| + is_reliable);
|
| + // Do not default to English: the impl's result is returned as-is.
|
| + return lang;
|
| +}
|
| +
|
| +// Same as the hinted ExtDetectLanguageSummary() overload, and also returns
|
| +// internal language scores as a ratio to the normal score for real text in
|
| +// that language. Scores close to 1.0 indicate normal text, while scores far
|
| +// away from 1.0 indicate badly-skewed text or gibberish.
|
| +// Here normalized_score3 is caller-supplied instead of a discarded local.
|
| +Language CompactLangDet::ExtDetectLanguageSummary(
|
| + const char* buffer,  // text to scan; forwarded to the impl unchanged
|
| + int buffer_length,  // forwarded with buffer (presumably its byte length)
|
| + bool is_plain_text,  // forwarded to the impl unchanged
|
| + const char* tld_hint, // "id" boosts Indonesian
|
| + int encoding_hint, // SJS boosts Japanese
|
| + Language language_hint, // ITALIAN boosts it
|
| + Language* language3,  // caller-owned output, forwarded to the impl
|
| + int* percent3,  // caller-owned output, forwarded to the impl
|
| + double* normalized_score3,  // caller-owned output, forwarded to the impl
|
| + int* text_bytes,  // caller-owned output, forwarded to the impl
|
| + bool* is_reliable) {  // forwarded to the impl; not touched afterwards
|
| + bool allow_extended_lang = true;  // what makes this the "Ext" variant
|
| + int flags = 0;  // no flags set
|
| + Language plus_one = UNKNOWN_LANGUAGE;
|
| +
|
| + Language lang = CompactLangDetImpl::DetectLanguageSummaryV25(
|
| + buffer,
|
| + buffer_length,
|
| + is_plain_text,
|
| + tld_hint, // "id" boosts Indonesian
|
| + encoding_hint, // SJS boosts Japanese
|
| + language_hint, // ITALIAN boosts it
|
| + allow_extended_lang,
|
| + flags,
|
| + plus_one,
|
| + language3,
|
| + percent3,
|
| + normalized_score3,
|
| + text_bytes,
|
| + is_reliable);
|
| + // Do not default to English: the impl's result is returned as-is.
|
| + return lang;
|
| + }
|
| +
|
| +
|
| +
|
| +// Return the version text string held in kDetectLanguageVersion.
|
| +// String is "code_version - data_scrape_date"; it is a string literal.
|
| +const char* CompactLangDet::DetectLanguageVersion() {
|
| + return kDetectLanguageVersion;
|
| +}
|
|
|
| Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\compact_lang_det\compact_lang_det.cc
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|