| Index: chrome/common/translate/language_detection_util.cc
|
| diff --git a/chrome/common/translate/language_detection_util.cc b/chrome/common/translate/language_detection_util.cc
|
| deleted file mode 100644
|
| index f61331bc070a90881c1cb2a3dd1821e7909e2067..0000000000000000000000000000000000000000
|
| --- a/chrome/common/translate/language_detection_util.cc
|
| +++ /dev/null
|
| @@ -1,403 +0,0 @@
|
| -// Copyright 2013 The Chromium Authors. All rights reserved.
|
| -// Use of this source code is governed by a BSD-style license that can be
|
| -// found in the LICENSE file.
|
| -
|
| -#include "chrome/common/translate/language_detection_util.h"
|
| -
|
| -#include "base/logging.h"
|
| -#include "base/metrics/field_trial.h"
|
| -#include "base/strings/string_split.h"
|
| -#include "base/strings/string_util.h"
|
| -#include "base/strings/utf_string_conversions.h"
|
| -#include "base/time/time.h"
|
| -#include "chrome/common/chrome_constants.h"
|
| -#include "chrome/common/translate/translate_common_metrics.h"
|
| -#include "chrome/common/translate/translate_util.h"
|
| -
|
| -#if !defined(CLD_VERSION) || CLD_VERSION==1
|
| -#include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
|
| -#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
|
| -#endif
|
| -
|
| -#if !defined(CLD_VERSION) || CLD_VERSION==2
|
| -#include "third_party/cld_2/src/public/compact_lang_det.h"
|
| -#endif
|
| -
|
| -namespace {
|
| -
|
| -// Similar language code list. Some languages are very similar and difficult
|
| -// for CLD to distinguish.
|
| -struct SimilarLanguageCode {
|
| - const char* const code;
|
| - int group;
|
| -};
|
| -
|
| -const SimilarLanguageCode kSimilarLanguageCodes[] = {
|
| - {"bs", 1},
|
| - {"hr", 1},
|
| - {"hi", 2},
|
| - {"ne", 2},
|
| -};
|
| -
|
| -// Checks |kSimilarLanguageCodes| and returns group code.
|
| -int GetSimilarLanguageGroupCode(const std::string& language) {
|
| - for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
|
| - if (language.find(kSimilarLanguageCodes[i].code) != 0)
|
| - continue;
|
| - return kSimilarLanguageCodes[i].group;
|
| - }
|
| - return 0;
|
| -}
|
| -
|
| -// Well-known languages which often have wrong server configuration of
|
| -// Content-Language: en.
|
| -// TODO(toyoshim): Remove these static tables and caller functions to
|
| -// chrome/common/translate, and implement them as std::set<>.
|
| -const char* kWellKnownCodesOnWrongConfiguration[] = {
|
| - "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
|
| -};
|
| -
|
| -// Applies a series of language code modification in proper order.
|
| -void ApplyLanguageCodeCorrection(std::string* code) {
|
| - // Correct well-known format errors.
|
| - LanguageDetectionUtil::CorrectLanguageCodeTypo(code);
|
| -
|
| - if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) {
|
| - *code = std::string();
|
| - return;
|
| - }
|
| -
|
| - TranslateUtil::ToTranslateLanguageSynonym(code);
|
| -}
|
| -
|
| -int GetCLDMajorVersion() {
|
| -#if !defined(CLD_VERSION)
|
| - std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");
|
| - if (group_name == "CLD2")
|
| - return 2;
|
| - else
|
| - return 1;
|
| -#else
|
| - return CLD_VERSION;
|
| -#endif
|
| -}
|
| -
|
| -// Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
|
| -// failed.
|
| -// |is_cld_reliable| will be set as true if CLD says the detection is reliable.
|
| -std::string DetermineTextLanguage(const base::string16& text,
|
| - bool* is_cld_reliable) {
|
| - std::string language = chrome::kUnknownLanguageCode;
|
| - int text_bytes = 0;
|
| - bool is_reliable = false;
|
| -
|
| - // Language or CLD2::Language
|
| - int cld_language = 0;
|
| - bool is_valid_language = false;
|
| -
|
| - switch (GetCLDMajorVersion()) {
|
| -#if !defined(CLD_VERSION) || CLD_VERSION==1
|
| - case 1: {
|
| - int num_languages = 0;
|
| - cld_language =
|
| - DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
|
| - &num_languages, NULL, &text_bytes);
|
| - is_valid_language = cld_language != NUM_LANGUAGES &&
|
| - cld_language != UNKNOWN_LANGUAGE &&
|
| - cld_language != TG_UNKNOWN_LANGUAGE;
|
| - break;
|
| - }
|
| -#endif
|
| -#if !defined(CLD_VERSION) || CLD_VERSION==2
|
| - case 2: {
|
| - std::string utf8_text(UTF16ToUTF8(text));
|
| - CLD2::Language language3[3];
|
| - int percent3[3];
|
| - cld_language =
|
| - CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true,
|
| - language3, percent3,
|
| - &text_bytes, &is_reliable);
|
| - is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
|
| - cld_language != CLD2::UNKNOWN_LANGUAGE &&
|
| - cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
|
| - break;
|
| - }
|
| -#endif
|
| - default:
|
| - NOTREACHED();
|
| - }
|
| -
|
| - if (is_cld_reliable != NULL)
|
| - *is_cld_reliable = is_reliable;
|
| -
|
| - // We don't trust the result if the CLD reports that the detection is not
|
| - // reliable, or if the actual text used to detect the language was less than
|
| - // 100 bytes (short texts can often lead to wrong results).
|
| - // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
|
| - // the determined language code is correct with 50% confidence. Chrome should
|
| - // handle the real confidence value to judge.
|
| - if (is_reliable && text_bytes >= 100 && is_valid_language) {
|
| - // We should not use LanguageCode_ISO_639_1 because it does not cover all
|
| - // the languages CLD can detect. As a result, it'll return the invalid
|
| - // language code for tradtional Chinese among others.
|
| - // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
|
| - // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
|
| - // for Simplified Chinese.
|
| - switch (GetCLDMajorVersion()) {
|
| -#if !defined(CLD_VERSION) || CLD_VERSION==1
|
| - case 1:
|
| - language =
|
| - LanguageCodeWithDialects(static_cast<Language>(cld_language));
|
| - break;
|
| -#endif
|
| -#if !defined(CLD_VERSION) || CLD_VERSION==2
|
| - case 2:
|
| - // (1) CLD2's LanguageCode returns general Chinese 'zh' for
|
| - // CLD2::CHINESE, but Translate server doesn't accept it. This is
|
| - // converted to 'zh-CN' in the same way as CLD1's
|
| - // LanguageCodeWithDialects.
|
| - //
|
| - // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
|
| - // CLD2::CHINESE_T. This is technically more precise for the language
|
| - // code of traditional Chinese, while Translate server hasn't accepted
|
| - // zh-Hant yet.
|
| - if (cld_language == CLD2::CHINESE) {
|
| - language = "zh-CN";
|
| - } else if (cld_language == CLD2::CHINESE_T) {
|
| - language = "zh-TW";
|
| - } else {
|
| - language =
|
| - CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
|
| - }
|
| - break;
|
| -#endif
|
| - default:
|
| - NOTREACHED();
|
| - }
|
| - }
|
| - VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
|
| - << "\n*************************************\n";
|
| - return language;
|
| -}
|
| -
|
| -// Checks if CLD can complement a sub code when the page language doesn't know
|
| -// the sub code.
|
| -bool CanCLDComplementSubCode(
|
| - const std::string& page_language, const std::string& cld_language) {
|
| - // Translate server cannot treat general Chinese. If Content-Language and
|
| - // CLD agree that the language is Chinese and Content-Language doesn't know
|
| - // which dialect is used, CLD language has priority.
|
| - // TODO(hajimehoshi): How about the other dialects like zh-MO?
|
| - return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
|
| -}
|
| -
|
| -} // namespace
|
| -
|
| -namespace LanguageDetectionUtil {
|
| -
|
| -std::string DeterminePageLanguage(const std::string& code,
|
| - const std::string& html_lang,
|
| - const base::string16& contents,
|
| - std::string* cld_language_p,
|
| - bool* is_cld_reliable_p) {
|
| - base::TimeTicks begin_time = base::TimeTicks::Now();
|
| - bool is_cld_reliable;
|
| - std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
|
| - TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time,
|
| - base::TimeTicks::Now());
|
| -
|
| - if (cld_language_p != NULL)
|
| - *cld_language_p = cld_language;
|
| - if (is_cld_reliable_p != NULL)
|
| - *is_cld_reliable_p = is_cld_reliable;
|
| - TranslateUtil::ToTranslateLanguageSynonym(&cld_language);
|
| -
|
| - // Check if html lang attribute is valid.
|
| - std::string modified_html_lang;
|
| - if (!html_lang.empty()) {
|
| - modified_html_lang = html_lang;
|
| - ApplyLanguageCodeCorrection(&modified_html_lang);
|
| - TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang);
|
| - VLOG(9) << "html lang based language code: " << modified_html_lang;
|
| - }
|
| -
|
| - // Check if Content-Language is valid.
|
| - std::string modified_code;
|
| - if (!code.empty()) {
|
| - modified_code = code;
|
| - ApplyLanguageCodeCorrection(&modified_code);
|
| - TranslateCommonMetrics::ReportContentLanguage(code, modified_code);
|
| - }
|
| -
|
| - // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
|
| - // |modified_code|.
|
| - std::string language = modified_html_lang.empty() ? modified_code :
|
| - modified_html_lang;
|
| -
|
| - // If |language| is empty, just use CLD result even though it might be
|
| - // chrome::kUnknownLanguageCode.
|
| - if (language.empty()) {
|
| - TranslateCommonMetrics::ReportLanguageVerification(
|
| - TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY);
|
| - return cld_language;
|
| - }
|
| -
|
| - if (cld_language == chrome::kUnknownLanguageCode) {
|
| - TranslateCommonMetrics::ReportLanguageVerification(
|
| - TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN);
|
| - return language;
|
| - } else if (CanCLDComplementSubCode(language, cld_language)) {
|
| - TranslateCommonMetrics::ReportLanguageVerification(
|
| - TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
|
| - return cld_language;
|
| - } else if (IsSameOrSimilarLanguages(language, cld_language)) {
|
| - TranslateCommonMetrics::ReportLanguageVerification(
|
| - TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE);
|
| - return language;
|
| - } else if (MaybeServerWrongConfiguration(language, cld_language)) {
|
| - TranslateCommonMetrics::ReportLanguageVerification(
|
| - TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD);
|
| - return cld_language;
|
| - } else {
|
| - TranslateCommonMetrics::ReportLanguageVerification(
|
| - TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE);
|
| - // Content-Language value might be wrong because CLD says that this page
|
| - // is written in another language with confidence.
|
| - // In this case, Chrome doesn't rely on any of the language codes, and
|
| - // gives up suggesting a translation.
|
| - return std::string(chrome::kUnknownLanguageCode);
|
| - }
|
| -
|
| - return language;
|
| -}
|
| -
|
| -void CorrectLanguageCodeTypo(std::string* code) {
|
| - DCHECK(code);
|
| -
|
| - size_t coma_index = code->find(',');
|
| - if (coma_index != std::string::npos) {
|
| - // There are more than 1 language specified, just keep the first one.
|
| - *code = code->substr(0, coma_index);
|
| - }
|
| - TrimWhitespaceASCII(*code, TRIM_ALL, code);
|
| -
|
| - // An underscore instead of a dash is a frequent mistake.
|
| - size_t underscore_index = code->find('_');
|
| - if (underscore_index != std::string::npos)
|
| - (*code)[underscore_index] = '-';
|
| -
|
| - // Change everything up to a dash to lower-case and everything after to upper.
|
| - size_t dash_index = code->find('-');
|
| - if (dash_index != std::string::npos) {
|
| - *code = StringToLowerASCII(code->substr(0, dash_index)) +
|
| - StringToUpperASCII(code->substr(dash_index));
|
| - } else {
|
| - *code = StringToLowerASCII(*code);
|
| - }
|
| -}
|
| -
|
| -bool IsValidLanguageCode(const std::string& code) {
|
| - // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
|
| - // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
|
| - std::vector<std::string> chunks;
|
| - base::SplitString(code, '-', &chunks);
|
| -
|
| - if (chunks.size() < 1 || 2 < chunks.size())
|
| - return false;
|
| -
|
| - const std::string& main_code = chunks[0];
|
| -
|
| - if (main_code.size() < 1 || 3 < main_code.size())
|
| - return false;
|
| -
|
| - for (std::string::const_iterator it = main_code.begin();
|
| - it != main_code.end(); ++it) {
|
| - if (!IsAsciiAlpha(*it))
|
| - return false;
|
| - }
|
| -
|
| - if (chunks.size() == 1)
|
| - return true;
|
| -
|
| - const std::string& sub_code = chunks[1];
|
| -
|
| - if (sub_code.size() != 2)
|
| - return false;
|
| -
|
| - for (std::string::const_iterator it = sub_code.begin();
|
| - it != sub_code.end(); ++it) {
|
| - if (!IsAsciiAlpha(*it))
|
| - return false;
|
| - }
|
| -
|
| - return true;
|
| -}
|
| -
|
| -bool IsSameOrSimilarLanguages(const std::string& page_language,
|
| - const std::string& cld_language) {
|
| - std::vector<std::string> chunks;
|
| -
|
| - base::SplitString(page_language, '-', &chunks);
|
| - if (chunks.size() == 0)
|
| - return false;
|
| - std::string page_language_main_part = chunks[0];
|
| -
|
| - base::SplitString(cld_language, '-', &chunks);
|
| - if (chunks.size() == 0)
|
| - return false;
|
| - std::string cld_language_main_part = chunks[0];
|
| -
|
| - // Language code part of |page_language| is matched to one of |cld_language|.
|
| - // Country code is ignored here.
|
| - if (page_language_main_part == cld_language_main_part) {
|
| - // Languages are matched strictly. Reports false to metrics, but returns
|
| - // true.
|
| - TranslateCommonMetrics::ReportSimilarLanguageMatch(false);
|
| - return true;
|
| - }
|
| -
|
| - // Check if |page_language| and |cld_language| are in the similar language
|
| - // list and belong to the same language group.
|
| - int page_code = GetSimilarLanguageGroupCode(page_language);
|
| - bool match = page_code != 0 &&
|
| - page_code == GetSimilarLanguageGroupCode(cld_language);
|
| -
|
| - TranslateCommonMetrics::ReportSimilarLanguageMatch(match);
|
| - return match;
|
| -}
|
| -
|
| -bool MaybeServerWrongConfiguration(const std::string& page_language,
|
| - const std::string& cld_language) {
|
| - // If |page_language| is not "en-*", respect it and just return false here.
|
| - if (!StartsWithASCII(page_language, "en", false))
|
| - return false;
|
| -
|
| - // A server provides a language meta information representing "en-*". But it
|
| - // might be just a default value due to missing user configuration.
|
| - // Let's trust |cld_language| if the determined language is not difficult to
|
| - // distinguish from English, and the language is one of well-known languages
|
| - // which often provide "en-*" meta information mistakenly.
|
| - for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
|
| - if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
|
| - return true;
|
| - }
|
| - return false;
|
| -}
|
| -
|
| -std::string GetCLDVersion() {
|
| - switch (GetCLDMajorVersion()) {
|
| -#if !defined(CLD_VERSION) || CLD_VERSION==1
|
| - case 1:
|
| - return CompactLangDet::DetectLanguageVersion();
|
| -#endif
|
| -#if !defined(CLD_VERSION) || CLD_VERSION==2
|
| - case 2:
|
| - return CLD2::DetectLanguageVersion();
|
| -#endif
|
| - default:
|
| - NOTREACHED();
|
| - }
|
| - return "";
|
| -}
|
| -
|
| -} // namespace LanguageDetectionUtil
|
|
|