Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4062)

Unified Diff: chrome/common/translate/language_detection_util.cc

Issue 25531002: Move language detection to a component (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Run translate unittests on iOS Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/common/translate/language_detection_util.cc
diff --git a/chrome/common/translate/language_detection_util.cc b/chrome/common/translate/language_detection_util.cc
deleted file mode 100644
index f61331bc070a90881c1cb2a3dd1821e7909e2067..0000000000000000000000000000000000000000
--- a/chrome/common/translate/language_detection_util.cc
+++ /dev/null
@@ -1,403 +0,0 @@
-// Copyright 2013 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "chrome/common/translate/language_detection_util.h"
-
-#include "base/logging.h"
-#include "base/metrics/field_trial.h"
-#include "base/strings/string_split.h"
-#include "base/strings/string_util.h"
-#include "base/strings/utf_string_conversions.h"
-#include "base/time/time.h"
-#include "chrome/common/chrome_constants.h"
-#include "chrome/common/translate/translate_common_metrics.h"
-#include "chrome/common/translate/translate_util.h"
-
-#if !defined(CLD_VERSION) || CLD_VERSION==1
-#include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
-#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
-#endif
-
-#if !defined(CLD_VERSION) || CLD_VERSION==2
-#include "third_party/cld_2/src/public/compact_lang_det.h"
-#endif
-
-namespace {
-
-// Similar language code list. Some languages are very similar and difficult
-// for CLD to distinguish.
-struct SimilarLanguageCode {
- const char* const code;
- int group;
-};
-
-const SimilarLanguageCode kSimilarLanguageCodes[] = {
- {"bs", 1},
- {"hr", 1},
- {"hi", 2},
- {"ne", 2},
-};
-
-// Checks |kSimilarLanguageCodes| and returns group code.
-int GetSimilarLanguageGroupCode(const std::string& language) {
- for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
- if (language.find(kSimilarLanguageCodes[i].code) != 0)
- continue;
- return kSimilarLanguageCodes[i].group;
- }
- return 0;
-}
-
-// Well-known languages which often have wrong server configuration of
-// Content-Language: en.
-// TODO(toyoshim): Remove these static tables and caller functions to
-// chrome/common/translate, and implement them as std::set<>.
-const char* kWellKnownCodesOnWrongConfiguration[] = {
- "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
-};
-
-// Applies a series of language code modifications in the proper order.
-void ApplyLanguageCodeCorrection(std::string* code) {
- // Correct well-known format errors.
- LanguageDetectionUtil::CorrectLanguageCodeTypo(code);
-
- if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) {
- *code = std::string();
- return;
- }
-
- TranslateUtil::ToTranslateLanguageSynonym(code);
-}
-
-int GetCLDMajorVersion() {
-#if !defined(CLD_VERSION)
- std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");
- if (group_name == "CLD2")
- return 2;
- else
- return 1;
-#else
- return CLD_VERSION;
-#endif
-}
-
-// Returns the ISO 639 language code of the specified |text|, or
-// chrome::kUnknownLanguageCode if detection failed or is not trusted.
-// |is_cld_reliable|, if not NULL, is set to whether CLD reports reliability.
-std::string DetermineTextLanguage(const base::string16& text,
- bool* is_cld_reliable) {
- std::string language = chrome::kUnknownLanguageCode;
- int text_bytes = 0;
- bool is_reliable = false;
-
- // Language or CLD2::Language
- int cld_language = 0;
- bool is_valid_language = false;
-
- switch (GetCLDMajorVersion()) {
-#if !defined(CLD_VERSION) || CLD_VERSION==1
- case 1: {
- int num_languages = 0;
- cld_language =
- DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
- &num_languages, NULL, &text_bytes);
- is_valid_language = cld_language != NUM_LANGUAGES &&
- cld_language != UNKNOWN_LANGUAGE &&
- cld_language != TG_UNKNOWN_LANGUAGE;
- break;
- }
-#endif
-#if !defined(CLD_VERSION) || CLD_VERSION==2
- case 2: {
- std::string utf8_text(UTF16ToUTF8(text));
- CLD2::Language language3[3];
- int percent3[3];
- cld_language =
- CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true,
- language3, percent3,
- &text_bytes, &is_reliable);
- is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
- cld_language != CLD2::UNKNOWN_LANGUAGE &&
- cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
- break;
- }
-#endif
- default:
- NOTREACHED();
- }
-
- if (is_cld_reliable != NULL)
- *is_cld_reliable = is_reliable;
-
- // We don't trust the result if the CLD reports that the detection is not
- // reliable, or if the actual text used to detect the language was less than
- // 100 bytes (short texts can often lead to wrong results).
- // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
- // the determined language code is correct with 50% confidence. Chrome should
- // handle the real confidence value to judge.
- if (is_reliable && text_bytes >= 100 && is_valid_language) {
- // We should not use LanguageCode_ISO_639_1 because it does not cover all
- // the languages CLD can detect. As a result, it'll return an invalid
- // language code for traditional Chinese among others.
- // |LanguageCodeWithDialects| will go through ISO 639-1, ISO-639-2 and
- // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
- // for Simplified Chinese.
- switch (GetCLDMajorVersion()) {
-#if !defined(CLD_VERSION) || CLD_VERSION==1
- case 1:
- language =
- LanguageCodeWithDialects(static_cast<Language>(cld_language));
- break;
-#endif
-#if !defined(CLD_VERSION) || CLD_VERSION==2
- case 2:
- // (1) CLD2's LanguageCode returns general Chinese 'zh' for
- // CLD2::CHINESE, but Translate server doesn't accept it. This is
- // converted to 'zh-CN' in the same way as CLD1's
- // LanguageCodeWithDialects.
- //
- // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
- // CLD2::CHINESE_T. This is technically more precise for the language
- // code of traditional Chinese, while Translate server hasn't accepted
- // zh-Hant yet.
- if (cld_language == CLD2::CHINESE) {
- language = "zh-CN";
- } else if (cld_language == CLD2::CHINESE_T) {
- language = "zh-TW";
- } else {
- language =
- CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
- }
- break;
-#endif
- default:
- NOTREACHED();
- }
- }
- VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
- << "\n*************************************\n";
- return language;
-}
-
-// Checks if CLD can complement a sub code when the page language doesn't know
-// the sub code.
-bool CanCLDComplementSubCode(
- const std::string& page_language, const std::string& cld_language) {
- // Translate server cannot treat general Chinese. If Content-Language and
- // CLD agree that the language is Chinese and Content-Language doesn't know
- // which dialect is used, CLD language has priority.
- // TODO(hajimehoshi): How about the other dialects like zh-MO?
- return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
-}
-
-} // namespace
-
-namespace LanguageDetectionUtil {
-
-std::string DeterminePageLanguage(const std::string& code,
- const std::string& html_lang,
- const base::string16& contents,
- std::string* cld_language_p,
- bool* is_cld_reliable_p) {
- base::TimeTicks begin_time = base::TimeTicks::Now();
- bool is_cld_reliable;
- std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
- TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time,
- base::TimeTicks::Now());
-
- if (cld_language_p != NULL)
- *cld_language_p = cld_language;
- if (is_cld_reliable_p != NULL)
- *is_cld_reliable_p = is_cld_reliable;
- TranslateUtil::ToTranslateLanguageSynonym(&cld_language);
-
- // Check if html lang attribute is valid.
- std::string modified_html_lang;
- if (!html_lang.empty()) {
- modified_html_lang = html_lang;
- ApplyLanguageCodeCorrection(&modified_html_lang);
- TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang);
- VLOG(9) << "html lang based language code: " << modified_html_lang;
- }
-
- // Check if Content-Language is valid.
- std::string modified_code;
- if (!code.empty()) {
- modified_code = code;
- ApplyLanguageCodeCorrection(&modified_code);
- TranslateCommonMetrics::ReportContentLanguage(code, modified_code);
- }
-
- // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
- // |modified_code|.
- std::string language = modified_html_lang.empty() ? modified_code :
- modified_html_lang;
-
- // If |language| is empty, just use CLD result even though it might be
- // chrome::kUnknownLanguageCode.
- if (language.empty()) {
- TranslateCommonMetrics::ReportLanguageVerification(
- TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY);
- return cld_language;
- }
-
- if (cld_language == chrome::kUnknownLanguageCode) {
- TranslateCommonMetrics::ReportLanguageVerification(
- TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN);
- return language;
- } else if (CanCLDComplementSubCode(language, cld_language)) {
- TranslateCommonMetrics::ReportLanguageVerification(
- TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
- return cld_language;
- } else if (IsSameOrSimilarLanguages(language, cld_language)) {
- TranslateCommonMetrics::ReportLanguageVerification(
- TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE);
- return language;
- } else if (MaybeServerWrongConfiguration(language, cld_language)) {
- TranslateCommonMetrics::ReportLanguageVerification(
- TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD);
- return cld_language;
- } else {
- TranslateCommonMetrics::ReportLanguageVerification(
- TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE);
- // Content-Language value might be wrong because CLD says that this page
- // is written in another language with confidence.
- // In this case, Chrome doesn't rely on any of the language codes, and
- // gives up suggesting a translation.
- return std::string(chrome::kUnknownLanguageCode);
- }
-
- return language;
-}
-
-void CorrectLanguageCodeTypo(std::string* code) {
- DCHECK(code);
-
- size_t coma_index = code->find(',');
- if (coma_index != std::string::npos) {
- // More than one language is specified; just keep the first one.
- *code = code->substr(0, coma_index);
- }
- TrimWhitespaceASCII(*code, TRIM_ALL, code);
-
- // An underscore instead of a dash is a frequent mistake.
- size_t underscore_index = code->find('_');
- if (underscore_index != std::string::npos)
- (*code)[underscore_index] = '-';
-
- // Change everything up to a dash to lower-case and everything after to upper.
- size_t dash_index = code->find('-');
- if (dash_index != std::string::npos) {
- *code = StringToLowerASCII(code->substr(0, dash_index)) +
- StringToUpperASCII(code->substr(dash_index));
- } else {
- *code = StringToLowerASCII(*code);
- }
-}
-
-bool IsValidLanguageCode(const std::string& code) {
- // Roughly check if the language code follows /[a-zA-Z]{1,3}(-[a-zA-Z]{2})?/.
- // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
- std::vector<std::string> chunks;
- base::SplitString(code, '-', &chunks);
-
- if (chunks.size() < 1 || 2 < chunks.size())
- return false;
-
- const std::string& main_code = chunks[0];
-
- if (main_code.size() < 1 || 3 < main_code.size())
- return false;
-
- for (std::string::const_iterator it = main_code.begin();
- it != main_code.end(); ++it) {
- if (!IsAsciiAlpha(*it))
- return false;
- }
-
- if (chunks.size() == 1)
- return true;
-
- const std::string& sub_code = chunks[1];
-
- if (sub_code.size() != 2)
- return false;
-
- for (std::string::const_iterator it = sub_code.begin();
- it != sub_code.end(); ++it) {
- if (!IsAsciiAlpha(*it))
- return false;
- }
-
- return true;
-}
-
-bool IsSameOrSimilarLanguages(const std::string& page_language,
- const std::string& cld_language) {
- std::vector<std::string> chunks;
-
- base::SplitString(page_language, '-', &chunks);
- if (chunks.size() == 0)
- return false;
- std::string page_language_main_part = chunks[0];
-
- base::SplitString(cld_language, '-', &chunks);
- if (chunks.size() == 0)
- return false;
- std::string cld_language_main_part = chunks[0];
-
- // The language code part of |page_language| is compared with that of
- // |cld_language|. The country code is ignored here.
- if (page_language_main_part == cld_language_main_part) {
- // Languages are matched strictly. Reports false to metrics, but returns
- // true.
- TranslateCommonMetrics::ReportSimilarLanguageMatch(false);
- return true;
- }
-
- // Check if |page_language| and |cld_language| are in the similar language
- // list and belong to the same language group.
- int page_code = GetSimilarLanguageGroupCode(page_language);
- bool match = page_code != 0 &&
- page_code == GetSimilarLanguageGroupCode(cld_language);
-
- TranslateCommonMetrics::ReportSimilarLanguageMatch(match);
- return match;
-}
-
-bool MaybeServerWrongConfiguration(const std::string& page_language,
- const std::string& cld_language) {
- // If |page_language| is not "en-*", respect it and just return false here.
- if (!StartsWithASCII(page_language, "en", false))
- return false;
-
- // The server provided language meta information representing "en-*", but it
- // might be just a default value due to missing user configuration.
- // Let's trust |cld_language| if the determined language is not difficult to
- // distinguish from English, and the language is one of the well-known
- // languages whose pages often carry "en-*" meta information by mistake.
- for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
- if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
- return true;
- }
- return false;
-}
-
-std::string GetCLDVersion() {
- switch (GetCLDMajorVersion()) {
-#if !defined(CLD_VERSION) || CLD_VERSION==1
- case 1:
- return CompactLangDet::DetectLanguageVersion();
-#endif
-#if !defined(CLD_VERSION) || CLD_VERSION==2
- case 2:
- return CLD2::DetectLanguageVersion();
-#endif
- default:
- NOTREACHED();
- }
- return "";
-}
-
-} // namespace LanguageDetectionUtil
« no previous file with comments | « chrome/common/translate/language_detection_util.h ('k') | chrome/common/translate/language_detection_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698