| OLD | NEW |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "chrome/common/translate/language_detection_util.h" | 5 #include "chrome/common/translate/language_detection_util.h" |
| 6 | 6 |
| 7 #include "base/logging.h" | 7 #include "base/logging.h" |
| 8 #include "base/metrics/field_trial.h" | |
| 9 #include "base/strings/string_split.h" | 8 #include "base/strings/string_split.h" |
| 10 #include "base/strings/string_util.h" | 9 #include "base/strings/string_util.h" |
| 11 #include "base/strings/utf_string_conversions.h" | |
| 12 #include "base/time/time.h" | 10 #include "base/time/time.h" |
| 13 #include "chrome/common/chrome_constants.h" | 11 #include "chrome/common/chrome_constants.h" |
| 14 #include "chrome/common/translate/translate_common_metrics.h" | 12 #include "chrome/common/translate/translate_common_metrics.h" |
| 15 #include "chrome/common/translate/translate_util.h" | 13 #include "chrome/common/translate/translate_util.h" |
| 16 | |
| 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
| 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
| 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
| 20 #endif | |
| 21 | |
| 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
| 23 #include "third_party/cld_2/src/public/compact_lang_det.h" | |
| 24 #endif | |
| 25 | 16 |
| 26 namespace { | 17 namespace { |
| 27 | 18 |
| 28 // Similar language code list. Some languages are very similar and difficult | 19 // Similar language code list. Some languages are very similar and difficult |
| 29 // for CLD to distinguish. | 20 // for CLD to distinguish. |
| 30 struct SimilarLanguageCode { | 21 struct SimilarLanguageCode { |
| 31 const char* const code; | 22 const char* const code; |
| 32 int group; | 23 int group; |
| 33 }; | 24 }; |
| 34 | 25 |
| (...skipping 28 matching lines...) Expand all Loading... |
| 63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); | 54 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); |
| 64 | 55 |
| 65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { | 56 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { |
| 66 *code = std::string(); | 57 *code = std::string(); |
| 67 return; | 58 return; |
| 68 } | 59 } |
| 69 | 60 |
| 70 TranslateUtil::ToTranslateLanguageSynonym(code); | 61 TranslateUtil::ToTranslateLanguageSynonym(code); |
| 71 } | 62 } |
| 72 | 63 |
| 73 int GetCLDMajorVersion() { | |
| 74 #if !defined(CLD_VERSION) | |
| 75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); | |
| 76 if (group_name == "CLD2") | |
| 77 return 2; | |
| 78 else | |
| 79 return 1; | |
| 80 #else | |
| 81 return CLD_VERSION; | |
| 82 #endif | |
| 83 } | |
| 84 | |
| 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
| 86 // failed. | 65 // failed. |
| 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | 66 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
| 88 std::string DetermineTextLanguage(const base::string16& text, | 67 std::string DetermineTextLanguage(const base::string16& text, |
| 89 bool* is_cld_reliable) { | 68 bool* is_cld_reliable) { |
| 90 std::string language = chrome::kUnknownLanguageCode; | 69 std::string language = chrome::kUnknownLanguageCode; |
| 70 int num_languages = 0; |
| 91 int text_bytes = 0; | 71 int text_bytes = 0; |
| 92 bool is_reliable = false; | 72 bool is_reliable = false; |
| 93 | 73 Language cld_language = |
| 94 // Language or CLD2::Language | 74 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, |
| 95 int cld_language = 0; | 75 &num_languages, NULL, &text_bytes); |
| 96 bool is_valid_language = false; | |
| 97 | |
| 98 switch (GetCLDMajorVersion()) { | |
| 99 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
| 100 case 1: { | |
| 101 int num_languages = 0; | |
| 102 cld_language = | |
| 103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, | |
| 104 &num_languages, NULL, &text_bytes); | |
| 105 is_valid_language = cld_language != NUM_LANGUAGES && | |
| 106 cld_language != UNKNOWN_LANGUAGE && | |
| 107 cld_language != TG_UNKNOWN_LANGUAGE; | |
| 108 break; | |
| 109 } | |
| 110 #endif | |
| 111 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
| 112 case 2: { | |
| 113 std::string utf8_text(UTF16ToUTF8(text)); | |
| 114 CLD2::Language language3[3]; | |
| 115 int percent3[3]; | |
| 116 cld_language = | |
| 117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true, | |
| 118 language3, percent3, | |
| 119 &text_bytes, &is_reliable); | |
| 120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | |
| 121 cld_language != CLD2::UNKNOWN_LANGUAGE && | |
| 122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | |
| 123 break; | |
| 124 } | |
| 125 #endif | |
| 126 default: | |
| 127 NOTREACHED(); | |
| 128 } | |
| 129 | |
| 130 if (is_cld_reliable != NULL) | 76 if (is_cld_reliable != NULL) |
| 131 *is_cld_reliable = is_reliable; | 77 *is_cld_reliable = is_reliable; |
| 132 | 78 |
| 133 // We don't trust the result if the CLD reports that the detection is not | 79 // We don't trust the result if the CLD reports that the detection is not |
| 134 // reliable, or if the actual text used to detect the language was less than | 80 // reliable, or if the actual text used to detect the language was less than |
| 135 // 100 bytes (short texts can often lead to wrong results). | 81 // 100 bytes (short texts can often lead to wrong results). |
| 136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | 82 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that |
| 137 // the determined language code is correct with 50% confidence. Chrome should | 83 // the determined language code is correct with 50% confidence. Chrome should |
| 138 // handle the real confidence value to judge. | 84 // handle the real confidence value to judge. |
| 139 if (is_reliable && text_bytes >= 100 && is_valid_language) { | 85 if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES && |
| 86 cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { |
| 140 // We should not use LanguageCode_ISO_639_1 because it does not cover all | 87 // We should not use LanguageCode_ISO_639_1 because it does not cover all |
| 141 // the languages CLD can detect. As a result, it'll return the invalid | 88 // the languages CLD can detect. As a result, it'll return the invalid |
| 142 // language code for traditional Chinese among others. | 89 // language code for traditional Chinese among others. |
| 143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and | 90 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and |
| 144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 91 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
| 145 // for Simplified Chinese. | 92 // for Simplified Chinese. |
| 146 switch (GetCLDMajorVersion()) { | 93 language = LanguageCodeWithDialects(cld_language); |
| 147 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
| 148 case 1: | |
| 149 language = | |
| 150 LanguageCodeWithDialects(static_cast<Language>(cld_language)); | |
| 151 break; | |
| 152 #endif | |
| 153 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
| 154 case 2: | |
| 155 if (cld_language == CLD2::CHINESE) { | |
| 156 language = "zh-CN"; | |
| 157 } else { | |
| 158 language = | |
| 159 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); | |
| 160 } | |
| 161 break; | |
| 162 #endif | |
| 163 default: | |
| 164 NOTREACHED(); | |
| 165 } | |
| 166 } | 94 } |
| 167 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | 95 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text |
| 168 << "\n*************************************\n"; | 96 << "\n*************************************\n"; |
| 169 return language; | 97 return language; |
| 170 } | 98 } |
| 171 | 99 |
| 172 // Checks if CLD can complement a sub code when the page language doesn't know | 100 // Checks if CLD can complement a sub code when the page language doesn't know |
| 173 // the sub code. | 101 // the sub code. |
| 174 bool CanCLDComplementSubCode( | 102 bool CanCLDComplementSubCode( |
| 175 const std::string& page_language, const std::string& cld_language) { | 103 const std::string& page_language, const std::string& cld_language) { |
| (...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 356 // distinguish from English, and the language is one of well-known languages | 284 // distinguish from English, and the language is one of well-known languages |
| 357 // which often provide "en-*" meta information mistakenly. | 285 // which often provide "en-*" meta information mistakenly. |
| 358 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 286 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 359 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 287 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 360 return true; | 288 return true; |
| 361 } | 289 } |
| 362 return false; | 290 return false; |
| 363 } | 291 } |
| 364 | 292 |
| 365 std::string GetCLDVersion() { | 293 std::string GetCLDVersion() { |
| 366 switch (GetCLDMajorVersion()) { | 294 return CompactLangDet::DetectLanguageVersion(); |
| 367 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
| 368 case 1: | |
| 369 return CompactLangDet::DetectLanguageVersion(); | |
| 370 #endif | |
| 371 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
| 372 case 2: | |
| 373 return CLD2::DetectLanguageVersion(); | |
| 374 #endif | |
| 375 default: | |
| 376 NOTREACHED(); | |
| 377 } | |
| 378 return ""; | |
| 379 } | 295 } |
| 380 | 296 |
| 381 } // namespace LanguageDetectionUtil | 297 } // namespace LanguageDetectionUtil |
| OLD | NEW |