| OLD | NEW |
| (Empty) |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "chrome/common/translate/language_detection_util.h" | |
| 6 | |
| 7 #include "base/logging.h" | |
| 8 #include "base/metrics/field_trial.h" | |
| 9 #include "base/strings/string_split.h" | |
| 10 #include "base/strings/string_util.h" | |
| 11 #include "base/strings/utf_string_conversions.h" | |
| 12 #include "base/time/time.h" | |
| 13 #include "chrome/common/chrome_constants.h" | |
| 14 #include "chrome/common/translate/translate_common_metrics.h" | |
| 15 #include "chrome/common/translate/translate_util.h" | |
| 16 | |
| 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
| 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | |
| 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | |
| 20 #endif | |
| 21 | |
| 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
| 23 #include "third_party/cld_2/src/public/compact_lang_det.h" | |
| 24 #endif | |
| 25 | |
| 26 namespace { | |
| 27 | |
// Table of language codes that are hard for CLD to tell apart. Codes that
// share the same |group| value are considered similar to each other.
struct SimilarLanguageCode {
  // Language code of the entry, e.g. "bs".
  const char* const code;
  // Identifier of the similarity group. Zero is never used in the table
  // because GetSimilarLanguageGroupCode() returns 0 for "no group".
  int group;
};

const SimilarLanguageCode kSimilarLanguageCodes[] = {
  // Group 1.
  {"bs", 1},
  {"hr", 1},
  // Group 2.
  {"hi", 2},
  {"ne", 2},
};
| 41 | |
| 42 // Checks |kSimilarLanguageCodes| and returns group code. | |
| 43 int GetSimilarLanguageGroupCode(const std::string& language) { | |
| 44 for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) { | |
| 45 if (language.find(kSimilarLanguageCodes[i].code) != 0) | |
| 46 continue; | |
| 47 return kSimilarLanguageCodes[i].group; | |
| 48 } | |
| 49 return 0; | |
| 50 } | |
| 51 | |
// Well-known languages which often have wrong server configuration of
// Content-Language: en.
// Both the strings and the table entries are const, so the table cannot be
// modified at run time.
// TODO(toyoshim): Move these static tables and caller functions to
// chrome/common/translate, and implement them as std::set<>.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};
| 59 | |
| 60 // Applies a series of language code modification in proper order. | |
| 61 void ApplyLanguageCodeCorrection(std::string* code) { | |
| 62 // Correct well-known format errors. | |
| 63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); | |
| 64 | |
| 65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { | |
| 66 *code = std::string(); | |
| 67 return; | |
| 68 } | |
| 69 | |
| 70 TranslateUtil::ToTranslateLanguageSynonym(code); | |
| 71 } | |
| 72 | |
| 73 int GetCLDMajorVersion() { | |
| 74 #if !defined(CLD_VERSION) | |
| 75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); | |
| 76 if (group_name == "CLD2") | |
| 77 return 2; | |
| 78 else | |
| 79 return 1; | |
| 80 #else | |
| 81 return CLD_VERSION; | |
| 82 #endif | |
| 83 } | |
| 84 | |
| 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | |
| 86 // failed. | |
| 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | |
| 88 std::string DetermineTextLanguage(const base::string16& text, | |
| 89 bool* is_cld_reliable) { | |
| 90 std::string language = chrome::kUnknownLanguageCode; | |
| 91 int text_bytes = 0; | |
| 92 bool is_reliable = false; | |
| 93 | |
| 94 // Language or CLD2::Language | |
| 95 int cld_language = 0; | |
| 96 bool is_valid_language = false; | |
| 97 | |
| 98 switch (GetCLDMajorVersion()) { | |
| 99 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
| 100 case 1: { | |
| 101 int num_languages = 0; | |
| 102 cld_language = | |
| 103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, | |
| 104 &num_languages, NULL, &text_bytes); | |
| 105 is_valid_language = cld_language != NUM_LANGUAGES && | |
| 106 cld_language != UNKNOWN_LANGUAGE && | |
| 107 cld_language != TG_UNKNOWN_LANGUAGE; | |
| 108 break; | |
| 109 } | |
| 110 #endif | |
| 111 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
| 112 case 2: { | |
| 113 std::string utf8_text(UTF16ToUTF8(text)); | |
| 114 CLD2::Language language3[3]; | |
| 115 int percent3[3]; | |
| 116 cld_language = | |
| 117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true, | |
| 118 language3, percent3, | |
| 119 &text_bytes, &is_reliable); | |
| 120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | |
| 121 cld_language != CLD2::UNKNOWN_LANGUAGE && | |
| 122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | |
| 123 break; | |
| 124 } | |
| 125 #endif | |
| 126 default: | |
| 127 NOTREACHED(); | |
| 128 } | |
| 129 | |
| 130 if (is_cld_reliable != NULL) | |
| 131 *is_cld_reliable = is_reliable; | |
| 132 | |
| 133 // We don't trust the result if the CLD reports that the detection is not | |
| 134 // reliable, or if the actual text used to detect the language was less than | |
| 135 // 100 bytes (short texts can often lead to wrong results). | |
| 136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | |
| 137 // the determined language code is correct with 50% confidence. Chrome should | |
| 138 // handle the real confidence value to judge. | |
| 139 if (is_reliable && text_bytes >= 100 && is_valid_language) { | |
| 140 // We should not use LanguageCode_ISO_639_1 because it does not cover all | |
| 141 // the languages CLD can detect. As a result, it'll return the invalid | |
| 142 // language code for tradtional Chinese among others. | |
| 143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and | |
| 144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | |
| 145 // for Simplified Chinese. | |
| 146 switch (GetCLDMajorVersion()) { | |
| 147 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
| 148 case 1: | |
| 149 language = | |
| 150 LanguageCodeWithDialects(static_cast<Language>(cld_language)); | |
| 151 break; | |
| 152 #endif | |
| 153 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
| 154 case 2: | |
| 155 // (1) CLD2's LanguageCode returns general Chinese 'zh' for | |
| 156 // CLD2::CHINESE, but Translate server doesn't accept it. This is | |
| 157 // converted to 'zh-CN' in the same way as CLD1's | |
| 158 // LanguageCodeWithDialects. | |
| 159 // | |
| 160 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for | |
| 161 // CLD2::CHINESE_T. This is technically more precise for the language | |
| 162 // code of traditional Chinese, while Translate server hasn't accepted | |
| 163 // zh-Hant yet. | |
| 164 if (cld_language == CLD2::CHINESE) { | |
| 165 language = "zh-CN"; | |
| 166 } else if (cld_language == CLD2::CHINESE_T) { | |
| 167 language = "zh-TW"; | |
| 168 } else { | |
| 169 language = | |
| 170 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); | |
| 171 } | |
| 172 break; | |
| 173 #endif | |
| 174 default: | |
| 175 NOTREACHED(); | |
| 176 } | |
| 177 } | |
| 178 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | |
| 179 << "\n*************************************\n"; | |
| 180 return language; | |
| 181 } | |
| 182 | |
| 183 // Checks if CLD can complement a sub code when the page language doesn't know | |
| 184 // the sub code. | |
| 185 bool CanCLDComplementSubCode( | |
| 186 const std::string& page_language, const std::string& cld_language) { | |
| 187 // Translate server cannot treat general Chinese. If Content-Language and | |
| 188 // CLD agree that the language is Chinese and Content-Language doesn't know | |
| 189 // which dialect is used, CLD language has priority. | |
| 190 // TODO(hajimehoshi): How about the other dialects like zh-MO? | |
| 191 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); | |
| 192 } | |
| 193 | |
| 194 } // namespace | |
| 195 | |
| 196 namespace LanguageDetectionUtil { | |
| 197 | |
| 198 std::string DeterminePageLanguage(const std::string& code, | |
| 199 const std::string& html_lang, | |
| 200 const base::string16& contents, | |
| 201 std::string* cld_language_p, | |
| 202 bool* is_cld_reliable_p) { | |
| 203 base::TimeTicks begin_time = base::TimeTicks::Now(); | |
| 204 bool is_cld_reliable; | |
| 205 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); | |
| 206 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, | |
| 207 base::TimeTicks::Now()); | |
| 208 | |
| 209 if (cld_language_p != NULL) | |
| 210 *cld_language_p = cld_language; | |
| 211 if (is_cld_reliable_p != NULL) | |
| 212 *is_cld_reliable_p = is_cld_reliable; | |
| 213 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); | |
| 214 | |
| 215 // Check if html lang attribute is valid. | |
| 216 std::string modified_html_lang; | |
| 217 if (!html_lang.empty()) { | |
| 218 modified_html_lang = html_lang; | |
| 219 ApplyLanguageCodeCorrection(&modified_html_lang); | |
| 220 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); | |
| 221 VLOG(9) << "html lang based language code: " << modified_html_lang; | |
| 222 } | |
| 223 | |
| 224 // Check if Content-Language is valid. | |
| 225 std::string modified_code; | |
| 226 if (!code.empty()) { | |
| 227 modified_code = code; | |
| 228 ApplyLanguageCodeCorrection(&modified_code); | |
| 229 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); | |
| 230 } | |
| 231 | |
| 232 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt | |
| 233 // |modified_code|. | |
| 234 std::string language = modified_html_lang.empty() ? modified_code : | |
| 235 modified_html_lang; | |
| 236 | |
| 237 // If |language| is empty, just use CLD result even though it might be | |
| 238 // chrome::kUnknownLanguageCode. | |
| 239 if (language.empty()) { | |
| 240 TranslateCommonMetrics::ReportLanguageVerification( | |
| 241 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); | |
| 242 return cld_language; | |
| 243 } | |
| 244 | |
| 245 if (cld_language == chrome::kUnknownLanguageCode) { | |
| 246 TranslateCommonMetrics::ReportLanguageVerification( | |
| 247 TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN); | |
| 248 return language; | |
| 249 } else if (CanCLDComplementSubCode(language, cld_language)) { | |
| 250 TranslateCommonMetrics::ReportLanguageVerification( | |
| 251 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); | |
| 252 return cld_language; | |
| 253 } else if (IsSameOrSimilarLanguages(language, cld_language)) { | |
| 254 TranslateCommonMetrics::ReportLanguageVerification( | |
| 255 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE); | |
| 256 return language; | |
| 257 } else if (MaybeServerWrongConfiguration(language, cld_language)) { | |
| 258 TranslateCommonMetrics::ReportLanguageVerification( | |
| 259 TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD); | |
| 260 return cld_language; | |
| 261 } else { | |
| 262 TranslateCommonMetrics::ReportLanguageVerification( | |
| 263 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); | |
| 264 // Content-Language value might be wrong because CLD says that this page | |
| 265 // is written in another language with confidence. | |
| 266 // In this case, Chrome doesn't rely on any of the language codes, and | |
| 267 // gives up suggesting a translation. | |
| 268 return std::string(chrome::kUnknownLanguageCode); | |
| 269 } | |
| 270 | |
| 271 return language; | |
| 272 } | |
| 273 | |
| 274 void CorrectLanguageCodeTypo(std::string* code) { | |
| 275 DCHECK(code); | |
| 276 | |
| 277 size_t coma_index = code->find(','); | |
| 278 if (coma_index != std::string::npos) { | |
| 279 // There are more than 1 language specified, just keep the first one. | |
| 280 *code = code->substr(0, coma_index); | |
| 281 } | |
| 282 TrimWhitespaceASCII(*code, TRIM_ALL, code); | |
| 283 | |
| 284 // An underscore instead of a dash is a frequent mistake. | |
| 285 size_t underscore_index = code->find('_'); | |
| 286 if (underscore_index != std::string::npos) | |
| 287 (*code)[underscore_index] = '-'; | |
| 288 | |
| 289 // Change everything up to a dash to lower-case and everything after to upper. | |
| 290 size_t dash_index = code->find('-'); | |
| 291 if (dash_index != std::string::npos) { | |
| 292 *code = StringToLowerASCII(code->substr(0, dash_index)) + | |
| 293 StringToUpperASCII(code->substr(dash_index)); | |
| 294 } else { | |
| 295 *code = StringToLowerASCII(*code); | |
| 296 } | |
| 297 } | |
| 298 | |
| 299 bool IsValidLanguageCode(const std::string& code) { | |
| 300 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/. | |
| 301 // TODO(hajimehoshi): How about es-419, which is used as an Accept language? | |
| 302 std::vector<std::string> chunks; | |
| 303 base::SplitString(code, '-', &chunks); | |
| 304 | |
| 305 if (chunks.size() < 1 || 2 < chunks.size()) | |
| 306 return false; | |
| 307 | |
| 308 const std::string& main_code = chunks[0]; | |
| 309 | |
| 310 if (main_code.size() < 1 || 3 < main_code.size()) | |
| 311 return false; | |
| 312 | |
| 313 for (std::string::const_iterator it = main_code.begin(); | |
| 314 it != main_code.end(); ++it) { | |
| 315 if (!IsAsciiAlpha(*it)) | |
| 316 return false; | |
| 317 } | |
| 318 | |
| 319 if (chunks.size() == 1) | |
| 320 return true; | |
| 321 | |
| 322 const std::string& sub_code = chunks[1]; | |
| 323 | |
| 324 if (sub_code.size() != 2) | |
| 325 return false; | |
| 326 | |
| 327 for (std::string::const_iterator it = sub_code.begin(); | |
| 328 it != sub_code.end(); ++it) { | |
| 329 if (!IsAsciiAlpha(*it)) | |
| 330 return false; | |
| 331 } | |
| 332 | |
| 333 return true; | |
| 334 } | |
| 335 | |
| 336 bool IsSameOrSimilarLanguages(const std::string& page_language, | |
| 337 const std::string& cld_language) { | |
| 338 std::vector<std::string> chunks; | |
| 339 | |
| 340 base::SplitString(page_language, '-', &chunks); | |
| 341 if (chunks.size() == 0) | |
| 342 return false; | |
| 343 std::string page_language_main_part = chunks[0]; | |
| 344 | |
| 345 base::SplitString(cld_language, '-', &chunks); | |
| 346 if (chunks.size() == 0) | |
| 347 return false; | |
| 348 std::string cld_language_main_part = chunks[0]; | |
| 349 | |
| 350 // Language code part of |page_language| is matched to one of |cld_language|. | |
| 351 // Country code is ignored here. | |
| 352 if (page_language_main_part == cld_language_main_part) { | |
| 353 // Languages are matched strictly. Reports false to metrics, but returns | |
| 354 // true. | |
| 355 TranslateCommonMetrics::ReportSimilarLanguageMatch(false); | |
| 356 return true; | |
| 357 } | |
| 358 | |
| 359 // Check if |page_language| and |cld_language| are in the similar language | |
| 360 // list and belong to the same language group. | |
| 361 int page_code = GetSimilarLanguageGroupCode(page_language); | |
| 362 bool match = page_code != 0 && | |
| 363 page_code == GetSimilarLanguageGroupCode(cld_language); | |
| 364 | |
| 365 TranslateCommonMetrics::ReportSimilarLanguageMatch(match); | |
| 366 return match; | |
| 367 } | |
| 368 | |
| 369 bool MaybeServerWrongConfiguration(const std::string& page_language, | |
| 370 const std::string& cld_language) { | |
| 371 // If |page_language| is not "en-*", respect it and just return false here. | |
| 372 if (!StartsWithASCII(page_language, "en", false)) | |
| 373 return false; | |
| 374 | |
| 375 // A server provides a language meta information representing "en-*". But it | |
| 376 // might be just a default value due to missing user configuration. | |
| 377 // Let's trust |cld_language| if the determined language is not difficult to | |
| 378 // distinguish from English, and the language is one of well-known languages | |
| 379 // which often provide "en-*" meta information mistakenly. | |
| 380 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | |
| 381 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | |
| 382 return true; | |
| 383 } | |
| 384 return false; | |
| 385 } | |
| 386 | |
| 387 std::string GetCLDVersion() { | |
| 388 switch (GetCLDMajorVersion()) { | |
| 389 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
| 390 case 1: | |
| 391 return CompactLangDet::DetectLanguageVersion(); | |
| 392 #endif | |
| 393 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
| 394 case 2: | |
| 395 return CLD2::DetectLanguageVersion(); | |
| 396 #endif | |
| 397 default: | |
| 398 NOTREACHED(); | |
| 399 } | |
| 400 return ""; | |
| 401 } | |
| 402 | |
| 403 } // namespace LanguageDetectionUtil | |
| OLD | NEW |