OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/common/translate/language_detection_util.h" | 5 #include "components/translate/language_detection/language_detection_util.h" |
6 | 6 |
7 #include "base/logging.h" | 7 #include "base/logging.h" |
8 #include "base/metrics/field_trial.h" | 8 #include "base/metrics/field_trial.h" |
9 #include "base/strings/string_split.h" | 9 #include "base/strings/string_split.h" |
10 #include "base/strings/string_util.h" | 10 #include "base/strings/string_util.h" |
11 #include "base/strings/utf_string_conversions.h" | 11 #include "base/strings/utf_string_conversions.h" |
12 #include "base/time/time.h" | 12 #include "base/time/time.h" |
13 #include "chrome/common/chrome_constants.h" | 13 #include "components/translate/common/translate_constants.h" |
14 #include "chrome/common/translate/translate_common_metrics.h" | 14 #include "components/translate/common/translate_metrics.h" |
15 #include "chrome/common/translate/translate_util.h" | 15 #include "components/translate/common/translate_util.h" |
16 | 16 |
17 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
20 #endif | 20 #endif |
21 | 21 |
22 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
23 #include "third_party/cld_2/src/public/compact_lang_det.h" | 23 #include "third_party/cld_2/src/public/compact_lang_det.h" |
24 #endif | 24 #endif |
25 | 25 |
(...skipping 19 matching lines...) Expand all Loading... |
45 if (language.find(kSimilarLanguageCodes[i].code) != 0) | 45 if (language.find(kSimilarLanguageCodes[i].code) != 0) |
46 continue; | 46 continue; |
47 return kSimilarLanguageCodes[i].group; | 47 return kSimilarLanguageCodes[i].group; |
48 } | 48 } |
49 return 0; | 49 return 0; |
50 } | 50 } |
51 | 51 |
52 // Well-known languages which often have wrong server configuration of | 52 // Well-known languages which often have wrong server configuration of |
53 // Content-Language: en. | 53 // Content-Language: en. |
54 // TODO(toyoshim): Remove these static tables and caller functions to | 54 // TODO(toyoshim): Remove these static tables and caller functions to |
55 // chrome/common/translate, and implement them as std::set<>. | 55 // translate/common, and implement them as std::set<>. |
56 const char* kWellKnownCodesOnWrongConfiguration[] = { | 56 const char* kWellKnownCodesOnWrongConfiguration[] = { |
57 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" | 57 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" |
58 }; | 58 }; |
59 | 59 |
60 // Applies a series of language code modification in proper order. | 60 // Applies a series of language code modification in proper order. |
61 void ApplyLanguageCodeCorrection(std::string* code) { | 61 void ApplyLanguageCodeCorrection(std::string* code) { |
62 // Correct well-known format errors. | 62 // Correct well-known format errors. |
63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); | 63 translate::CorrectLanguageCodeTypo(code); |
64 | 64 |
65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { | 65 if (!translate::IsValidLanguageCode(*code)) { |
66 *code = std::string(); | 66 *code = std::string(); |
67 return; | 67 return; |
68 } | 68 } |
69 | 69 |
70 TranslateUtil::ToTranslateLanguageSynonym(code); | 70 translate::ToTranslateLanguageSynonym(code); |
71 } | 71 } |
72 | 72 |
73 int GetCLDMajorVersion() { | 73 int GetCLDMajorVersion() { |
74 #if !defined(CLD_VERSION) | 74 #if !defined(CLD_VERSION) |
75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); | 75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); |
76 if (group_name == "CLD2") | 76 if (group_name == "CLD2") |
77 return 2; | 77 return 2; |
78 else | 78 else |
79 return 1; | 79 return 1; |
80 #else | 80 #else |
81 return CLD_VERSION; | 81 return CLD_VERSION; |
82 #endif | 82 #endif |
83 } | 83 } |
84 | 84 |
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
86 // failed. | 86 // failed. |
87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
88 std::string DetermineTextLanguage(const base::string16& text, | 88 std::string DetermineTextLanguage(const base::string16& text, |
89 bool* is_cld_reliable) { | 89 bool* is_cld_reliable) { |
90 std::string language = chrome::kUnknownLanguageCode; | 90 std::string language = translate::kUnknownLanguageCode; |
91 int text_bytes = 0; | 91 int text_bytes = 0; |
92 bool is_reliable = false; | 92 bool is_reliable = false; |
93 | 93 |
94 // Language or CLD2::Language | 94 // Language or CLD2::Language |
95 int cld_language = 0; | 95 int cld_language = 0; |
96 bool is_valid_language = false; | 96 bool is_valid_language = false; |
97 | 97 |
98 switch (GetCLDMajorVersion()) { | 98 switch (GetCLDMajorVersion()) { |
99 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 99 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
100 case 1: { | 100 case 1: { |
101 int num_languages = 0; | 101 int num_languages = 0; |
102 cld_language = | 102 cld_language = |
103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, | 103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, |
104 &num_languages, NULL, &text_bytes); | 104 &num_languages, NULL, &text_bytes); |
105 is_valid_language = cld_language != NUM_LANGUAGES && | 105 is_valid_language = cld_language != NUM_LANGUAGES && |
106 cld_language != UNKNOWN_LANGUAGE && | 106 cld_language != UNKNOWN_LANGUAGE && |
107 cld_language != TG_UNKNOWN_LANGUAGE; | 107 cld_language != TG_UNKNOWN_LANGUAGE; |
108 break; | 108 break; |
109 } | 109 } |
110 #endif | 110 #endif |
111 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 111 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
112 case 2: { | 112 case 2: { |
113 std::string utf8_text(UTF16ToUTF8(text)); | 113 std::string utf8_text(UTF16ToUTF8(text)); |
114 CLD2::Language language3[3]; | 114 CLD2::Language language3[3]; |
115 int percent3[3]; | 115 int percent3[3]; |
116 cld_language = | 116 cld_language = CLD2::DetectLanguageSummary( |
117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true, | 117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3, |
118 language3, percent3, | 118 &text_bytes, &is_reliable); |
119 &text_bytes, &is_reliable); | |
120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | 119 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
121 cld_language != CLD2::UNKNOWN_LANGUAGE && | 120 cld_language != CLD2::UNKNOWN_LANGUAGE && |
122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | 121 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
123 break; | 122 break; |
124 } | 123 } |
125 #endif | 124 #endif |
126 default: | 125 default: |
127 NOTREACHED(); | 126 NOTREACHED(); |
128 } | 127 } |
129 | 128 |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
186 const std::string& page_language, const std::string& cld_language) { | 185 const std::string& page_language, const std::string& cld_language) { |
187 // Translate server cannot treat general Chinese. If Content-Language and | 186 // Translate server cannot treat general Chinese. If Content-Language and |
188 // CLD agree that the language is Chinese and Content-Language doesn't know | 187 // CLD agree that the language is Chinese and Content-Language doesn't know |
189 // which dialect is used, CLD language has priority. | 188 // which dialect is used, CLD language has priority. |
190 // TODO(hajimehoshi): How about the other dialects like zh-MO? | 189 // TODO(hajimehoshi): How about the other dialects like zh-MO? |
191 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); | 190 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); |
192 } | 191 } |
193 | 192 |
194 } // namespace | 193 } // namespace |
195 | 194 |
196 namespace LanguageDetectionUtil { | 195 namespace translate { |
197 | 196 |
198 std::string DeterminePageLanguage(const std::string& code, | 197 std::string DeterminePageLanguage(const std::string& code, |
199 const std::string& html_lang, | 198 const std::string& html_lang, |
200 const base::string16& contents, | 199 const base::string16& contents, |
201 std::string* cld_language_p, | 200 std::string* cld_language_p, |
202 bool* is_cld_reliable_p) { | 201 bool* is_cld_reliable_p) { |
203 base::TimeTicks begin_time = base::TimeTicks::Now(); | 202 base::TimeTicks begin_time = base::TimeTicks::Now(); |
204 bool is_cld_reliable; | 203 bool is_cld_reliable; |
205 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); | 204 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); |
206 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, | 205 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); |
207 base::TimeTicks::Now()); | |
208 | 206 |
209 if (cld_language_p != NULL) | 207 if (cld_language_p != NULL) |
210 *cld_language_p = cld_language; | 208 *cld_language_p = cld_language; |
211 if (is_cld_reliable_p != NULL) | 209 if (is_cld_reliable_p != NULL) |
212 *is_cld_reliable_p = is_cld_reliable; | 210 *is_cld_reliable_p = is_cld_reliable; |
213 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); | 211 translate::ToTranslateLanguageSynonym(&cld_language); |
214 | 212 |
215 // Check if html lang attribute is valid. | 213 // Check if html lang attribute is valid. |
216 std::string modified_html_lang; | 214 std::string modified_html_lang; |
217 if (!html_lang.empty()) { | 215 if (!html_lang.empty()) { |
218 modified_html_lang = html_lang; | 216 modified_html_lang = html_lang; |
219 ApplyLanguageCodeCorrection(&modified_html_lang); | 217 ApplyLanguageCodeCorrection(&modified_html_lang); |
220 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); | 218 translate::ReportHtmlLang(html_lang, modified_html_lang); |
221 VLOG(9) << "html lang based language code: " << modified_html_lang; | 219 VLOG(9) << "html lang based language code: " << modified_html_lang; |
222 } | 220 } |
223 | 221 |
224 // Check if Content-Language is valid. | 222 // Check if Content-Language is valid. |
225 std::string modified_code; | 223 std::string modified_code; |
226 if (!code.empty()) { | 224 if (!code.empty()) { |
227 modified_code = code; | 225 modified_code = code; |
228 ApplyLanguageCodeCorrection(&modified_code); | 226 ApplyLanguageCodeCorrection(&modified_code); |
229 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); | 227 translate::ReportContentLanguage(code, modified_code); |
230 } | 228 } |
231 | 229 |
232 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt | 230 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt |
233 // |modified_code|. | 231 // |modified_code|. |
234 std::string language = modified_html_lang.empty() ? modified_code : | 232 std::string language = modified_html_lang.empty() ? modified_code : |
235 modified_html_lang; | 233 modified_html_lang; |
236 | 234 |
237 // If |language| is empty, just use CLD result even though it might be | 235 // If |language| is empty, just use CLD result even though it might be |
238 // chrome::kUnknownLanguageCode. | 236 // translate::kUnknownLanguageCode. |
239 if (language.empty()) { | 237 if (language.empty()) { |
240 TranslateCommonMetrics::ReportLanguageVerification( | 238 translate::ReportLanguageVerification( |
241 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); | 239 translate::LANGUAGE_VERIFICATION_CLD_ONLY); |
242 return cld_language; | 240 return cld_language; |
243 } | 241 } |
244 | 242 |
245 if (cld_language == chrome::kUnknownLanguageCode) { | 243 if (cld_language == kUnknownLanguageCode) { |
246 TranslateCommonMetrics::ReportLanguageVerification( | 244 translate::ReportLanguageVerification( |
247 TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN); | 245 translate::LANGUAGE_VERIFICATION_UNKNOWN); |
248 return language; | 246 return language; |
249 } else if (CanCLDComplementSubCode(language, cld_language)) { | 247 } else if (CanCLDComplementSubCode(language, cld_language)) { |
250 TranslateCommonMetrics::ReportLanguageVerification( | 248 translate::ReportLanguageVerification( |
251 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); | 249 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); |
252 return cld_language; | 250 return cld_language; |
253 } else if (IsSameOrSimilarLanguages(language, cld_language)) { | 251 } else if (IsSameOrSimilarLanguages(language, cld_language)) { |
254 TranslateCommonMetrics::ReportLanguageVerification( | 252 translate::ReportLanguageVerification( |
255 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE); | 253 translate::LANGUAGE_VERIFICATION_CLD_AGREE); |
256 return language; | 254 return language; |
257 } else if (MaybeServerWrongConfiguration(language, cld_language)) { | 255 } else if (MaybeServerWrongConfiguration(language, cld_language)) { |
258 TranslateCommonMetrics::ReportLanguageVerification( | 256 translate::ReportLanguageVerification( |
259 TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD); | 257 translate::LANGUAGE_VERIFICATION_TRUST_CLD); |
260 return cld_language; | 258 return cld_language; |
261 } else { | 259 } else { |
262 TranslateCommonMetrics::ReportLanguageVerification( | 260 translate::ReportLanguageVerification( |
263 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); | 261 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE); |
264 // Content-Language value might be wrong because CLD says that this page | 262 // Content-Language value might be wrong because CLD says that this page |
265 // is written in another language with confidence. | 263 // is written in another language with confidence. |
266 // In this case, Chrome doesn't rely on any of the language codes, and | 264 // In this case, Chrome doesn't rely on any of the language codes, and |
267 // gives up suggesting a translation. | 265 // gives up suggesting a translation. |
268 return std::string(chrome::kUnknownLanguageCode); | 266 return std::string(kUnknownLanguageCode); |
269 } | 267 } |
270 | 268 |
271 return language; | 269 return language; |
272 } | 270 } |
273 | 271 |
274 void CorrectLanguageCodeTypo(std::string* code) { | 272 void CorrectLanguageCodeTypo(std::string* code) { |
275 DCHECK(code); | 273 DCHECK(code); |
276 | 274 |
277 size_t coma_index = code->find(','); | 275 size_t coma_index = code->find(','); |
278 if (coma_index != std::string::npos) { | 276 if (coma_index != std::string::npos) { |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
345 base::SplitString(cld_language, '-', &chunks); | 343 base::SplitString(cld_language, '-', &chunks); |
346 if (chunks.size() == 0) | 344 if (chunks.size() == 0) |
347 return false; | 345 return false; |
348 std::string cld_language_main_part = chunks[0]; | 346 std::string cld_language_main_part = chunks[0]; |
349 | 347 |
350 // Language code part of |page_language| is matched to one of |cld_language|. | 348 // Language code part of |page_language| is matched to one of |cld_language|. |
351 // Country code is ignored here. | 349 // Country code is ignored here. |
352 if (page_language_main_part == cld_language_main_part) { | 350 if (page_language_main_part == cld_language_main_part) { |
353 // Languages are matched strictly. Reports false to metrics, but returns | 351 // Languages are matched strictly. Reports false to metrics, but returns |
354 // true. | 352 // true. |
355 TranslateCommonMetrics::ReportSimilarLanguageMatch(false); | 353 translate::ReportSimilarLanguageMatch(false); |
356 return true; | 354 return true; |
357 } | 355 } |
358 | 356 |
359 // Check if |page_language| and |cld_language| are in the similar language | 357 // Check if |page_language| and |cld_language| are in the similar language |
360 // list and belong to the same language group. | 358 // list and belong to the same language group. |
361 int page_code = GetSimilarLanguageGroupCode(page_language); | 359 int page_code = GetSimilarLanguageGroupCode(page_language); |
362 bool match = page_code != 0 && | 360 bool match = page_code != 0 && |
363 page_code == GetSimilarLanguageGroupCode(cld_language); | 361 page_code == GetSimilarLanguageGroupCode(cld_language); |
364 | 362 |
365 TranslateCommonMetrics::ReportSimilarLanguageMatch(match); | 363 translate::ReportSimilarLanguageMatch(match); |
366 return match; | 364 return match; |
367 } | 365 } |
368 | 366 |
369 bool MaybeServerWrongConfiguration(const std::string& page_language, | 367 bool MaybeServerWrongConfiguration(const std::string& page_language, |
370 const std::string& cld_language) { | 368 const std::string& cld_language) { |
371 // If |page_language| is not "en-*", respect it and just return false here. | 369 // If |page_language| is not "en-*", respect it and just return false here. |
372 if (!StartsWithASCII(page_language, "en", false)) | 370 if (!StartsWithASCII(page_language, "en", false)) |
373 return false; | 371 return false; |
374 | 372 |
375 // A server provides a language meta information representing "en-*". But it | 373 // A server provides a language meta information representing "en-*". But it |
(...skipping 17 matching lines...) Expand all Loading... |
393 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 391 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
394 case 2: | 392 case 2: |
395 return CLD2::DetectLanguageVersion(); | 393 return CLD2::DetectLanguageVersion(); |
396 #endif | 394 #endif |
397 default: | 395 default: |
398 NOTREACHED(); | 396 NOTREACHED(); |
399 } | 397 } |
400 return ""; | 398 return ""; |
401 } | 399 } |
402 | 400 |
403 } // namespace LanguageDetectionUtil | 401 } // namespace translate |
OLD | NEW |