OLD | NEW |
---|---|
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/common/translate/language_detection_util.h" | 5 #include "components/translate/language_detection/language_detection_util.h" |
6 | 6 |
7 #include "base/logging.h" | 7 #include "base/logging.h" |
8 #include "base/metrics/field_trial.h" | 8 #include "base/metrics/field_trial.h" |
9 #include "base/strings/string_split.h" | 9 #include "base/strings/string_split.h" |
10 #include "base/strings/string_util.h" | 10 #include "base/strings/string_util.h" |
11 #include "base/strings/utf_string_conversions.h" | 11 #include "base/strings/utf_string_conversions.h" |
12 #include "base/time/time.h" | 12 #include "base/time/time.h" |
13 #include "chrome/common/chrome_constants.h" | 13 #include "components/translate/common/translate_constants.h" |
14 #include "chrome/common/translate/translate_common_metrics.h" | 14 #include "components/translate/common/translate_metrics.h" |
15 #include "chrome/common/translate/translate_util.h" | 15 #include "components/translate/common/translate_util.h" |
16 | 16 |
17 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
20 #endif | 20 #endif |
21 | 21 |
22 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
23 #include "third_party/cld_2/src/public/compact_lang_det.h" | 23 #include "third_party/cld_2/src/public/compact_lang_det.h" |
24 #endif | 24 #endif |
25 | 25 |
(...skipping 19 matching lines...) Expand all Loading... | |
45 if (language.find(kSimilarLanguageCodes[i].code) != 0) | 45 if (language.find(kSimilarLanguageCodes[i].code) != 0) |
46 continue; | 46 continue; |
47 return kSimilarLanguageCodes[i].group; | 47 return kSimilarLanguageCodes[i].group; |
48 } | 48 } |
49 return 0; | 49 return 0; |
50 } | 50 } |
51 | 51 |
52 // Well-known languages which often have wrong server configuration of | 52 // Well-known languages which often have wrong server configuration of |
53 // Content-Language: en. | 53 // Content-Language: en. |
54 // TODO(toyoshim): Move these static tables and caller functions to | 54 // TODO(toyoshim): Move these static tables and caller functions to |
55 // chrome/common/translate, and implement them as std::set<>. | 55 // chrome/common/translate, and implement them as std::set<>. |
blundell
2013/10/02 10:01:17
Path reference is outdated.
droger
2013/10/02 10:59:41
Changed the path to translate/common, but I'm not
| |
56 const char* kWellKnownCodesOnWrongConfiguration[] = { | 56 const char* kWellKnownCodesOnWrongConfiguration[] = { |
57 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" | 57 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" |
58 }; | 58 }; |
59 | 59 |
60 // Applies a series of language code modifications in the proper order. | 60 // Applies a series of language code modifications in the proper order. |
61 void ApplyLanguageCodeCorrection(std::string* code) { | 61 void ApplyLanguageCodeCorrection(std::string* code) { |
62 // Correct well-known format errors. | 62 // Correct well-known format errors. |
63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); | 63 translate::CorrectLanguageCodeTypo(code); |
64 | 64 |
65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { | 65 if (!translate::IsValidLanguageCode(*code)) { |
66 *code = std::string(); | 66 *code = std::string(); |
67 return; | 67 return; |
68 } | 68 } |
69 | 69 |
70 TranslateUtil::ToTranslateLanguageSynonym(code); | 70 translate::ToTranslateLanguageSynonym(code); |
71 } | 71 } |
72 | 72 |
73 int GetCLDMajorVersion() { | 73 int GetCLDMajorVersion() { |
74 #if !defined(CLD_VERSION) | 74 #if !defined(CLD_VERSION) |
75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); | 75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); |
76 if (group_name == "CLD2") | 76 if (group_name == "CLD2") |
77 return 2; | 77 return 2; |
78 else | 78 else |
79 return 1; | 79 return 1; |
80 #else | 80 #else |
81 return CLD_VERSION; | 81 return CLD_VERSION; |
82 #endif | 82 #endif |
83 } | 83 } |
84 | 84 |
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
86 // failed. | 86 // failed. |
87 // |is_cld_reliable| will be set to true if CLD says the detection is reliable. | 87 // |is_cld_reliable| will be set to true if CLD says the detection is reliable. |
88 std::string DetermineTextLanguage(const base::string16& text, | 88 std::string DetermineTextLanguage(const base::string16& text, |
89 bool* is_cld_reliable) { | 89 bool* is_cld_reliable) { |
90 std::string language = chrome::kUnknownLanguageCode; | 90 std::string language = translate::kUnknownLanguageCode; |
91 int text_bytes = 0; | 91 int text_bytes = 0; |
92 bool is_reliable = false; | 92 bool is_reliable = false; |
93 | 93 |
94 // Language or CLD2::Language | 94 // Language or CLD2::Language |
95 int cld_language = 0; | 95 int cld_language = 0; |
96 bool is_valid_language = false; | 96 bool is_valid_language = false; |
97 | 97 |
98 switch (GetCLDMajorVersion()) { | 98 switch (GetCLDMajorVersion()) { |
99 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 99 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
100 case 1: { | 100 case 1: { |
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
175 const std::string& page_language, const std::string& cld_language) { | 175 const std::string& page_language, const std::string& cld_language) { |
176 // The Translate server cannot handle generic Chinese. If Content-Language and | 176 // The Translate server cannot handle generic Chinese. If Content-Language and |
177 // CLD agree that the language is Chinese and Content-Language doesn't know | 177 // CLD agree that the language is Chinese and Content-Language doesn't know |
178 // which dialect is used, CLD language has priority. | 178 // which dialect is used, CLD language has priority. |
179 // TODO(hajimehoshi): How about the other dialects like zh-MO? | 179 // TODO(hajimehoshi): How about the other dialects like zh-MO? |
180 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); | 180 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); |
181 } | 181 } |
182 | 182 |
183 } // namespace | 183 } // namespace |
184 | 184 |
185 namespace LanguageDetectionUtil { | 185 namespace translate { |
186 | 186 |
187 std::string DeterminePageLanguage(const std::string& code, | 187 std::string DeterminePageLanguage(const std::string& code, |
188 const std::string& html_lang, | 188 const std::string& html_lang, |
189 const base::string16& contents, | 189 const base::string16& contents, |
190 std::string* cld_language_p, | 190 std::string* cld_language_p, |
191 bool* is_cld_reliable_p) { | 191 bool* is_cld_reliable_p) { |
192 base::TimeTicks begin_time = base::TimeTicks::Now(); | 192 base::TimeTicks begin_time = base::TimeTicks::Now(); |
193 bool is_cld_reliable; | 193 bool is_cld_reliable; |
194 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); | 194 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); |
195 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, | 195 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); |
196 base::TimeTicks::Now()); | |
197 | 196 |
198 if (cld_language_p != NULL) | 197 if (cld_language_p != NULL) |
199 *cld_language_p = cld_language; | 198 *cld_language_p = cld_language; |
200 if (is_cld_reliable_p != NULL) | 199 if (is_cld_reliable_p != NULL) |
201 *is_cld_reliable_p = is_cld_reliable; | 200 *is_cld_reliable_p = is_cld_reliable; |
202 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); | 201 translate::ToTranslateLanguageSynonym(&cld_language); |
203 | 202 |
204 // Check if html lang attribute is valid. | 203 // Check if html lang attribute is valid. |
205 std::string modified_html_lang; | 204 std::string modified_html_lang; |
206 if (!html_lang.empty()) { | 205 if (!html_lang.empty()) { |
207 modified_html_lang = html_lang; | 206 modified_html_lang = html_lang; |
208 ApplyLanguageCodeCorrection(&modified_html_lang); | 207 ApplyLanguageCodeCorrection(&modified_html_lang); |
209 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); | 208 translate::ReportHtmlLang(html_lang, modified_html_lang); |
210 VLOG(9) << "html lang based language code: " << modified_html_lang; | 209 VLOG(9) << "html lang based language code: " << modified_html_lang; |
211 } | 210 } |
212 | 211 |
213 // Check if Content-Language is valid. | 212 // Check if Content-Language is valid. |
214 std::string modified_code; | 213 std::string modified_code; |
215 if (!code.empty()) { | 214 if (!code.empty()) { |
216 modified_code = code; | 215 modified_code = code; |
217 ApplyLanguageCodeCorrection(&modified_code); | 216 ApplyLanguageCodeCorrection(&modified_code); |
218 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); | 217 translate::ReportContentLanguage(code, modified_code); |
219 } | 218 } |
220 | 219 |
221 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt | 220 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt |
222 // |modified_code|. | 221 // |modified_code|. |
223 std::string language = modified_html_lang.empty() ? modified_code : | 222 std::string language = modified_html_lang.empty() ? modified_code : |
224 modified_html_lang; | 223 modified_html_lang; |
225 | 224 |
226 // If |language| is empty, just use CLD result even though it might be | 225 // If |language| is empty, just use CLD result even though it might be |
227 // chrome::kUnknownLanguageCode. | 226 // translate::kUnknownLanguageCode. |
228 if (language.empty()) { | 227 if (language.empty()) { |
229 TranslateCommonMetrics::ReportLanguageVerification( | 228 translate::ReportLanguageVerification( |
230 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); | 229 translate::LANGUAGE_VERIFICATION_CLD_ONLY); |
231 return cld_language; | 230 return cld_language; |
232 } | 231 } |
233 | 232 |
234 if (cld_language == chrome::kUnknownLanguageCode) { | 233 if (cld_language == kUnknownLanguageCode) { |
235 TranslateCommonMetrics::ReportLanguageVerification( | 234 translate::ReportLanguageVerification( |
236 TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN); | 235 translate::LANGUAGE_VERIFICATION_UNKNOWN); |
237 return language; | 236 return language; |
238 } else if (CanCLDComplementSubCode(language, cld_language)) { | 237 } else if (CanCLDComplementSubCode(language, cld_language)) { |
239 TranslateCommonMetrics::ReportLanguageVerification( | 238 translate::ReportLanguageVerification( |
240 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); | 239 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); |
241 return cld_language; | 240 return cld_language; |
242 } else if (IsSameOrSimilarLanguages(language, cld_language)) { | 241 } else if (IsSameOrSimilarLanguages(language, cld_language)) { |
243 TranslateCommonMetrics::ReportLanguageVerification( | 242 translate::ReportLanguageVerification( |
244 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE); | 243 translate::LANGUAGE_VERIFICATION_CLD_AGREE); |
245 return language; | 244 return language; |
246 } else if (MaybeServerWrongConfiguration(language, cld_language)) { | 245 } else if (MaybeServerWrongConfiguration(language, cld_language)) { |
247 TranslateCommonMetrics::ReportLanguageVerification( | 246 translate::ReportLanguageVerification( |
248 TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD); | 247 translate::LANGUAGE_VERIFICATION_TRUST_CLD); |
249 return cld_language; | 248 return cld_language; |
250 } else { | 249 } else { |
251 TranslateCommonMetrics::ReportLanguageVerification( | 250 translate::ReportLanguageVerification( |
252 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); | 251 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE); |
253 // Content-Language value might be wrong because CLD says that this page | 252 // Content-Language value might be wrong because CLD says that this page |
254 // is written in another language with confidence. | 253 // is written in another language with confidence. |
255 // In this case, Chrome doesn't rely on any of the language codes, and | 254 // In this case, Chrome doesn't rely on any of the language codes, and |
256 // gives up suggesting a translation. | 255 // gives up suggesting a translation. |
257 return std::string(chrome::kUnknownLanguageCode); | 256 return std::string(kUnknownLanguageCode); |
258 } | 257 } |
259 | 258 |
260 return language; | 259 return language; |
261 } | 260 } |
262 | 261 |
263 void CorrectLanguageCodeTypo(std::string* code) { | 262 void CorrectLanguageCodeTypo(std::string* code) { |
264 DCHECK(code); | 263 DCHECK(code); |
265 | 264 |
266 size_t coma_index = code->find(','); | 265 size_t coma_index = code->find(','); |
267 if (coma_index != std::string::npos) { | 266 if (coma_index != std::string::npos) { |
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
323 } | 322 } |
324 | 323 |
325 bool IsSameOrSimilarLanguages(const std::string& page_language, | 324 bool IsSameOrSimilarLanguages(const std::string& page_language, |
326 const std::string& cld_language) { | 325 const std::string& cld_language) { |
327 // The language code part of |page_language| is matched against that of |cld_language|. | 326 // The language code part of |page_language| is matched against that of |cld_language|. |
328 // Country code is ignored here. | 327 // Country code is ignored here. |
329 if (page_language.size() >= 2 && | 328 if (page_language.size() >= 2 && |
330 cld_language.find(page_language.c_str(), 0, 2) == 0) { | 329 cld_language.find(page_language.c_str(), 0, 2) == 0) { |
331 // Languages are matched strictly. Reports false to metrics, but returns | 330 // Languages are matched strictly. Reports false to metrics, but returns |
332 // true. | 331 // true. |
333 TranslateCommonMetrics::ReportSimilarLanguageMatch(false); | 332 translate::ReportSimilarLanguageMatch(false); |
334 return true; | 333 return true; |
335 } | 334 } |
336 | 335 |
337 // Check if |page_language| and |cld_language| are in the similar language | 336 // Check if |page_language| and |cld_language| are in the similar language |
338 // list and belong to the same language group. | 337 // list and belong to the same language group. |
339 int page_code = GetSimilarLanguageGroupCode(page_language); | 338 int page_code = GetSimilarLanguageGroupCode(page_language); |
340 bool match = page_code != 0 && | 339 bool match = page_code != 0 && |
341 page_code == GetSimilarLanguageGroupCode(cld_language); | 340 page_code == GetSimilarLanguageGroupCode(cld_language); |
342 | 341 |
343 TranslateCommonMetrics::ReportSimilarLanguageMatch(match); | 342 translate::ReportSimilarLanguageMatch(match); |
344 return match; | 343 return match; |
345 } | 344 } |
346 | 345 |
347 bool MaybeServerWrongConfiguration(const std::string& page_language, | 346 bool MaybeServerWrongConfiguration(const std::string& page_language, |
348 const std::string& cld_language) { | 347 const std::string& cld_language) { |
349 // If |page_language| is not "en-*", respect it and just return false here. | 348 // If |page_language| is not "en-*", respect it and just return false here. |
350 if (!StartsWithASCII(page_language, "en", false)) | 349 if (!StartsWithASCII(page_language, "en", false)) |
351 return false; | 350 return false; |
352 | 351 |
353 // A server provides language meta information representing "en-*". But it | 352 // A server provides language meta information representing "en-*". But it |
(...skipping 17 matching lines...) Expand all Loading... | |
371 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 370 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
372 case 2: | 371 case 2: |
373 return CLD2::DetectLanguageVersion(); | 372 return CLD2::DetectLanguageVersion(); |
374 #endif | 373 #endif |
375 default: | 374 default: |
376 NOTREACHED(); | 375 NOTREACHED(); |
377 } | 376 } |
378 return ""; | 377 return ""; |
379 } | 378 } |
380 | 379 |
381 } // namespace LanguageDetectionUtil | 380 } // namespace translate |
OLD | NEW |