Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(419)

Side by Side Diff: components/translate/language_detection/language_detection_util.cc

Issue 25531002: Move language detection to a component (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Run translate unittests on iOS Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/common/translate/language_detection_util.h" 5 #include "components/translate/language_detection/language_detection_util.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h" 8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_split.h" 9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h" 10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h" 11 #include "base/strings/utf_string_conversions.h"
12 #include "base/time/time.h" 12 #include "base/time/time.h"
13 #include "chrome/common/chrome_constants.h" 13 #include "components/translate/common/translate_constants.h"
14 #include "chrome/common/translate/translate_common_metrics.h" 14 #include "components/translate/common/translate_metrics.h"
15 #include "chrome/common/translate/translate_util.h" 15 #include "components/translate/common/translate_util.h"
16 16
17 #if !defined(CLD_VERSION) || CLD_VERSION==1 17 #if !defined(CLD_VERSION) || CLD_VERSION==1
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
20 #endif 20 #endif
21 21
22 #if !defined(CLD_VERSION) || CLD_VERSION==2 22 #if !defined(CLD_VERSION) || CLD_VERSION==2
23 #include "third_party/cld_2/src/public/compact_lang_det.h" 23 #include "third_party/cld_2/src/public/compact_lang_det.h"
24 #endif 24 #endif
25 25
(...skipping 19 matching lines...) Expand all
45 if (language.find(kSimilarLanguageCodes[i].code) != 0) 45 if (language.find(kSimilarLanguageCodes[i].code) != 0)
46 continue; 46 continue;
47 return kSimilarLanguageCodes[i].group; 47 return kSimilarLanguageCodes[i].group;
48 } 48 }
49 return 0; 49 return 0;
50 } 50 }
51 51
52 // Well-known languages which often have wrong server configuration of 52 // Well-known languages which often have wrong server configuration of
53 // Content-Language: en. 53 // Content-Language: en.
54 // TODO(toyoshim): Move these static tables and caller functions to 54 // TODO(toyoshim): Move these static tables and caller functions to
55 // chrome/common/translate, and implement them as std::set<>. 55 // translate/common, and implement them as std::set<>.
56 const char* kWellKnownCodesOnWrongConfiguration[] = { 56 const char* kWellKnownCodesOnWrongConfiguration[] = {
57 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" 57 "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
58 }; 58 };
59 59
60 // Applies a series of language code modifications in proper order. 60 // Applies a series of language code modifications in proper order.
61 void ApplyLanguageCodeCorrection(std::string* code) { 61 void ApplyLanguageCodeCorrection(std::string* code) {
62 // Correct well-known format errors. 62 // Correct well-known format errors.
63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); 63 translate::CorrectLanguageCodeTypo(code);
64 64
65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { 65 if (!translate::IsValidLanguageCode(*code)) {
66 *code = std::string(); 66 *code = std::string();
67 return; 67 return;
68 } 68 }
69 69
70 TranslateUtil::ToTranslateLanguageSynonym(code); 70 translate::ToTranslateLanguageSynonym(code);
71 } 71 }
72 72
73 int GetCLDMajorVersion() { 73 int GetCLDMajorVersion() {
74 #if !defined(CLD_VERSION) 74 #if !defined(CLD_VERSION)
75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); 75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");
76 if (group_name == "CLD2") 76 if (group_name == "CLD2")
77 return 2; 77 return 2;
78 else 78 else
79 return 1; 79 return 1;
80 #else 80 #else
81 return CLD_VERSION; 81 return CLD_VERSION;
82 #endif 82 #endif
83 } 83 }
84 84
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
86 // failed. 86 // failed.
87 // |is_cld_reliable| is set to true if CLD says the detection is reliable. 87 // |is_cld_reliable| is set to true if CLD says the detection is reliable.
88 std::string DetermineTextLanguage(const base::string16& text, 88 std::string DetermineTextLanguage(const base::string16& text,
89 bool* is_cld_reliable) { 89 bool* is_cld_reliable) {
90 std::string language = chrome::kUnknownLanguageCode; 90 std::string language = translate::kUnknownLanguageCode;
91 int text_bytes = 0; 91 int text_bytes = 0;
92 bool is_reliable = false; 92 bool is_reliable = false;
93 93
94 // Language or CLD2::Language 94 // Language or CLD2::Language
95 int cld_language = 0; 95 int cld_language = 0;
96 bool is_valid_language = false; 96 bool is_valid_language = false;
97 97
98 switch (GetCLDMajorVersion()) { 98 switch (GetCLDMajorVersion()) {
99 #if !defined(CLD_VERSION) || CLD_VERSION==1 99 #if !defined(CLD_VERSION) || CLD_VERSION==1
100 case 1: { 100 case 1: {
101 int num_languages = 0; 101 int num_languages = 0;
102 cld_language = 102 cld_language =
103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, 103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
104 &num_languages, NULL, &text_bytes); 104 &num_languages, NULL, &text_bytes);
105 is_valid_language = cld_language != NUM_LANGUAGES && 105 is_valid_language = cld_language != NUM_LANGUAGES &&
106 cld_language != UNKNOWN_LANGUAGE && 106 cld_language != UNKNOWN_LANGUAGE &&
107 cld_language != TG_UNKNOWN_LANGUAGE; 107 cld_language != TG_UNKNOWN_LANGUAGE;
108 break; 108 break;
109 } 109 }
110 #endif 110 #endif
111 #if !defined(CLD_VERSION) || CLD_VERSION==2 111 #if !defined(CLD_VERSION) || CLD_VERSION==2
112 case 2: { 112 case 2: {
113 std::string utf8_text(UTF16ToUTF8(text)); 113 std::string utf8_text(UTF16ToUTF8(text));
114 CLD2::Language language3[3]; 114 CLD2::Language language3[3];
115 int percent3[3]; 115 int percent3[3];
116 cld_language = 116 cld_language = CLD2::DetectLanguageSummary(
117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true, 117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3,
118 language3, percent3, 118 &text_bytes, &is_reliable);
119 &text_bytes, &is_reliable);
120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && 119 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
121 cld_language != CLD2::UNKNOWN_LANGUAGE && 120 cld_language != CLD2::UNKNOWN_LANGUAGE &&
122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 121 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
123 break; 122 break;
124 } 123 }
125 #endif 124 #endif
126 default: 125 default:
127 NOTREACHED(); 126 NOTREACHED();
128 } 127 }
129 128
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
186 const std::string& page_language, const std::string& cld_language) { 185 const std::string& page_language, const std::string& cld_language) {
187 // Translate server cannot treat general Chinese. If Content-Language and 186 // Translate server cannot treat general Chinese. If Content-Language and
188 // CLD agree that the language is Chinese and Content-Language doesn't know 187 // CLD agree that the language is Chinese and Content-Language doesn't know
189 // which dialect is used, CLD language has priority. 188 // which dialect is used, CLD language has priority.
190 // TODO(hajimehoshi): How about the other dialects like zh-MO? 189 // TODO(hajimehoshi): How about the other dialects like zh-MO?
191 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); 190 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
192 } 191 }
193 192
194 } // namespace 193 } // namespace
195 194
196 namespace LanguageDetectionUtil { 195 namespace translate {
197 196
198 std::string DeterminePageLanguage(const std::string& code, 197 std::string DeterminePageLanguage(const std::string& code,
199 const std::string& html_lang, 198 const std::string& html_lang,
200 const base::string16& contents, 199 const base::string16& contents,
201 std::string* cld_language_p, 200 std::string* cld_language_p,
202 bool* is_cld_reliable_p) { 201 bool* is_cld_reliable_p) {
203 base::TimeTicks begin_time = base::TimeTicks::Now(); 202 base::TimeTicks begin_time = base::TimeTicks::Now();
204 bool is_cld_reliable; 203 bool is_cld_reliable;
205 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); 204 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
206 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, 205 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
207 base::TimeTicks::Now());
208 206
209 if (cld_language_p != NULL) 207 if (cld_language_p != NULL)
210 *cld_language_p = cld_language; 208 *cld_language_p = cld_language;
211 if (is_cld_reliable_p != NULL) 209 if (is_cld_reliable_p != NULL)
212 *is_cld_reliable_p = is_cld_reliable; 210 *is_cld_reliable_p = is_cld_reliable;
213 TranslateUtil::ToTranslateLanguageSynonym(&cld_language); 211 translate::ToTranslateLanguageSynonym(&cld_language);
214 212
215 // Check if html lang attribute is valid. 213 // Check if html lang attribute is valid.
216 std::string modified_html_lang; 214 std::string modified_html_lang;
217 if (!html_lang.empty()) { 215 if (!html_lang.empty()) {
218 modified_html_lang = html_lang; 216 modified_html_lang = html_lang;
219 ApplyLanguageCodeCorrection(&modified_html_lang); 217 ApplyLanguageCodeCorrection(&modified_html_lang);
220 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); 218 translate::ReportHtmlLang(html_lang, modified_html_lang);
221 VLOG(9) << "html lang based language code: " << modified_html_lang; 219 VLOG(9) << "html lang based language code: " << modified_html_lang;
222 } 220 }
223 221
224 // Check if Content-Language is valid. 222 // Check if Content-Language is valid.
225 std::string modified_code; 223 std::string modified_code;
226 if (!code.empty()) { 224 if (!code.empty()) {
227 modified_code = code; 225 modified_code = code;
228 ApplyLanguageCodeCorrection(&modified_code); 226 ApplyLanguageCodeCorrection(&modified_code);
229 TranslateCommonMetrics::ReportContentLanguage(code, modified_code); 227 translate::ReportContentLanguage(code, modified_code);
230 } 228 }
231 229
232 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt 230 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
233 // |modified_code|. 231 // |modified_code|.
234 std::string language = modified_html_lang.empty() ? modified_code : 232 std::string language = modified_html_lang.empty() ? modified_code :
235 modified_html_lang; 233 modified_html_lang;
236 234
237 // If |language| is empty, just use CLD result even though it might be 235 // If |language| is empty, just use CLD result even though it might be
238 // chrome::kUnknownLanguageCode. 236 // translate::kUnknownLanguageCode.
239 if (language.empty()) { 237 if (language.empty()) {
240 TranslateCommonMetrics::ReportLanguageVerification( 238 translate::ReportLanguageVerification(
241 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); 239 translate::LANGUAGE_VERIFICATION_CLD_ONLY);
242 return cld_language; 240 return cld_language;
243 } 241 }
244 242
245 if (cld_language == chrome::kUnknownLanguageCode) { 243 if (cld_language == kUnknownLanguageCode) {
246 TranslateCommonMetrics::ReportLanguageVerification( 244 translate::ReportLanguageVerification(
247 TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN); 245 translate::LANGUAGE_VERIFICATION_UNKNOWN);
248 return language; 246 return language;
249 } else if (CanCLDComplementSubCode(language, cld_language)) { 247 } else if (CanCLDComplementSubCode(language, cld_language)) {
250 TranslateCommonMetrics::ReportLanguageVerification( 248 translate::ReportLanguageVerification(
251 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); 249 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
252 return cld_language; 250 return cld_language;
253 } else if (IsSameOrSimilarLanguages(language, cld_language)) { 251 } else if (IsSameOrSimilarLanguages(language, cld_language)) {
254 TranslateCommonMetrics::ReportLanguageVerification( 252 translate::ReportLanguageVerification(
255 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE); 253 translate::LANGUAGE_VERIFICATION_CLD_AGREE);
256 return language; 254 return language;
257 } else if (MaybeServerWrongConfiguration(language, cld_language)) { 255 } else if (MaybeServerWrongConfiguration(language, cld_language)) {
258 TranslateCommonMetrics::ReportLanguageVerification( 256 translate::ReportLanguageVerification(
259 TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD); 257 translate::LANGUAGE_VERIFICATION_TRUST_CLD);
260 return cld_language; 258 return cld_language;
261 } else { 259 } else {
262 TranslateCommonMetrics::ReportLanguageVerification( 260 translate::ReportLanguageVerification(
263 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); 261 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE);
264 // Content-Language value might be wrong because CLD says that this page 262 // Content-Language value might be wrong because CLD says that this page
265 // is written in another language with confidence. 263 // is written in another language with confidence.
266 // In this case, Chrome doesn't rely on any of the language codes, and 264 // In this case, Chrome doesn't rely on any of the language codes, and
267 // gives up suggesting a translation. 265 // gives up suggesting a translation.
268 return std::string(chrome::kUnknownLanguageCode); 266 return std::string(kUnknownLanguageCode);
269 } 267 }
270 268
271 return language; 269 return language;
272 } 270 }
273 271
274 void CorrectLanguageCodeTypo(std::string* code) { 272 void CorrectLanguageCodeTypo(std::string* code) {
275 DCHECK(code); 273 DCHECK(code);
276 274
277 size_t coma_index = code->find(','); 275 size_t coma_index = code->find(',');
278 if (coma_index != std::string::npos) { 276 if (coma_index != std::string::npos) {
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after
345 base::SplitString(cld_language, '-', &chunks); 343 base::SplitString(cld_language, '-', &chunks);
346 if (chunks.size() == 0) 344 if (chunks.size() == 0)
347 return false; 345 return false;
348 std::string cld_language_main_part = chunks[0]; 346 std::string cld_language_main_part = chunks[0];
349 347
350 // Language code part of |page_language| is matched to one of |cld_language|. 348 // Language code part of |page_language| is matched to one of |cld_language|.
351 // Country code is ignored here. 349 // Country code is ignored here.
352 if (page_language_main_part == cld_language_main_part) { 350 if (page_language_main_part == cld_language_main_part) {
353 // Languages are matched strictly. Reports false to metrics, but returns 351 // Languages are matched strictly. Reports false to metrics, but returns
354 // true. 352 // true.
355 TranslateCommonMetrics::ReportSimilarLanguageMatch(false); 353 translate::ReportSimilarLanguageMatch(false);
356 return true; 354 return true;
357 } 355 }
358 356
359 // Check if |page_language| and |cld_language| are in the similar language 357 // Check if |page_language| and |cld_language| are in the similar language
360 // list and belong to the same language group. 358 // list and belong to the same language group.
361 int page_code = GetSimilarLanguageGroupCode(page_language); 359 int page_code = GetSimilarLanguageGroupCode(page_language);
362 bool match = page_code != 0 && 360 bool match = page_code != 0 &&
363 page_code == GetSimilarLanguageGroupCode(cld_language); 361 page_code == GetSimilarLanguageGroupCode(cld_language);
364 362
365 TranslateCommonMetrics::ReportSimilarLanguageMatch(match); 363 translate::ReportSimilarLanguageMatch(match);
366 return match; 364 return match;
367 } 365 }
368 366
369 bool MaybeServerWrongConfiguration(const std::string& page_language, 367 bool MaybeServerWrongConfiguration(const std::string& page_language,
370 const std::string& cld_language) { 368 const std::string& cld_language) {
371 // If |page_language| is not "en-*", respect it and just return false here. 369 // If |page_language| is not "en-*", respect it and just return false here.
372 if (!StartsWithASCII(page_language, "en", false)) 370 if (!StartsWithASCII(page_language, "en", false))
373 return false; 371 return false;
374 372
375 // A server provides a language meta information representing "en-*". But it 373 // A server provides a language meta information representing "en-*". But it
(...skipping 17 matching lines...) Expand all
393 #if !defined(CLD_VERSION) || CLD_VERSION==2 391 #if !defined(CLD_VERSION) || CLD_VERSION==2
394 case 2: 392 case 2:
395 return CLD2::DetectLanguageVersion(); 393 return CLD2::DetectLanguageVersion();
396 #endif 394 #endif
397 default: 395 default:
398 NOTREACHED(); 396 NOTREACHED();
399 } 397 }
400 return ""; 398 return "";
401 } 399 }
402 400
403 } // namespace LanguageDetectionUtil 401 } // namespace translate
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698