OLD | NEW |
| (Empty) |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "components/translate/language_detection/language_detection_util.h" | |
6 | |
7 #include "base/logging.h" | |
8 #include "base/metrics/field_trial.h" | |
9 #include "base/strings/string_split.h" | |
10 #include "base/strings/string_util.h" | |
11 #include "base/strings/utf_string_conversions.h" | |
12 #include "base/time/time.h" | |
13 #include "components/translate/core/common/translate_constants.h" | |
14 #include "components/translate/core/common/translate_metrics.h" | |
15 #include "components/translate/core/common/translate_util.h" | |
16 | |
17 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | |
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | |
20 #endif | |
21 | |
22 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
23 #include "third_party/cld_2/src/public/compact_lang_det.h" | |
24 #endif | |
25 | |
26 namespace { | |
27 | |
// Languages that CLD has trouble telling apart. Entries that carry the same
// |group| value are treated as similar to each other when a declared page
// language is compared against the CLD result.
struct SimilarLanguageCode {
  const char* const code;  // Language code, e.g. "bs".
  int group;               // Non-zero identifier of the similarity group.
};

const SimilarLanguageCode kSimilarLanguageCodes[] = {
  // Group 1.
  {"bs", 1},
  {"hr", 1},
  // Group 2.
  {"hi", 2},
  {"ne", 2},
};
41 | |
42 // Checks |kSimilarLanguageCodes| and returns group code. | |
43 int GetSimilarLanguageGroupCode(const std::string& language) { | |
44 for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) { | |
45 if (language.find(kSimilarLanguageCodes[i].code) != 0) | |
46 continue; | |
47 return kSimilarLanguageCodes[i].group; | |
48 } | |
49 return 0; | |
50 } | |
51 | |
// Well-known languages which often have wrong server configuration of
// Content-Language: en.
// The array elements are themselves const so the table cannot be repointed
// at run time.
// TODO(toyoshim): Move these static tables and caller functions to
// translate/common, and implement them as std::set<>.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};
59 | |
60 // Applies a series of language code modification in proper order. | |
61 void ApplyLanguageCodeCorrection(std::string* code) { | |
62 // Correct well-known format errors. | |
63 translate::CorrectLanguageCodeTypo(code); | |
64 | |
65 if (!translate::IsValidLanguageCode(*code)) { | |
66 *code = std::string(); | |
67 return; | |
68 } | |
69 | |
70 translate::ToTranslateLanguageSynonym(code); | |
71 } | |
72 | |
73 int GetCLDMajorVersion() { | |
74 #if !defined(CLD_VERSION) | |
75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); | |
76 if (group_name == "CLD2") | |
77 return 2; | |
78 else | |
79 return 1; | |
80 #else | |
81 return CLD_VERSION; | |
82 #endif | |
83 } | |
84 | |
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | |
86 // failed. | |
87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | |
88 std::string DetermineTextLanguage(const base::string16& text, | |
89 bool* is_cld_reliable) { | |
90 std::string language = translate::kUnknownLanguageCode; | |
91 int text_bytes = 0; | |
92 bool is_reliable = false; | |
93 | |
94 // Language or CLD2::Language | |
95 int cld_language = 0; | |
96 bool is_valid_language = false; | |
97 | |
98 switch (GetCLDMajorVersion()) { | |
99 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
100 case 1: { | |
101 int num_languages = 0; | |
102 cld_language = | |
103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, | |
104 &num_languages, NULL, &text_bytes); | |
105 is_valid_language = cld_language != NUM_LANGUAGES && | |
106 cld_language != UNKNOWN_LANGUAGE && | |
107 cld_language != TG_UNKNOWN_LANGUAGE; | |
108 break; | |
109 } | |
110 #endif | |
111 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
112 case 2: { | |
113 std::string utf8_text(base::UTF16ToUTF8(text)); | |
114 CLD2::Language language3[3]; | |
115 int percent3[3]; | |
116 CLD2::DetectLanguageSummary( | |
117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3, | |
118 &text_bytes, &is_reliable); | |
119 cld_language = language3[0]; | |
120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | |
121 cld_language != CLD2::UNKNOWN_LANGUAGE && | |
122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | |
123 break; | |
124 } | |
125 #endif | |
126 default: | |
127 NOTREACHED(); | |
128 } | |
129 | |
130 if (is_cld_reliable != NULL) | |
131 *is_cld_reliable = is_reliable; | |
132 | |
133 // We don't trust the result if the CLD reports that the detection is not | |
134 // reliable, or if the actual text used to detect the language was less than | |
135 // 100 bytes (short texts can often lead to wrong results). | |
136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | |
137 // the determined language code is correct with 50% confidence. Chrome should | |
138 // handle the real confidence value to judge. | |
139 if (is_reliable && text_bytes >= 100 && is_valid_language) { | |
140 // We should not use LanguageCode_ISO_639_1 because it does not cover all | |
141 // the languages CLD can detect. As a result, it'll return the invalid | |
142 // language code for tradtional Chinese among others. | |
143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and | |
144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | |
145 // for Simplified Chinese. | |
146 switch (GetCLDMajorVersion()) { | |
147 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
148 case 1: | |
149 language = | |
150 LanguageCodeWithDialects(static_cast<Language>(cld_language)); | |
151 break; | |
152 #endif | |
153 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
154 case 2: | |
155 // (1) CLD2's LanguageCode returns general Chinese 'zh' for | |
156 // CLD2::CHINESE, but Translate server doesn't accept it. This is | |
157 // converted to 'zh-CN' in the same way as CLD1's | |
158 // LanguageCodeWithDialects. | |
159 // | |
160 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for | |
161 // CLD2::CHINESE_T. This is technically more precise for the language | |
162 // code of traditional Chinese, while Translate server hasn't accepted | |
163 // zh-Hant yet. | |
164 if (cld_language == CLD2::CHINESE) { | |
165 language = "zh-CN"; | |
166 } else if (cld_language == CLD2::CHINESE_T) { | |
167 language = "zh-TW"; | |
168 } else { | |
169 language = | |
170 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); | |
171 } | |
172 break; | |
173 #endif | |
174 default: | |
175 NOTREACHED(); | |
176 } | |
177 } | |
178 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | |
179 << "\n*************************************\n"; | |
180 return language; | |
181 } | |
182 | |
183 // Checks if CLD can complement a sub code when the page language doesn't know | |
184 // the sub code. | |
185 bool CanCLDComplementSubCode( | |
186 const std::string& page_language, const std::string& cld_language) { | |
187 // Translate server cannot treat general Chinese. If Content-Language and | |
188 // CLD agree that the language is Chinese and Content-Language doesn't know | |
189 // which dialect is used, CLD language has priority. | |
190 // TODO(hajimehoshi): How about the other dialects like zh-MO? | |
191 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); | |
192 } | |
193 | |
194 } // namespace | |
195 | |
196 namespace translate { | |
197 | |
198 std::string DeterminePageLanguage(const std::string& code, | |
199 const std::string& html_lang, | |
200 const base::string16& contents, | |
201 std::string* cld_language_p, | |
202 bool* is_cld_reliable_p) { | |
203 base::TimeTicks begin_time = base::TimeTicks::Now(); | |
204 bool is_cld_reliable; | |
205 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); | |
206 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); | |
207 | |
208 if (cld_language_p != NULL) | |
209 *cld_language_p = cld_language; | |
210 if (is_cld_reliable_p != NULL) | |
211 *is_cld_reliable_p = is_cld_reliable; | |
212 translate::ToTranslateLanguageSynonym(&cld_language); | |
213 | |
214 // Check if html lang attribute is valid. | |
215 std::string modified_html_lang; | |
216 if (!html_lang.empty()) { | |
217 modified_html_lang = html_lang; | |
218 ApplyLanguageCodeCorrection(&modified_html_lang); | |
219 translate::ReportHtmlLang(html_lang, modified_html_lang); | |
220 VLOG(9) << "html lang based language code: " << modified_html_lang; | |
221 } | |
222 | |
223 // Check if Content-Language is valid. | |
224 std::string modified_code; | |
225 if (!code.empty()) { | |
226 modified_code = code; | |
227 ApplyLanguageCodeCorrection(&modified_code); | |
228 translate::ReportContentLanguage(code, modified_code); | |
229 } | |
230 | |
231 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt | |
232 // |modified_code|. | |
233 std::string language = modified_html_lang.empty() ? modified_code : | |
234 modified_html_lang; | |
235 | |
236 // If |language| is empty, just use CLD result even though it might be | |
237 // translate::kUnknownLanguageCode. | |
238 if (language.empty()) { | |
239 translate::ReportLanguageVerification( | |
240 translate::LANGUAGE_VERIFICATION_CLD_ONLY); | |
241 return cld_language; | |
242 } | |
243 | |
244 if (cld_language == kUnknownLanguageCode) { | |
245 translate::ReportLanguageVerification( | |
246 translate::LANGUAGE_VERIFICATION_UNKNOWN); | |
247 return language; | |
248 } | |
249 | |
250 if (CanCLDComplementSubCode(language, cld_language)) { | |
251 translate::ReportLanguageVerification( | |
252 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); | |
253 return cld_language; | |
254 } | |
255 | |
256 if (IsSameOrSimilarLanguages(language, cld_language)) { | |
257 translate::ReportLanguageVerification( | |
258 translate::LANGUAGE_VERIFICATION_CLD_AGREE); | |
259 return language; | |
260 } | |
261 | |
262 if (MaybeServerWrongConfiguration(language, cld_language)) { | |
263 translate::ReportLanguageVerification( | |
264 translate::LANGUAGE_VERIFICATION_TRUST_CLD); | |
265 return cld_language; | |
266 } | |
267 | |
268 // Content-Language value might be wrong because CLD says that this page is | |
269 // written in another language with confidence. In this case, Chrome doesn't | |
270 // rely on any of the language codes, and gives up suggesting a translation. | |
271 translate::ReportLanguageVerification( | |
272 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE); | |
273 return kUnknownLanguageCode; | |
274 } | |
275 | |
276 void CorrectLanguageCodeTypo(std::string* code) { | |
277 DCHECK(code); | |
278 | |
279 size_t coma_index = code->find(','); | |
280 if (coma_index != std::string::npos) { | |
281 // There are more than 1 language specified, just keep the first one. | |
282 *code = code->substr(0, coma_index); | |
283 } | |
284 base::TrimWhitespaceASCII(*code, base::TRIM_ALL, code); | |
285 | |
286 // An underscore instead of a dash is a frequent mistake. | |
287 size_t underscore_index = code->find('_'); | |
288 if (underscore_index != std::string::npos) | |
289 (*code)[underscore_index] = '-'; | |
290 | |
291 // Change everything up to a dash to lower-case and everything after to upper. | |
292 size_t dash_index = code->find('-'); | |
293 if (dash_index != std::string::npos) { | |
294 *code = StringToLowerASCII(code->substr(0, dash_index)) + | |
295 StringToUpperASCII(code->substr(dash_index)); | |
296 } else { | |
297 *code = StringToLowerASCII(*code); | |
298 } | |
299 } | |
300 | |
301 bool IsValidLanguageCode(const std::string& code) { | |
302 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/. | |
303 // TODO(hajimehoshi): How about es-419, which is used as an Accept language? | |
304 std::vector<std::string> chunks; | |
305 base::SplitString(code, '-', &chunks); | |
306 | |
307 if (chunks.size() < 1 || 2 < chunks.size()) | |
308 return false; | |
309 | |
310 const std::string& main_code = chunks[0]; | |
311 | |
312 if (main_code.size() < 1 || 3 < main_code.size()) | |
313 return false; | |
314 | |
315 for (std::string::const_iterator it = main_code.begin(); | |
316 it != main_code.end(); ++it) { | |
317 if (!IsAsciiAlpha(*it)) | |
318 return false; | |
319 } | |
320 | |
321 if (chunks.size() == 1) | |
322 return true; | |
323 | |
324 const std::string& sub_code = chunks[1]; | |
325 | |
326 if (sub_code.size() != 2) | |
327 return false; | |
328 | |
329 for (std::string::const_iterator it = sub_code.begin(); | |
330 it != sub_code.end(); ++it) { | |
331 if (!IsAsciiAlpha(*it)) | |
332 return false; | |
333 } | |
334 | |
335 return true; | |
336 } | |
337 | |
338 bool IsSameOrSimilarLanguages(const std::string& page_language, | |
339 const std::string& cld_language) { | |
340 std::vector<std::string> chunks; | |
341 | |
342 base::SplitString(page_language, '-', &chunks); | |
343 if (chunks.size() == 0) | |
344 return false; | |
345 std::string page_language_main_part = chunks[0]; | |
346 | |
347 base::SplitString(cld_language, '-', &chunks); | |
348 if (chunks.size() == 0) | |
349 return false; | |
350 std::string cld_language_main_part = chunks[0]; | |
351 | |
352 // Language code part of |page_language| is matched to one of |cld_language|. | |
353 // Country code is ignored here. | |
354 if (page_language_main_part == cld_language_main_part) { | |
355 // Languages are matched strictly. Reports false to metrics, but returns | |
356 // true. | |
357 translate::ReportSimilarLanguageMatch(false); | |
358 return true; | |
359 } | |
360 | |
361 // Check if |page_language| and |cld_language| are in the similar language | |
362 // list and belong to the same language group. | |
363 int page_code = GetSimilarLanguageGroupCode(page_language); | |
364 bool match = page_code != 0 && | |
365 page_code == GetSimilarLanguageGroupCode(cld_language); | |
366 | |
367 translate::ReportSimilarLanguageMatch(match); | |
368 return match; | |
369 } | |
370 | |
371 bool MaybeServerWrongConfiguration(const std::string& page_language, | |
372 const std::string& cld_language) { | |
373 // If |page_language| is not "en-*", respect it and just return false here. | |
374 if (!StartsWithASCII(page_language, "en", false)) | |
375 return false; | |
376 | |
377 // A server provides a language meta information representing "en-*". But it | |
378 // might be just a default value due to missing user configuration. | |
379 // Let's trust |cld_language| if the determined language is not difficult to | |
380 // distinguish from English, and the language is one of well-known languages | |
381 // which often provide "en-*" meta information mistakenly. | |
382 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | |
383 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | |
384 return true; | |
385 } | |
386 return false; | |
387 } | |
388 | |
389 } // namespace translate | |
OLD | NEW |