Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(66)

Side by Side Diff: chrome/common/translate/language_detection_util.cc

Issue 25531002: Move language detection to a component (Closed) Base URL: http://git.chromium.org/chromium/src.git@master
Patch Set: Run translate unittests on iOS Created 7 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "chrome/common/translate/language_detection_util.h"
6
7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h"
12 #include "base/time/time.h"
13 #include "chrome/common/chrome_constants.h"
14 #include "chrome/common/translate/translate_common_metrics.h"
15 #include "chrome/common/translate/translate_util.h"
16
17 #if !defined(CLD_VERSION) || CLD_VERSION==1
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
20 #endif
21
22 #if !defined(CLD_VERSION) || CLD_VERSION==2
23 #include "third_party/cld_2/src/public/compact_lang_det.h"
24 #endif
25
26 namespace {
27
// One entry of the similar-language table. Some languages are so close to
// each other that CLD has difficulty telling them apart; such languages share
// the same group id.
struct SimilarLanguageCode {
  // Language code of the entry, e.g. "hr".
  const char* const code;
  // Id of the group this language belongs to. Entries with equal ids are
  // treated as similar to each other.
  int group;
};

// Languages that are hard for CLD to distinguish, keyed by group id.
const SimilarLanguageCode kSimilarLanguageCodes[] = {
  // Group 1: Bosnian and Croatian.
  { "bs", 1 },
  { "hr", 1 },
  // Group 2: Hindi and Nepali.
  { "hi", 2 },
  { "ne", 2 },
};
41
42 // Checks |kSimilarLanguageCodes| and returns group code.
43 int GetSimilarLanguageGroupCode(const std::string& language) {
44 for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
45 if (language.find(kSimilarLanguageCodes[i].code) != 0)
46 continue;
47 return kSimilarLanguageCodes[i].group;
48 }
49 return 0;
50 }
51
// Well-known languages whose pages are often served with a wrong server
// configuration of "Content-Language: en".
// The table is read-only, so both the strings and the pointers are const.
// TODO(toyoshim): Move these static tables and their caller functions to
// chrome/common/translate, and implement them as std::set<>.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};
59
60 // Applies a series of language code modification in proper order.
61 void ApplyLanguageCodeCorrection(std::string* code) {
62 // Correct well-known format errors.
63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code);
64
65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) {
66 *code = std::string();
67 return;
68 }
69
70 TranslateUtil::ToTranslateLanguageSynonym(code);
71 }
72
73 int GetCLDMajorVersion() {
74 #if !defined(CLD_VERSION)
75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");
76 if (group_name == "CLD2")
77 return 2;
78 else
79 return 1;
80 #else
81 return CLD_VERSION;
82 #endif
83 }
84
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
86 // failed.
87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
88 std::string DetermineTextLanguage(const base::string16& text,
89 bool* is_cld_reliable) {
90 std::string language = chrome::kUnknownLanguageCode;
91 int text_bytes = 0;
92 bool is_reliable = false;
93
94 // Language or CLD2::Language
95 int cld_language = 0;
96 bool is_valid_language = false;
97
98 switch (GetCLDMajorVersion()) {
99 #if !defined(CLD_VERSION) || CLD_VERSION==1
100 case 1: {
101 int num_languages = 0;
102 cld_language =
103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
104 &num_languages, NULL, &text_bytes);
105 is_valid_language = cld_language != NUM_LANGUAGES &&
106 cld_language != UNKNOWN_LANGUAGE &&
107 cld_language != TG_UNKNOWN_LANGUAGE;
108 break;
109 }
110 #endif
111 #if !defined(CLD_VERSION) || CLD_VERSION==2
112 case 2: {
113 std::string utf8_text(UTF16ToUTF8(text));
114 CLD2::Language language3[3];
115 int percent3[3];
116 cld_language =
117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true,
118 language3, percent3,
119 &text_bytes, &is_reliable);
120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
121 cld_language != CLD2::UNKNOWN_LANGUAGE &&
122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
123 break;
124 }
125 #endif
126 default:
127 NOTREACHED();
128 }
129
130 if (is_cld_reliable != NULL)
131 *is_cld_reliable = is_reliable;
132
133 // We don't trust the result if the CLD reports that the detection is not
134 // reliable, or if the actual text used to detect the language was less than
135 // 100 bytes (short texts can often lead to wrong results).
136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
137 // the determined language code is correct with 50% confidence. Chrome should
138 // handle the real confidence value to judge.
139 if (is_reliable && text_bytes >= 100 && is_valid_language) {
140 // We should not use LanguageCode_ISO_639_1 because it does not cover all
141 // the languages CLD can detect. As a result, it'll return the invalid
142 // language code for tradtional Chinese among others.
143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
145 // for Simplified Chinese.
146 switch (GetCLDMajorVersion()) {
147 #if !defined(CLD_VERSION) || CLD_VERSION==1
148 case 1:
149 language =
150 LanguageCodeWithDialects(static_cast<Language>(cld_language));
151 break;
152 #endif
153 #if !defined(CLD_VERSION) || CLD_VERSION==2
154 case 2:
155 // (1) CLD2's LanguageCode returns general Chinese 'zh' for
156 // CLD2::CHINESE, but Translate server doesn't accept it. This is
157 // converted to 'zh-CN' in the same way as CLD1's
158 // LanguageCodeWithDialects.
159 //
160 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
161 // CLD2::CHINESE_T. This is technically more precise for the language
162 // code of traditional Chinese, while Translate server hasn't accepted
163 // zh-Hant yet.
164 if (cld_language == CLD2::CHINESE) {
165 language = "zh-CN";
166 } else if (cld_language == CLD2::CHINESE_T) {
167 language = "zh-TW";
168 } else {
169 language =
170 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
171 }
172 break;
173 #endif
174 default:
175 NOTREACHED();
176 }
177 }
178 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
179 << "\n*************************************\n";
180 return language;
181 }
182
183 // Checks if CLD can complement a sub code when the page language doesn't know
184 // the sub code.
185 bool CanCLDComplementSubCode(
186 const std::string& page_language, const std::string& cld_language) {
187 // Translate server cannot treat general Chinese. If Content-Language and
188 // CLD agree that the language is Chinese and Content-Language doesn't know
189 // which dialect is used, CLD language has priority.
190 // TODO(hajimehoshi): How about the other dialects like zh-MO?
191 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);
192 }
193
194 } // namespace
195
196 namespace LanguageDetectionUtil {
197
198 std::string DeterminePageLanguage(const std::string& code,
199 const std::string& html_lang,
200 const base::string16& contents,
201 std::string* cld_language_p,
202 bool* is_cld_reliable_p) {
203 base::TimeTicks begin_time = base::TimeTicks::Now();
204 bool is_cld_reliable;
205 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
206 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time,
207 base::TimeTicks::Now());
208
209 if (cld_language_p != NULL)
210 *cld_language_p = cld_language;
211 if (is_cld_reliable_p != NULL)
212 *is_cld_reliable_p = is_cld_reliable;
213 TranslateUtil::ToTranslateLanguageSynonym(&cld_language);
214
215 // Check if html lang attribute is valid.
216 std::string modified_html_lang;
217 if (!html_lang.empty()) {
218 modified_html_lang = html_lang;
219 ApplyLanguageCodeCorrection(&modified_html_lang);
220 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang);
221 VLOG(9) << "html lang based language code: " << modified_html_lang;
222 }
223
224 // Check if Content-Language is valid.
225 std::string modified_code;
226 if (!code.empty()) {
227 modified_code = code;
228 ApplyLanguageCodeCorrection(&modified_code);
229 TranslateCommonMetrics::ReportContentLanguage(code, modified_code);
230 }
231
232 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
233 // |modified_code|.
234 std::string language = modified_html_lang.empty() ? modified_code :
235 modified_html_lang;
236
237 // If |language| is empty, just use CLD result even though it might be
238 // chrome::kUnknownLanguageCode.
239 if (language.empty()) {
240 TranslateCommonMetrics::ReportLanguageVerification(
241 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY);
242 return cld_language;
243 }
244
245 if (cld_language == chrome::kUnknownLanguageCode) {
246 TranslateCommonMetrics::ReportLanguageVerification(
247 TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN);
248 return language;
249 } else if (CanCLDComplementSubCode(language, cld_language)) {
250 TranslateCommonMetrics::ReportLanguageVerification(
251 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
252 return cld_language;
253 } else if (IsSameOrSimilarLanguages(language, cld_language)) {
254 TranslateCommonMetrics::ReportLanguageVerification(
255 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE);
256 return language;
257 } else if (MaybeServerWrongConfiguration(language, cld_language)) {
258 TranslateCommonMetrics::ReportLanguageVerification(
259 TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD);
260 return cld_language;
261 } else {
262 TranslateCommonMetrics::ReportLanguageVerification(
263 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE);
264 // Content-Language value might be wrong because CLD says that this page
265 // is written in another language with confidence.
266 // In this case, Chrome doesn't rely on any of the language codes, and
267 // gives up suggesting a translation.
268 return std::string(chrome::kUnknownLanguageCode);
269 }
270
271 return language;
272 }
273
274 void CorrectLanguageCodeTypo(std::string* code) {
275 DCHECK(code);
276
277 size_t coma_index = code->find(',');
278 if (coma_index != std::string::npos) {
279 // There are more than 1 language specified, just keep the first one.
280 *code = code->substr(0, coma_index);
281 }
282 TrimWhitespaceASCII(*code, TRIM_ALL, code);
283
284 // An underscore instead of a dash is a frequent mistake.
285 size_t underscore_index = code->find('_');
286 if (underscore_index != std::string::npos)
287 (*code)[underscore_index] = '-';
288
289 // Change everything up to a dash to lower-case and everything after to upper.
290 size_t dash_index = code->find('-');
291 if (dash_index != std::string::npos) {
292 *code = StringToLowerASCII(code->substr(0, dash_index)) +
293 StringToUpperASCII(code->substr(dash_index));
294 } else {
295 *code = StringToLowerASCII(*code);
296 }
297 }
298
299 bool IsValidLanguageCode(const std::string& code) {
300 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.
301 // TODO(hajimehoshi): How about es-419, which is used as an Accept language?
302 std::vector<std::string> chunks;
303 base::SplitString(code, '-', &chunks);
304
305 if (chunks.size() < 1 || 2 < chunks.size())
306 return false;
307
308 const std::string& main_code = chunks[0];
309
310 if (main_code.size() < 1 || 3 < main_code.size())
311 return false;
312
313 for (std::string::const_iterator it = main_code.begin();
314 it != main_code.end(); ++it) {
315 if (!IsAsciiAlpha(*it))
316 return false;
317 }
318
319 if (chunks.size() == 1)
320 return true;
321
322 const std::string& sub_code = chunks[1];
323
324 if (sub_code.size() != 2)
325 return false;
326
327 for (std::string::const_iterator it = sub_code.begin();
328 it != sub_code.end(); ++it) {
329 if (!IsAsciiAlpha(*it))
330 return false;
331 }
332
333 return true;
334 }
335
336 bool IsSameOrSimilarLanguages(const std::string& page_language,
337 const std::string& cld_language) {
338 std::vector<std::string> chunks;
339
340 base::SplitString(page_language, '-', &chunks);
341 if (chunks.size() == 0)
342 return false;
343 std::string page_language_main_part = chunks[0];
344
345 base::SplitString(cld_language, '-', &chunks);
346 if (chunks.size() == 0)
347 return false;
348 std::string cld_language_main_part = chunks[0];
349
350 // Language code part of |page_language| is matched to one of |cld_language|.
351 // Country code is ignored here.
352 if (page_language_main_part == cld_language_main_part) {
353 // Languages are matched strictly. Reports false to metrics, but returns
354 // true.
355 TranslateCommonMetrics::ReportSimilarLanguageMatch(false);
356 return true;
357 }
358
359 // Check if |page_language| and |cld_language| are in the similar language
360 // list and belong to the same language group.
361 int page_code = GetSimilarLanguageGroupCode(page_language);
362 bool match = page_code != 0 &&
363 page_code == GetSimilarLanguageGroupCode(cld_language);
364
365 TranslateCommonMetrics::ReportSimilarLanguageMatch(match);
366 return match;
367 }
368
369 bool MaybeServerWrongConfiguration(const std::string& page_language,
370 const std::string& cld_language) {
371 // If |page_language| is not "en-*", respect it and just return false here.
372 if (!StartsWithASCII(page_language, "en", false))
373 return false;
374
375 // A server provides a language meta information representing "en-*". But it
376 // might be just a default value due to missing user configuration.
377 // Let's trust |cld_language| if the determined language is not difficult to
378 // distinguish from English, and the language is one of well-known languages
379 // which often provide "en-*" meta information mistakenly.
380 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
381 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
382 return true;
383 }
384 return false;
385 }
386
387 std::string GetCLDVersion() {
388 switch (GetCLDMajorVersion()) {
389 #if !defined(CLD_VERSION) || CLD_VERSION==1
390 case 1:
391 return CompactLangDet::DetectLanguageVersion();
392 #endif
393 #if !defined(CLD_VERSION) || CLD_VERSION==2
394 case 2:
395 return CLD2::DetectLanguageVersion();
396 #endif
397 default:
398 NOTREACHED();
399 }
400 return "";
401 }
402
403 } // namespace LanguageDetectionUtil
OLDNEW
« no previous file with comments | « chrome/common/translate/language_detection_util.h ('k') | chrome/common/translate/language_detection_util_unittest.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698