OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "chrome/common/translate/language_detection_util.h" | 5 #include "chrome/common/translate/language_detection_util.h" |
6 | 6 |
7 #include "base/logging.h" | 7 #include "base/logging.h" |
8 #include "base/metrics/field_trial.h" | |
9 #include "base/strings/string_split.h" | 8 #include "base/strings/string_split.h" |
10 #include "base/strings/string_util.h" | 9 #include "base/strings/string_util.h" |
11 #include "base/strings/utf_string_conversions.h" | |
12 #include "base/time/time.h" | 10 #include "base/time/time.h" |
13 #include "chrome/common/chrome_constants.h" | 11 #include "chrome/common/chrome_constants.h" |
14 #include "chrome/common/translate/translate_common_metrics.h" | 12 #include "chrome/common/translate/translate_common_metrics.h" |
15 #include "chrome/common/translate/translate_util.h" | 13 #include "chrome/common/translate/translate_util.h" |
16 | |
17 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
20 #endif | |
21 | |
22 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
23 #include "third_party/cld_2/src/public/compact_lang_det.h" | |
24 #endif | |
25 | 16 |
26 namespace { | 17 namespace { |
27 | 18 |
28 // Similar language code list. Some languages are very similar and difficult | 19 // Similar language code list. Some languages are very similar and difficult |
29 // for CLD to distinguish. | 20 // for CLD to distinguish. |
30 struct SimilarLanguageCode { | 21 struct SimilarLanguageCode { |
31 const char* const code; | 22 const char* const code; |
32 int group; | 23 int group; |
33 }; | 24 }; |
34 | 25 |
(...skipping 28 matching lines...) Expand all Loading... |
63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); | 54 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); |
64 | 55 |
65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { | 56 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { |
66 *code = std::string(); | 57 *code = std::string(); |
67 return; | 58 return; |
68 } | 59 } |
69 | 60 |
70 TranslateUtil::ToTranslateLanguageSynonym(code); | 61 TranslateUtil::ToTranslateLanguageSynonym(code); |
71 } | 62 } |
72 | 63 |
73 int GetCLDMajorVersion() { | |
74 #if !defined(CLD_VERSION) | |
75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); | |
76 if (group_name == "CLD2") | |
77 return 2; | |
78 else | |
79 return 1; | |
80 #else | |
81 return CLD_VERSION; | |
82 #endif | |
83 } | |
84 | |
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
86 // failed. | 65 // failed. |
87 // |is_cld_reliable| will be set to true if CLD says the detection is reliable. | 66 // |is_cld_reliable| will be set to true if CLD says the detection is reliable. |
88 std::string DetermineTextLanguage(const base::string16& text, | 67 std::string DetermineTextLanguage(const base::string16& text, |
89 bool* is_cld_reliable) { | 68 bool* is_cld_reliable) { |
90 std::string language = chrome::kUnknownLanguageCode; | 69 std::string language = chrome::kUnknownLanguageCode; |
| 70 int num_languages = 0; |
91 int text_bytes = 0; | 71 int text_bytes = 0; |
92 bool is_reliable = false; | 72 bool is_reliable = false; |
93 | 73 Language cld_language = |
94 // Language or CLD2::Language | 74 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, |
95 int cld_language = 0; | 75 &num_languages, NULL, &text_bytes); |
96 bool is_valid_language = false; | |
97 | |
98 switch (GetCLDMajorVersion()) { | |
99 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
100 case 1: { | |
101 int num_languages = 0; | |
102 cld_language = | |
103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, | |
104 &num_languages, NULL, &text_bytes); | |
105 is_valid_language = cld_language != NUM_LANGUAGES && | |
106 cld_language != UNKNOWN_LANGUAGE && | |
107 cld_language != TG_UNKNOWN_LANGUAGE; | |
108 break; | |
109 } | |
110 #endif | |
111 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
112 case 2: { | |
113 std::string utf8_text(UTF16ToUTF8(text)); | |
114 CLD2::Language language3[3]; | |
115 int percent3[3]; | |
116 cld_language = | |
117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true, | |
118 language3, percent3, | |
119 &text_bytes, &is_reliable); | |
120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | |
121 cld_language != CLD2::UNKNOWN_LANGUAGE && | |
122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | |
123 break; | |
124 } | |
125 #endif | |
126 default: | |
127 NOTREACHED(); | |
128 } | |
129 | |
130 if (is_cld_reliable != NULL) | 76 if (is_cld_reliable != NULL) |
131 *is_cld_reliable = is_reliable; | 77 *is_cld_reliable = is_reliable; |
132 | 78 |
133 // We don't trust the result if the CLD reports that the detection is not | 79 // We don't trust the result if the CLD reports that the detection is not |
134 // reliable, or if the actual text used to detect the language was less than | 80 // reliable, or if the actual text used to detect the language was less than |
135 // 100 bytes (short texts can often lead to wrong results). | 81 // 100 bytes (short texts can often lead to wrong results). |
136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | 82 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that |
137 // the determined language code is correct with 50% confidence. Chrome should | 83 // the determined language code is correct with 50% confidence. Chrome should |
138 // handle the real confidence value to judge. | 84 // handle the real confidence value to judge. |
139 if (is_reliable && text_bytes >= 100 && is_valid_language) { | 85 if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES && |
| 86 cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { |
140 // We should not use LanguageCode_ISO_639_1 because it does not cover all | 87 // We should not use LanguageCode_ISO_639_1 because it does not cover all |
141 // the languages CLD can detect. As a result, it'll return the invalid | 88 // the languages CLD can detect. As a result, it'll return the invalid |
142 // language code for traditional Chinese among others. | 89 // language code for traditional Chinese among others. |
143 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and | 90 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and |
144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 91 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
145 // for Simplified Chinese. | 92 // for Simplified Chinese. |
146 switch (GetCLDMajorVersion()) { | 93 language = LanguageCodeWithDialects(cld_language); |
147 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
148 case 1: | |
149 language = | |
150 LanguageCodeWithDialects(static_cast<Language>(cld_language)); | |
151 break; | |
152 #endif | |
153 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
154 case 2: | |
155 if (cld_language == CLD2::CHINESE) { | |
156 language = "zh-CN"; | |
157 } else { | |
158 language = | |
159 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); | |
160 } | |
161 break; | |
162 #endif | |
163 default: | |
164 NOTREACHED(); | |
165 } | |
166 } | 94 } |
167 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | 95 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text |
168 << "\n*************************************\n"; | 96 << "\n*************************************\n"; |
169 return language; | 97 return language; |
170 } | 98 } |
171 | 99 |
172 // Checks if CLD can complement a sub code when the page language doesn't know | 100 // Checks if CLD can complement a sub code when the page language doesn't know |
173 // the sub code. | 101 // the sub code. |
174 bool CanCLDComplementSubCode( | 102 bool CanCLDComplementSubCode( |
175 const std::string& page_language, const std::string& cld_language) { | 103 const std::string& page_language, const std::string& cld_language) { |
(...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
356 // distinguish from English, and the language is one of well-known languages | 284 // distinguish from English, and the language is one of well-known languages |
357 // which often provide "en-*" meta information mistakenly. | 285 // which often provide "en-*" meta information mistakenly. |
358 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 286 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
359 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 287 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
360 return true; | 288 return true; |
361 } | 289 } |
362 return false; | 290 return false; |
363 } | 291 } |
364 | 292 |
365 std::string GetCLDVersion() { | 293 std::string GetCLDVersion() { |
366 switch (GetCLDMajorVersion()) { | 294 return CompactLangDet::DetectLanguageVersion(); |
367 #if !defined(CLD_VERSION) || CLD_VERSION==1 | |
368 case 1: | |
369 return CompactLangDet::DetectLanguageVersion(); | |
370 #endif | |
371 #if !defined(CLD_VERSION) || CLD_VERSION==2 | |
372 case 2: | |
373 return CLD2::DetectLanguageVersion(); | |
374 #endif | |
375 default: | |
376 NOTREACHED(); | |
377 } | |
378 return ""; | |
379 } | 295 } |
380 | 296 |
381 } // namespace LanguageDetectionUtil | 297 } // namespace LanguageDetectionUtil |
OLD | NEW |