Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(121)

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1259883007: Remove the Finch test 'CLD1VsCLD2' (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/translate/core/language_detection/language_detection_util.h" 5 #include "components/translate/core/language_detection/language_detection_util.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_split.h" 8 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h" 9 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h" 10 #include "base/strings/utf_string_conversions.h"
12 #include "base/time/time.h" 11 #include "base/time/time.h"
13 #include "components/translate/core/common/translate_constants.h" 12 #include "components/translate/core/common/translate_constants.h"
14 #include "components/translate/core/common/translate_metrics.h" 13 #include "components/translate/core/common/translate_metrics.h"
15 #include "components/translate/core/common/translate_util.h" 14 #include "components/translate/core/common/translate_util.h"
16 15
17 #if !defined(CLD_VERSION) || CLD_VERSION==1 16 #if CLD_VERSION==1
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 17 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" 18 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
20 #endif 19 #endif
21 20
22 #if !defined(CLD_VERSION) || CLD_VERSION==2 21 #if CLD_VERSION==2
23 #include "third_party/cld_2/src/public/compact_lang_det.h" 22 #include "third_party/cld_2/src/public/compact_lang_det.h"
24 #endif 23 #endif
25 24
26 namespace { 25 namespace {
27 26
28 // Similar language code list. Some languages are very similar and difficult 27 // Similar language code list. Some languages are very similar and difficult
29 // for CLD to distinguish. 28 // for CLD to distinguish.
30 struct SimilarLanguageCode { 29 struct SimilarLanguageCode {
31 const char* const code; 30 const char* const code;
32 int group; 31 int group;
(...skipping 30 matching lines...) Expand all
63 translate::CorrectLanguageCodeTypo(code); 62 translate::CorrectLanguageCodeTypo(code);
64 63
65 if (!translate::IsValidLanguageCode(*code)) { 64 if (!translate::IsValidLanguageCode(*code)) {
66 *code = std::string(); 65 *code = std::string();
67 return; 66 return;
68 } 67 }
69 68
70 translate::ToTranslateLanguageSynonym(code); 69 translate::ToTranslateLanguageSynonym(code);
71 } 70 }
72 71
73 int GetCLDMajorVersion() {
74 #if !defined(CLD_VERSION)
75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");
76 if (group_name == "CLD2")
77 return 2;
78 else
79 return 1;
80 #else
81 return CLD_VERSION;
82 #endif
83 }
84
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 72 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
86 // failed. 73 // failed.
87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. 74 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
88 std::string DetermineTextLanguage(const base::string16& text, 75 std::string DetermineTextLanguage(const base::string16& text,
89 bool* is_cld_reliable) { 76 bool* is_cld_reliable) {
90 std::string language = translate::kUnknownLanguageCode; 77 std::string language = translate::kUnknownLanguageCode;
91 int num_bytes_evaluated = 0; 78 int num_bytes_evaluated = 0;
92 bool is_reliable = false; 79 bool is_reliable = false;
93 const bool is_plain_text = true; 80 const bool is_plain_text = true;
94 81
95 // Language or CLD2::Language 82 // Language or CLD2::Language
96 int cld_language = 0; 83 int cld_language = 0;
97 bool is_valid_language = false; 84 bool is_valid_language = false;
98 85
99 switch (GetCLDMajorVersion()) { 86 #if CLD_VERSION==1
100 #if !defined(CLD_VERSION) || CLD_VERSION==1 87 int num_languages = 0;
101 case 1: { 88 cld_language = DetectLanguageOfUnicodeText(
102 int num_languages = 0; 89 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL,
103 cld_language = DetectLanguageOfUnicodeText( 90 &num_bytes_evaluated);
104 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, 91 is_valid_language = cld_language != NUM_LANGUAGES &&
105 &num_bytes_evaluated); 92 cld_language != UNKNOWN_LANGUAGE &&
Takashi Toyoshima 2015/07/30 08:41:05 wrong indent
hajimehoshi 2015/07/30 08:51:29 Done.
106 is_valid_language = cld_language != NUM_LANGUAGES && 93 cld_language != TG_UNKNOWN_LANGUAGE;
107 cld_language != UNKNOWN_LANGUAGE &&
108 cld_language != TG_UNKNOWN_LANGUAGE;
109 break;
110 }
111 #endif 94 #endif
112 #if !defined(CLD_VERSION) || CLD_VERSION==2 95 #if CLD_VERSION==2
113 case 2: { 96 const std::string utf8_text(base::UTF16ToUTF8(text));
114 const std::string utf8_text(base::UTF16ToUTF8(text)); 97 const int num_utf8_bytes = static_cast<int>(utf8_text.size());
115 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); 98 const char* raw_utf8_bytes = utf8_text.c_str();
116 const char* raw_utf8_bytes = utf8_text.c_str(); 99 cld_language = CLD2::DetectLanguageCheckUTF8(
117 cld_language = CLD2::DetectLanguageCheckUTF8( 100 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable,
118 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, 101 &num_bytes_evaluated);
119 &num_bytes_evaluated);
120 102
121 if (num_bytes_evaluated < num_utf8_bytes && 103 if (num_bytes_evaluated < num_utf8_bytes &&
122 cld_language == CLD2::UNKNOWN_LANGUAGE) { 104 cld_language == CLD2::UNKNOWN_LANGUAGE) {
123 // Invalid UTF8 encountered, see bug http://crbug.com/444258. 105 // Invalid UTF8 encountered, see bug http://crbug.com/444258.
124 // Retry using only the valid characters. This time the check for valid 106 // Retry using only the valid characters. This time the check for valid
125 // UTF8 can be skipped since the precise number of valid bytes is known. 107 // UTF8 can be skipped since the precise number of valid bytes is known.
126 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, 108 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated,
127 is_plain_text, &is_reliable); 109 is_plain_text, &is_reliable);
128 } 110 }
129 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && 111 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
130 cld_language != CLD2::UNKNOWN_LANGUAGE && 112 cld_language != CLD2::UNKNOWN_LANGUAGE &&
Takashi Toyoshima 2015/07/30 08:41:05 wrong indent
hajimehoshi 2015/07/30 08:51:29 Done.
131 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 113 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
132 break;
133 }
134 #endif 114 #endif
Takashi Toyoshima 2015/07/30 08:41:05 up to you, but could be something like this? #if
hajimehoshi 2015/07/30 08:51:29 Done.
135 default:
136 NOTREACHED();
137 }
138 115
139 if (is_cld_reliable != NULL) 116 if (is_cld_reliable != NULL)
140 *is_cld_reliable = is_reliable; 117 *is_cld_reliable = is_reliable;
141 118
142 // We don't trust the result if the CLD reports that the detection is not 119 // We don't trust the result if the CLD reports that the detection is not
143 // reliable, or if the actual text used to detect the language was less than 120 // reliable, or if the actual text used to detect the language was less than
144 // 100 bytes (short texts can often lead to wrong results). 121 // 100 bytes (short texts can often lead to wrong results).
145 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that 122 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
146 // the determined language code is correct with 50% confidence. Chrome should 123 // the determined language code is correct with 50% confidence. Chrome should
147 // handle the real confidence value to judge. 124 // handle the real confidence value to judge.
148 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { 125 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) {
149 // We should not use LanguageCode_ISO_639_1 because it does not cover all 126 // We should not use LanguageCode_ISO_639_1 because it does not cover all
150 // the languages CLD can detect. As a result, it'll return the invalid 127 // the languages CLD can detect. As a result, it'll return the invalid
151 // language code for traditional Chinese among others. 128 // language code for traditional Chinese among others.
152 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and 129 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
153 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 130 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
154 // for Simplified Chinese. 131 // for Simplified Chinese.
155 switch (GetCLDMajorVersion()) { 132 #if CLD_VERSION==1
156 #if !defined(CLD_VERSION) || CLD_VERSION==1 133 language = LanguageCodeWithDialects(static_cast<Language>(cld_language));
157 case 1:
158 language =
159 LanguageCodeWithDialects(static_cast<Language>(cld_language));
160 break;
161 #endif 134 #endif
162 #if !defined(CLD_VERSION) || CLD_VERSION==2 135 #if CLD_VERSION==2
163 case 2: 136 // (1) CLD2's LanguageCode returns general Chinese 'zh' for
164 // (1) CLD2's LanguageCode returns general Chinese 'zh' for 137 // CLD2::CHINESE, but Translate server doesn't accept it. This is
165 // CLD2::CHINESE, but Translate server doesn't accept it. This is 138 // converted to 'zh-CN' in the same way as CLD1's
166 // converted to 'zh-CN' in the same way as CLD1's 139 // LanguageCodeWithDialects.
167 // LanguageCodeWithDialects. 140 //
168 // 141 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
169 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for 142 // CLD2::CHINESE_T. This is technically more precise for the language
170 // CLD2::CHINESE_T. This is technically more precise for the language 143 // code of traditional Chinese, while Translate server hasn't accepted
171 // code of traditional Chinese, while Translate server hasn't accepted 144 // zh-Hant yet.
172 // zh-Hant yet. 145 if (cld_language == CLD2::CHINESE)
173 if (cld_language == CLD2::CHINESE) { 146 language = "zh-CN";
174 language = "zh-CN"; 147 else if (cld_language == CLD2::CHINESE_T)
175 } else if (cld_language == CLD2::CHINESE_T) { 148 language = "zh-TW";
176 language = "zh-TW"; 149 else
177 } else { 150 language = CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
178 language =
179 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
180 }
181 break;
182 #endif 151 #endif
Takashi Toyoshima 2015/07/30 08:41:05 ditto
hajimehoshi 2015/07/30 08:51:29 Done.
183 default:
184 NOTREACHED();
185 }
186 } 152 }
187 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text 153 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
188 << "\n*************************************\n"; 154 << "\n*************************************\n";
189 return language; 155 return language;
190 } 156 }
191 157
192 // Checks if CLD can complement a sub code when the page language doesn't know 158 // Checks if CLD can complement a sub code when the page language doesn't know
193 // the sub code. 159 // the sub code.
194 bool CanCLDComplementSubCode( 160 bool CanCLDComplementSubCode(
195 const std::string& page_language, const std::string& cld_language) { 161 const std::string& page_language, const std::string& cld_language) {
(...skipping 194 matching lines...) Expand 10 before | Expand all | Expand 10 after
390 // distinguish from English, and the language is one of well-known languages 356 // distinguish from English, and the language is one of well-known languages
391 // which often provide "en-*" meta information mistakenly. 357 // which often provide "en-*" meta information mistakenly.
392 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 358 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
393 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 359 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
394 return true; 360 return true;
395 } 361 }
396 return false; 362 return false;
397 } 363 }
398 364
399 } // namespace translate 365 } // namespace translate
OLDNEW
« build/config/BUILD.gn ('K') | « components/translate/core/language_detection/BUILD.gn ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698