Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(275)

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1263613002: Implement CLD hints to CLD2 calls. Edit CLD2 result to return top language instead of summary langu… (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: DetectLanguageSummaryV2 function call change Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/translate/core/language_detection/language_detection_util.h" 5 #include "components/translate/core/language_detection/language_detection_util.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h" 8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_split.h" 9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h" 10 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h" 11 #include "base/strings/utf_string_conversions.h"
12 #include "base/time/time.h" 12 #include "base/time/time.h"
13 #include "components/translate/core/common/translate_constants.h" 13 #include "components/translate/core/common/translate_constants.h"
14 #include "components/translate/core/common/translate_metrics.h" 14 #include "components/translate/core/common/translate_metrics.h"
15 #include "components/translate/core/common/translate_util.h" 15 #include "components/translate/core/common/translate_util.h"
16 16
17 #if !defined(CLD_VERSION) || CLD_VERSION==1 17 #if !defined(CLD_VERSION) || CLD_VERSION==1
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
20 #endif 20 #endif
21 21
22 #if !defined(CLD_VERSION) || CLD_VERSION==2 22 #if !defined(CLD_VERSION) || CLD_VERSION==2
23 #include "third_party/cld_2/src/public/compact_lang_det.h" 23 #include "third_party/cld_2/src/public/compact_lang_det.h"
24 #include "third_party/cld_2/src/public/encodings.h"
24 #endif 25 #endif
25 26
26 namespace { 27 namespace {
27 28
28 // Similar language code list. Some languages are very similar and difficult 29 // Similar language code list. Some languages are very similar and difficult
29 // for CLD to distinguish. 30 // for CLD to distinguish.
30 struct SimilarLanguageCode { 31 struct SimilarLanguageCode {
31 const char* const code; 32 const char* const code;
32 int group; 33 int group;
33 }; 34 };
(...skipping 45 matching lines...) Expand 10 before | Expand all | Expand 10 after
79 return 1; 80 return 1;
80 #else 81 #else
81 return CLD_VERSION; 82 return CLD_VERSION;
82 #endif 83 #endif
83 } 84 }
84 85
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 86 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
86 // failed. 87 // failed.
87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. 88 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
88 std::string DetermineTextLanguage(const base::string16& text, 89 std::string DetermineTextLanguage(const base::string16& text,
89 bool* is_cld_reliable) { 90 bool* is_cld_reliable,
91 std::string& code,
92 std::string& html_lang) {
90 std::string language = translate::kUnknownLanguageCode; 93 std::string language = translate::kUnknownLanguageCode;
91 int num_bytes_evaluated = 0; 94 int num_bytes_evaluated = 0;
92 bool is_reliable = false; 95 bool is_reliable = false;
93 const bool is_plain_text = true; 96 const bool is_plain_text = true;
94 97
95 // Language or CLD2::Language 98 // Language or CLD2::Language
96 int cld_language = 0; 99 int cld_language = 0;
97 bool is_valid_language = false; 100 bool is_valid_language = false;
98 101
99 switch (GetCLDMajorVersion()) { 102 switch (GetCLDMajorVersion()) {
100 #if !defined(CLD_VERSION) || CLD_VERSION==1 103 #if !defined(CLD_VERSION) || CLD_VERSION==1
101 case 1: { 104 case 1: {
102 int num_languages = 0; 105 int num_languages = 0;
103 cld_language = DetectLanguageOfUnicodeText( 106 cld_language = DetectLanguageOfUnicodeText(
104 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, 107 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL,
105 &num_bytes_evaluated); 108 &num_bytes_evaluated);
106 is_valid_language = cld_language != NUM_LANGUAGES && 109 is_valid_language = cld_language != NUM_LANGUAGES &&
107 cld_language != UNKNOWN_LANGUAGE && 110 cld_language != UNKNOWN_LANGUAGE &&
108 cld_language != TG_UNKNOWN_LANGUAGE; 111 cld_language != TG_UNKNOWN_LANGUAGE;
109 break; 112 break;
110 } 113 }
111 #endif 114 #endif
112 #if !defined(CLD_VERSION) || CLD_VERSION==2 115 #if !defined(CLD_VERSION) || CLD_VERSION==2
113 case 2: { 116 case 2: {
114 const std::string utf8_text(base::UTF16ToUTF8(text)); 117 const std::string utf8_text(base::UTF16ToUTF8(text));
115 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); 118 const int num_utf8_bytes = static_cast<int>(utf8_text.size());
116 const char* raw_utf8_bytes = utf8_text.c_str(); 119 const char* raw_utf8_bytes = utf8_text.c_str();
117 cld_language = CLD2::DetectLanguageCheckUTF8( 120
118 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, 121 CLD2::Language language3[3];
119 &num_bytes_evaluated); 122 int percent3[3];
123 int flags = 0; // No flags, see compact_lang_det.h for details.
124 int text_bytes; // Amount of non-tag/letters-only text (assumed 0).
125 double normalized_score3[3];
126
127 const char* tld_hint = "";
128 int encoding_hint = CLD2::UNKNOWN_ENCODING;
129 CLD2::Language language_hint =
130 CLD2::GetLanguageFromName(html_lang.c_str());
131 CLD2::CLDHints cldhints = {code.c_str(), tld_hint, encoding_hint,
132 language_hint};
133
134 cld_language = CLD2::ExtDetectLanguageSummaryCheckUTF8(
135 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
136 language3, percent3, normalized_score3, nullptr, &text_bytes,
Andrew Hayden (chromium.org) 2015/08/05 11:22:45 Please comment the meaning of the nullptr here, we
137 &is_reliable, &num_bytes_evaluated);
120 138
121 if (num_bytes_evaluated < num_utf8_bytes && 139 if (num_bytes_evaluated < num_utf8_bytes &&
122 cld_language == CLD2::UNKNOWN_LANGUAGE) { 140 cld_language == CLD2::UNKNOWN_LANGUAGE) {
123 // Invalid UTF8 encountered, see bug http://crbug.com/444258. 141 // Invalid UTF8 encountered, see bug http://crbug.com/444258.
124 // Retry using only the valid characters. This time the check for valid 142 // Retry using only the valid characters. This time the check for valid
125 // UTF8 can be skipped since the precise number of valid bytes is known. 143 // UTF8 can be skipped since the precise number of valid bytes is known.
126 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, 144 cld_language = CLD2::ExtDetectLanguageSummary(
127 is_plain_text, &is_reliable); 145 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
146 language3, percent3, normalized_score3, nullptr, &text_bytes,
Andrew Hayden (chromium.org) 2015/08/05 11:22:45 And same here, please
147 &is_reliable);
128 } 148 }
129 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && 149 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
130 cld_language != CLD2::UNKNOWN_LANGUAGE && 150 cld_language != CLD2::UNKNOWN_LANGUAGE &&
131 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 151 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
152
153 // Choose top language.
154 cld_language = language3[0];
132 break; 155 break;
133 } 156 }
134 #endif 157 #endif
135 default: 158 default:
136 NOTREACHED(); 159 NOTREACHED();
137 } 160 }
138 161
139 if (is_cld_reliable != NULL) 162 if (is_cld_reliable != NULL)
140 *is_cld_reliable = is_reliable; 163 *is_cld_reliable = is_reliable;
141 164
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after
206 229
207 namespace translate { 230 namespace translate {
208 231
209 std::string DeterminePageLanguage(const std::string& code, 232 std::string DeterminePageLanguage(const std::string& code,
210 const std::string& html_lang, 233 const std::string& html_lang,
211 const base::string16& contents, 234 const base::string16& contents,
212 std::string* cld_language_p, 235 std::string* cld_language_p,
213 bool* is_cld_reliable_p) { 236 bool* is_cld_reliable_p) {
214 base::TimeTicks begin_time = base::TimeTicks::Now(); 237 base::TimeTicks begin_time = base::TimeTicks::Now();
215 bool is_cld_reliable; 238 bool is_cld_reliable;
216 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
217 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
218
219 if (cld_language_p != NULL)
220 *cld_language_p = cld_language;
221 if (is_cld_reliable_p != NULL)
222 *is_cld_reliable_p = is_cld_reliable;
223 translate::ToTranslateLanguageSynonym(&cld_language);
224
225 // Check if html lang attribute is valid. 239 // Check if html lang attribute is valid.
226 std::string modified_html_lang; 240 std::string modified_html_lang;
227 if (!html_lang.empty()) { 241 if (!html_lang.empty()) {
228 modified_html_lang = html_lang; 242 modified_html_lang = html_lang;
229 ApplyLanguageCodeCorrection(&modified_html_lang); 243 ApplyLanguageCodeCorrection(&modified_html_lang);
230 translate::ReportHtmlLang(html_lang, modified_html_lang); 244 translate::ReportHtmlLang(html_lang, modified_html_lang);
231 VLOG(9) << "html lang based language code: " << modified_html_lang; 245 VLOG(9) << "html lang based language code: " << modified_html_lang;
232 } 246 }
233 247
234 // Check if Content-Language is valid. 248 // Check if Content-Language is valid.
235 std::string modified_code; 249 std::string modified_code;
236 if (!code.empty()) { 250 if (!code.empty()) {
237 modified_code = code; 251 modified_code = code;
238 ApplyLanguageCodeCorrection(&modified_code); 252 ApplyLanguageCodeCorrection(&modified_code);
239 translate::ReportContentLanguage(code, modified_code); 253 translate::ReportContentLanguage(code, modified_code);
240 } 254 }
241 255
256 std::string cld_language = DetermineTextLanguage(
257 contents, &is_cld_reliable, modified_code, modified_html_lang);
258 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
259
260 if (cld_language_p != NULL)
261 *cld_language_p = cld_language;
262 if (is_cld_reliable_p != NULL)
263 *is_cld_reliable_p = is_cld_reliable;
264 translate::ToTranslateLanguageSynonym(&cld_language);
265
242 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt 266 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
243 // |modified_code|. 267 // |modified_code|.
244 std::string language = modified_html_lang.empty() ? modified_code : 268 std::string language = modified_html_lang.empty() ? modified_code :
245 modified_html_lang; 269 modified_html_lang;
246 270
247 // If |language| is empty, just use CLD result even though it might be 271 // If |language| is empty, just use CLD result even though it might be
248 // translate::kUnknownLanguageCode. 272 // translate::kUnknownLanguageCode.
249 if (language.empty()) { 273 if (language.empty()) {
250 translate::ReportLanguageVerification( 274 translate::ReportLanguageVerification(
251 translate::LANGUAGE_VERIFICATION_CLD_ONLY); 275 translate::LANGUAGE_VERIFICATION_CLD_ONLY);
(...skipping 138 matching lines...) Expand 10 before | Expand all | Expand 10 after
390 // distinguish from English, and the language is one of well-known languages 414 // distinguish from English, and the language is one of well-known languages
391 // which often provide "en-*" meta information mistakenly. 415 // which often provide "en-*" meta information mistakenly.
392 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 416 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
393 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 417 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
394 return true; 418 return true;
395 } 419 }
396 return false; 420 return false;
397 } 421 }
398 422
399 } // namespace translate 423 } // namespace translate
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698