Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(161)

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1263613002: Implement CLD hints to CLD2 calls. Edit CLD2 result to return top language instead of summary langu… (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Added inline comments on CLD params Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/translate/core/language_detection/language_detection_util.h" 5 #include "components/translate/core/language_detection/language_detection_util.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 #include "base/strings/string_split.h" 8 #include "base/strings/string_split.h"
9 #include "base/strings/string_util.h" 9 #include "base/strings/string_util.h"
10 #include "base/strings/utf_string_conversions.h" 10 #include "base/strings/utf_string_conversions.h"
11 #include "base/time/time.h" 11 #include "base/time/time.h"
12 #include "components/translate/core/common/translate_constants.h" 12 #include "components/translate/core/common/translate_constants.h"
13 #include "components/translate/core/common/translate_metrics.h" 13 #include "components/translate/core/common/translate_metrics.h"
14 #include "components/translate/core/common/translate_util.h" 14 #include "components/translate/core/common/translate_util.h"
15 15
16 #if CLD_VERSION==1 16 #if CLD_VERSION==1
17 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 17 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
18 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" 18 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
19 #endif 19 #endif
20 20
21 #if CLD_VERSION==2 21 #if CLD_VERSION==2
22 #include "third_party/cld_2/src/public/compact_lang_det.h" 22 #include "third_party/cld_2/src/public/compact_lang_det.h"
23 #include "third_party/cld_2/src/public/encodings.h"
23 #endif 24 #endif
24 25
25 namespace { 26 namespace {
26 27
27 // Similar language code list. Some languages are very similar and difficult 28 // Similar language code list. Some languages are very similar and difficult
28 // for CLD to distinguish. 29 // for CLD to distinguish.
29 struct SimilarLanguageCode { 30 struct SimilarLanguageCode {
30 const char* const code; 31 const char* const code;
31 int group; 32 int group;
32 }; 33 };
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after
66 return; 67 return;
67 } 68 }
68 69
69 translate::ToTranslateLanguageSynonym(code); 70 translate::ToTranslateLanguageSynonym(code);
70 } 71 }
71 72
72 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 73 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
73 // failed. 74 // failed.
74 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. 75 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
75 std::string DetermineTextLanguage(const base::string16& text, 76 std::string DetermineTextLanguage(const base::string16& text,
76 bool* is_cld_reliable) { 77 bool* is_cld_reliable,
78 std::string& code,
79 std::string& html_lang) {
77 std::string language = translate::kUnknownLanguageCode; 80 std::string language = translate::kUnknownLanguageCode;
78 int num_bytes_evaluated = 0; 81 int num_bytes_evaluated = 0;
79 bool is_reliable = false; 82 bool is_reliable = false;
80 const bool is_plain_text = true; 83 const bool is_plain_text = true;
81 84
82 // Language or CLD2::Language 85 // Language or CLD2::Language
83 int cld_language = 0; 86 int cld_language = 0;
84 bool is_valid_language = false; 87 bool is_valid_language = false;
85 88
86 #if CLD_VERSION==1 89 #if CLD_VERSION==1
87 int num_languages = 0; 90 int num_languages = 0;
88 cld_language = DetectLanguageOfUnicodeText( 91 cld_language = DetectLanguageOfUnicodeText(NULL, text.c_str(), is_plain_text,
89 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, 92 &is_reliable, &num_languages, NULL,
90 &num_bytes_evaluated); 93 &num_bytes_evaluated);
91 is_valid_language = cld_language != NUM_LANGUAGES && 94 is_valid_language = cld_language != NUM_LANGUAGES &&
92 cld_language != UNKNOWN_LANGUAGE && 95 cld_language != UNKNOWN_LANGUAGE &&
93 cld_language != TG_UNKNOWN_LANGUAGE; 96 cld_language != TG_UNKNOWN_LANGUAGE;
94 #elif CLD_VERSION==2 97 #elif CLD_VERSION==2
95 const std::string utf8_text(base::UTF16ToUTF8(text)); 98 const std::string utf8_text(base::UTF16ToUTF8(text));
96 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); 99 const int num_utf8_bytes = static_cast<int>(utf8_text.size());
97 const char* raw_utf8_bytes = utf8_text.c_str(); 100 const char* raw_utf8_bytes = utf8_text.c_str();
98 cld_language = CLD2::DetectLanguageCheckUTF8( 101
99 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, 102 CLD2::Language language3[3];
103 int percent3[3];
104 int flags = 0; // No flags, see compact_lang_det.h for details.
105 int text_bytes; // Amount of non-tag/letters-only text (assumed 0).
106 double normalized_score3[3];
107
108 const char* tld_hint = "";
109 int encoding_hint = CLD2::UNKNOWN_ENCODING;
110 CLD2::Language language_hint = CLD2::GetLanguageFromName(html_lang.c_str());
111 CLD2::CLDHints cldhints = {code.c_str(), tld_hint, encoding_hint,
112 language_hint};
113
114 cld_language = CLD2::ExtDetectLanguageSummaryCheckUTF8(
115 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
116 language3, percent3, normalized_score3,
117 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable,
100 &num_bytes_evaluated); 118 &num_bytes_evaluated);
101 119
102 if (num_bytes_evaluated < num_utf8_bytes && 120 if (num_bytes_evaluated < num_utf8_bytes &&
103 cld_language == CLD2::UNKNOWN_LANGUAGE) { 121 cld_language == CLD2::UNKNOWN_LANGUAGE) {
104 // Invalid UTF8 encountered, see bug http://crbug.com/444258. 122 // Invalid UTF8 encountered, see bug http://crbug.com/444258.
105 // Retry using only the valid characters. This time the check for valid 123 // Retry using only the valid characters. This time the check for valid
106 // UTF8 can be skipped since the precise number of valid bytes is known. 124 // UTF8 can be skipped since the precise number of valid bytes is known.
107 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, 125 cld_language = CLD2::ExtDetectLanguageSummary(
108 is_plain_text, &is_reliable); 126 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
127 language3, percent3, normalized_score3,
128 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable);
109 } 129 }
110 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && 130 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
111 cld_language != CLD2::UNKNOWN_LANGUAGE && 131 cld_language != CLD2::UNKNOWN_LANGUAGE &&
112 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 132 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
133
134 // Choose top language.
135 cld_language = language3[0];
113 #else 136 #else
114 # error "CLD_VERSION must be 1 or 2" 137 # error "CLD_VERSION must be 1 or 2"
115 #endif 138 #endif
116 139
117 if (is_cld_reliable != NULL) 140 if (is_cld_reliable != NULL)
118 *is_cld_reliable = is_reliable; 141 *is_cld_reliable = is_reliable;
119 142
120 // We don't trust the result if the CLD reports that the detection is not 143 // We don't trust the result if the CLD reports that the detection is not
121 // reliable, or if the actual text used to detect the language was less than 144 // reliable, or if the actual text used to detect the language was less than
122 // 100 bytes (short texts can often lead to wrong results). 145 // 100 bytes (short texts can often lead to wrong results).
(...skipping 51 matching lines...) Expand 10 before | Expand all | Expand 10 after
174 197
175 namespace translate { 198 namespace translate {
176 199
177 std::string DeterminePageLanguage(const std::string& code, 200 std::string DeterminePageLanguage(const std::string& code,
178 const std::string& html_lang, 201 const std::string& html_lang,
179 const base::string16& contents, 202 const base::string16& contents,
180 std::string* cld_language_p, 203 std::string* cld_language_p,
181 bool* is_cld_reliable_p) { 204 bool* is_cld_reliable_p) {
182 base::TimeTicks begin_time = base::TimeTicks::Now(); 205 base::TimeTicks begin_time = base::TimeTicks::Now();
183 bool is_cld_reliable; 206 bool is_cld_reliable;
184 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
185 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
186
187 if (cld_language_p != NULL)
188 *cld_language_p = cld_language;
189 if (is_cld_reliable_p != NULL)
190 *is_cld_reliable_p = is_cld_reliable;
191 translate::ToTranslateLanguageSynonym(&cld_language);
192
193 // Check if html lang attribute is valid. 207 // Check if html lang attribute is valid.
194 std::string modified_html_lang; 208 std::string modified_html_lang;
195 if (!html_lang.empty()) { 209 if (!html_lang.empty()) {
196 modified_html_lang = html_lang; 210 modified_html_lang = html_lang;
197 ApplyLanguageCodeCorrection(&modified_html_lang); 211 ApplyLanguageCodeCorrection(&modified_html_lang);
198 translate::ReportHtmlLang(html_lang, modified_html_lang); 212 translate::ReportHtmlLang(html_lang, modified_html_lang);
199 VLOG(9) << "html lang based language code: " << modified_html_lang; 213 VLOG(9) << "html lang based language code: " << modified_html_lang;
200 } 214 }
201 215
202 // Check if Content-Language is valid. 216 // Check if Content-Language is valid.
203 std::string modified_code; 217 std::string modified_code;
204 if (!code.empty()) { 218 if (!code.empty()) {
205 modified_code = code; 219 modified_code = code;
206 ApplyLanguageCodeCorrection(&modified_code); 220 ApplyLanguageCodeCorrection(&modified_code);
207 translate::ReportContentLanguage(code, modified_code); 221 translate::ReportContentLanguage(code, modified_code);
208 } 222 }
209 223
224 std::string cld_language = DetermineTextLanguage(
225 contents, &is_cld_reliable, modified_code, modified_html_lang);
226 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
227
228 if (cld_language_p != NULL)
229 *cld_language_p = cld_language;
230 if (is_cld_reliable_p != NULL)
231 *is_cld_reliable_p = is_cld_reliable;
232 translate::ToTranslateLanguageSynonym(&cld_language);
233
210 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt 234 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
211 // |modified_code|. 235 // |modified_code|.
212 std::string language = modified_html_lang.empty() ? modified_code : 236 std::string language = modified_html_lang.empty() ? modified_code :
213 modified_html_lang; 237 modified_html_lang;
214 238
215 // If |language| is empty, just use CLD result even though it might be 239 // If |language| is empty, just use CLD result even though it might be
216 // translate::kUnknownLanguageCode. 240 // translate::kUnknownLanguageCode.
217 if (language.empty()) { 241 if (language.empty()) {
218 translate::ReportLanguageVerification( 242 translate::ReportLanguageVerification(
219 translate::LANGUAGE_VERIFICATION_CLD_ONLY); 243 translate::LANGUAGE_VERIFICATION_CLD_ONLY);
(...skipping 138 matching lines...) Expand 10 before | Expand all | Expand 10 after
358 // distinguish from English, and the language is one of well-known languages 382 // distinguish from English, and the language is one of well-known languages
359 // which often provide "en-*" meta information mistakenly. 383 // which often provide "en-*" meta information mistakenly.
360 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 384 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
361 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 385 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
362 return true; 386 return true;
363 } 387 }
364 return false; 388 return false;
365 } 389 }
366 390
367 } // namespace translate 391 } // namespace translate
OLDNEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698