Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(564)

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1920853002: Drop support for Compact Language Detector v1. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Preserve yfriedman@ comment in chrome/android/chrome_apk.gyp Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « components/translate/core/language_detection/BUILD.gn ('k') | extensions/BUILD.gn » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/translate/core/language_detection/language_detection_util.h" 5 #include "components/translate/core/language_detection/language_detection_util.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/macros.h" 10 #include "base/macros.h"
11 #include "base/metrics/histogram_macros.h" 11 #include "base/metrics/histogram_macros.h"
12 #include "base/strings/string_split.h" 12 #include "base/strings/string_split.h"
13 #include "base/strings/string_util.h" 13 #include "base/strings/string_util.h"
14 #include "base/strings/utf_string_conversions.h" 14 #include "base/strings/utf_string_conversions.h"
15 #include "base/time/time.h" 15 #include "base/time/time.h"
16 #include "components/translate/core/common/translate_constants.h" 16 #include "components/translate/core/common/translate_constants.h"
17 #include "components/translate/core/common/translate_metrics.h" 17 #include "components/translate/core/common/translate_metrics.h"
18 #include "components/translate/core/common/translate_util.h" 18 #include "components/translate/core/common/translate_util.h"
19
20 #if CLD_VERSION==1
21 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
22 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
23 #endif
24
25 #if CLD_VERSION==2
26 #include "third_party/cld_2/src/public/compact_lang_det.h" 19 #include "third_party/cld_2/src/public/compact_lang_det.h"
27 #include "third_party/cld_2/src/public/encodings.h" 20 #include "third_party/cld_2/src/public/encodings.h"
28 #endif
29 21
30 namespace { 22 namespace {
31 23
32 // Similar language code list. Some languages are very similar and difficult 24 // Similar language code list. Some languages are very similar and difficult
33 // for CLD to distinguish. 25 // for CLD to distinguish.
34 struct SimilarLanguageCode { 26 struct SimilarLanguageCode {
35 const char* const code; 27 const char* const code;
36 int group; 28 int group;
37 }; 29 };
38 30
(...skipping 44 matching lines...) Expand 10 before | Expand all | Expand 10 after
83 std::string& html_lang) { 75 std::string& html_lang) {
84 std::string language = translate::kUnknownLanguageCode; 76 std::string language = translate::kUnknownLanguageCode;
85 int num_bytes_evaluated = 0; 77 int num_bytes_evaluated = 0;
86 bool is_reliable = false; 78 bool is_reliable = false;
87 const bool is_plain_text = true; 79 const bool is_plain_text = true;
88 80
89 // Language or CLD2::Language 81 // Language or CLD2::Language
90 int cld_language = 0; 82 int cld_language = 0;
91 bool is_valid_language = false; 83 bool is_valid_language = false;
92 84
93 #if CLD_VERSION==1
94 int num_languages = 0;
95 cld_language = DetectLanguageOfUnicodeText(NULL, text.c_str(), is_plain_text,
96 &is_reliable, &num_languages, NULL,
97 &num_bytes_evaluated);
98 is_valid_language = cld_language != NUM_LANGUAGES &&
99 cld_language != UNKNOWN_LANGUAGE &&
100 cld_language != TG_UNKNOWN_LANGUAGE;
101 #elif CLD_VERSION==2
102 const std::string utf8_text(base::UTF16ToUTF8(text)); 85 const std::string utf8_text(base::UTF16ToUTF8(text));
103 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); 86 const int num_utf8_bytes = static_cast<int>(utf8_text.size());
104 const char* raw_utf8_bytes = utf8_text.c_str(); 87 const char* raw_utf8_bytes = utf8_text.c_str();
105 88
106 CLD2::Language language3[3]; 89 CLD2::Language language3[3];
107 int percent3[3]; 90 int percent3[3];
108 int flags = 0; // No flags, see compact_lang_det.h for details. 91 int flags = 0; // No flags, see compact_lang_det.h for details.
109 int text_bytes; // Amount of non-tag/letters-only text (assumed 0). 92 int text_bytes; // Amount of non-tag/letters-only text (assumed 0).
110 double normalized_score3[3]; 93 double normalized_score3[3];
111 94
(...skipping 23 matching lines...) Expand all
135 cld_language != CLD2::UNKNOWN_LANGUAGE && 118 cld_language != CLD2::UNKNOWN_LANGUAGE &&
136 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 119 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
137 120
138 // Choose top language. 121 // Choose top language.
139 cld_language = language3[0]; 122 cld_language = language3[0];
140 UMA_HISTOGRAM_ENUMERATION("Translate.CLD2.LanguageDetected", 123 UMA_HISTOGRAM_ENUMERATION("Translate.CLD2.LanguageDetected",
141 cld_language, CLD2::NUM_LANGUAGES); 124 cld_language, CLD2::NUM_LANGUAGES);
142 if (is_valid_language) 125 if (is_valid_language)
143 UMA_HISTOGRAM_PERCENTAGE("Translate.CLD2.LanguageAccuracy", percent3[0]); 126 UMA_HISTOGRAM_PERCENTAGE("Translate.CLD2.LanguageAccuracy", percent3[0]);
144 127
145
146 #else
147 # error "CLD_VERSION must be 1 or 2"
148 #endif
149
150 if (is_cld_reliable != NULL) 128 if (is_cld_reliable != NULL)
151 *is_cld_reliable = is_reliable; 129 *is_cld_reliable = is_reliable;
152 130
153 // We don't trust the result if the CLD reports that the detection is not 131 // We don't trust the result if the CLD reports that the detection is not
154 // reliable, or if the actual text used to detect the language was less than 132 // reliable, or if the actual text used to detect the language was less than
155 // 100 bytes (short texts can often lead to wrong results). 133 // 100 bytes (short texts can often lead to wrong results).
156 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that 134 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
157 // the determined language code is correct with 50% confidence. Chrome should 135 // the determined language code is correct with 50% confidence. Chrome should
158 // handle the real confidence value to judge. 136 // handle the real confidence value to judge.
159 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { 137 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) {
160 // We should not use LanguageCode_ISO_639_1 because it does not cover all 138 // We should not use LanguageCode_ISO_639_1 because it does not cover all
161 // the languages CLD can detect. As a result, it'll return the invalid 139 // the languages CLD can detect. As a result, it'll return the invalid
162 // language code for traditional Chinese among others. 140 // language code for traditional Chinese among others.
163 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and 141 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
164 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 142 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
165 // for Simplified Chinese. 143 // for Simplified Chinese.
166 #if CLD_VERSION==1 144 //
167 language = LanguageCodeWithDialects(static_cast<Language>(cld_language));
168 #elif CLD_VERSION==2
169 // (1) CLD2's LanguageCode returns general Chinese 'zh' for 145 // (1) CLD2's LanguageCode returns general Chinese 'zh' for
170 // CLD2::CHINESE, but Translate server doesn't accept it. This is 146 // CLD2::CHINESE, but Translate server doesn't accept it. This is
171 // converted to 'zh-CN' in the same way as CLD1's 147 // converted to 'zh-CN' in the same way as CLD1's
172 // LanguageCodeWithDialects. 148 // LanguageCodeWithDialects.
173 // 149 //
174 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for 150 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
175 // CLD2::CHINESE_T. This is technically more precise for the language 151 // CLD2::CHINESE_T. This is technically more precise for the language
176 // code of traditional Chinese, while Translate server hasn't accepted 152 // code of traditional Chinese, while Translate server hasn't accepted
177 // zh-Hant yet. 153 // zh-Hant yet.
178 if (cld_language == CLD2::CHINESE) 154 if (cld_language == CLD2::CHINESE)
179 language = "zh-CN"; 155 language = "zh-CN";
180 else if (cld_language == CLD2::CHINESE_T) 156 else if (cld_language == CLD2::CHINESE_T)
181 language = "zh-TW"; 157 language = "zh-TW";
182 else 158 else
183 language = CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); 159 language = CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
184 #else
185 # error "CLD_VERSION must be 1 or 2"
186 #endif
187 } 160 }
188 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text 161 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
189 << "\n*************************************\n"; 162 << "\n*************************************\n";
190 return language; 163 return language;
191 } 164 }
192 165
193 // Checks if CLD can complement a sub code when the page language doesn't know 166 // Checks if CLD can complement a sub code when the page language doesn't know
194 // the sub code. 167 // the sub code.
195 bool CanCLDComplementSubCode( 168 bool CanCLDComplementSubCode(
196 const std::string& page_language, const std::string& cld_language) { 169 const std::string& page_language, const std::string& cld_language) {
(...skipping 195 matching lines...) Expand 10 before | Expand all | Expand 10 after
392 // distinguish from English, and the language is one of well-known languages 365 // distinguish from English, and the language is one of well-known languages
393 // which often provide "en-*" meta information mistakenly. 366 // which often provide "en-*" meta information mistakenly.
394 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 367 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
395 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 368 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
396 return true; 369 return true;
397 } 370 }
398 return false; 371 return false;
399 } 372 }
400 373
401 } // namespace translate 374 } // namespace translate
OLD | NEW
« no previous file with comments | « components/translate/core/language_detection/BUILD.gn ('k') | extensions/BUILD.gn » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698