Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(318)

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 2756313002: [Merge M-58] Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh. (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « components/translate/core/language_detection/chinese_script_classifier_test.cc ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/translate/core/language_detection/language_detection_util.h" 5 #include "components/translate/core/language_detection/language_detection_util.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/macros.h" 10 #include "base/macros.h"
11 #include "base/metrics/histogram_base.h" 11 #include "base/metrics/histogram_base.h"
12 #include "base/metrics/histogram_macros.h" 12 #include "base/metrics/histogram_macros.h"
13 #include "base/metrics/metrics_hashes.h" 13 #include "base/metrics/metrics_hashes.h"
14 #include "base/metrics/sparse_histogram.h" 14 #include "base/metrics/sparse_histogram.h"
15 #include "base/strings/string_split.h" 15 #include "base/strings/string_split.h"
16 #include "base/strings/string_util.h" 16 #include "base/strings/string_util.h"
17 #include "base/strings/utf_string_conversions.h" 17 #include "base/strings/utf_string_conversions.h"
18 #include "base/time/time.h" 18 #include "base/time/time.h"
19 #include "components/translate/core/common/translate_constants.h" 19 #include "components/translate/core/common/translate_constants.h"
20 #include "components/translate/core/common/translate_metrics.h" 20 #include "components/translate/core/common/translate_metrics.h"
21 #include "components/translate/core/common/translate_util.h" 21 #include "components/translate/core/common/translate_util.h"
22 #include "components/translate/core/language_detection/chinese_script_classifier.h"
22 #include "third_party/cld/cld_version.h" 23 #include "third_party/cld/cld_version.h"
23 24
24 #if BUILDFLAG(CLD_VERSION) == 2 25 #if BUILDFLAG(CLD_VERSION) == 2
25 #include "third_party/cld_2/src/public/compact_lang_det.h" 26 #include "third_party/cld_2/src/public/compact_lang_det.h"
26 #include "third_party/cld_2/src/public/encodings.h" 27 #include "third_party/cld_2/src/public/encodings.h"
27 #elif BUILDFLAG(CLD_VERSION) == 3 28 #elif BUILDFLAG(CLD_VERSION) == 3
28 #include "third_party/cld_3/src/src/nnet_language_identifier.h" 29 #include "third_party/cld_3/src/src/nnet_language_identifier.h"
29 #else 30 #else
30 # error "CLD_VERSION must be 2 or 3" 31 # error "CLD_VERSION must be 2 or 3"
31 #endif 32 #endif
(...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after
200 // Ignore unreliable, "unknown", and xx-Latn predictions that are currently 201 // Ignore unreliable, "unknown", and xx-Latn predictions that are currently
201 // not supported. 202 // not supported.
202 if (prediction_reliable && 203 if (prediction_reliable &&
203 predicted_language != "bg-Latn" && 204 predicted_language != "bg-Latn" &&
204 predicted_language != "el-Latn" && 205 predicted_language != "el-Latn" &&
205 predicted_language != "ja-Latn" && 206 predicted_language != "ja-Latn" &&
206 predicted_language != "ru-Latn" && 207 predicted_language != "ru-Latn" &&
207 predicted_language != "zh-Latn" && 208 predicted_language != "zh-Latn" &&
208 predicted_language != 209 predicted_language !=
209 chrome_lang_id::NNetLanguageIdentifier::kUnknown) { 210 chrome_lang_id::NNetLanguageIdentifier::kUnknown) {
210 // CLD3 returns 'zh' for Chinese but Translate doesn't accept it. Thus, 211 if (predicted_language != "zh") {
211 // analogously to CLD2, 'zh-CN' is returned instead. 212 language = predicted_language;
212 if (predicted_language == "zh") {
213 language = "zh-CN";
214 } else { 213 } else {
215 language = predicted_language; 214 // If prediction is "zh" (Chinese), then we need to determine whether the
215 // text is zh-Hant (Chinese Traditional) or zh-Hans (Chinese Simplified).
216 translate::ChineseScriptClassifier zh_classifier;
217
218 // The Classify function returns either "zh-Hant" or "zh-Hans".
219 // Convert to the old-style language codes used by the Translate API.
220 const std::string zh_classification = zh_classifier.Classify(utf8_text);
221 if (zh_classification == "zh-Hant") {
222 language = "zh-TW";
223 } else if (zh_classification == "zh-Hans") {
224 language = "zh-CN";
225 } else {
226 language = translate::kUnknownLanguageCode;
227 }
216 } 228 }
217 } 229 }
218
219 #else 230 #else
220 # error "CLD_VERSION must be 2 or 3" 231 # error "CLD_VERSION must be 2 or 3"
221 #endif 232 #endif
222 233
234 VLOG(1) << "Detected language: " << language;
223 return language; 235 return language;
224 } 236 }
225 237
226 // Checks if CLD can complement a sub code when the page language doesn't know 238 // Checks if CLD can complement a sub code when the page language doesn't know
227 // the sub code. 239 // the sub code.
228 bool CanCLDComplementSubCode( 240 bool CanCLDComplementSubCode(
229 const std::string& page_language, const std::string& cld_language) { 241 const std::string& page_language, const std::string& cld_language) {
230 // Translate server cannot treat general Chinese. If Content-Language and 242 // Translate server cannot treat general Chinese. If Content-Language and
231 // CLD agree that the language is Chinese and Content-Language doesn't know 243 // CLD agree that the language is Chinese and Content-Language doesn't know
232 // which dialect is used, CLD language has priority. 244 // which dialect is used, CLD language has priority.
(...skipping 192 matching lines...) Expand 10 before | Expand all | Expand 10 after
425 // distinguish from English, and the language is one of well-known languages 437 // distinguish from English, and the language is one of well-known languages
426 // which often provide "en-*" meta information mistakenly. 438 // which often provide "en-*" meta information mistakenly.
427 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 439 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
428 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 440 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
429 return true; 441 return true;
430 } 442 }
431 return false; 443 return false;
432 } 444 }
433 445
434 } // namespace translate 446 } // namespace translate
OLDNEW
« no previous file with comments | « components/translate/core/language_detection/chinese_script_classifier_test.cc ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698