| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/translate/core/language_detection/language_detection_util.h
" | 5 #include "components/translate/core/language_detection/language_detection_util.h
" |
| 6 | 6 |
| 7 #include <stddef.h> | 7 #include <stddef.h> |
| 8 | 8 |
| 9 #include "base/logging.h" | 9 #include "base/logging.h" |
| 10 #include "base/macros.h" | 10 #include "base/macros.h" |
| 11 #include "base/metrics/histogram_base.h" | 11 #include "base/metrics/histogram_base.h" |
| 12 #include "base/metrics/histogram_macros.h" | 12 #include "base/metrics/histogram_macros.h" |
| 13 #include "base/metrics/metrics_hashes.h" | 13 #include "base/metrics/metrics_hashes.h" |
| 14 #include "base/metrics/sparse_histogram.h" | 14 #include "base/metrics/sparse_histogram.h" |
| 15 #include "base/strings/string_split.h" | 15 #include "base/strings/string_split.h" |
| 16 #include "base/strings/string_util.h" | 16 #include "base/strings/string_util.h" |
| 17 #include "base/strings/utf_string_conversions.h" | 17 #include "base/strings/utf_string_conversions.h" |
| 18 #include "base/time/time.h" | 18 #include "base/time/time.h" |
| 19 #include "components/translate/core/common/translate_constants.h" | 19 #include "components/translate/core/common/translate_constants.h" |
| 20 #include "components/translate/core/common/translate_metrics.h" | 20 #include "components/translate/core/common/translate_metrics.h" |
| 21 #include "components/translate/core/common/translate_util.h" | 21 #include "components/translate/core/common/translate_util.h" |
| 22 #include "components/translate/core/language_detection/chinese_script_classifier
.h" |
| 22 #include "third_party/cld/cld_version.h" | 23 #include "third_party/cld/cld_version.h" |
| 23 | 24 |
| 24 #if BUILDFLAG(CLD_VERSION) == 2 | 25 #if BUILDFLAG(CLD_VERSION) == 2 |
| 25 #include "third_party/cld_2/src/public/compact_lang_det.h" | 26 #include "third_party/cld_2/src/public/compact_lang_det.h" |
| 26 #include "third_party/cld_2/src/public/encodings.h" | 27 #include "third_party/cld_2/src/public/encodings.h" |
| 27 #elif BUILDFLAG(CLD_VERSION) == 3 | 28 #elif BUILDFLAG(CLD_VERSION) == 3 |
| 28 #include "third_party/cld_3/src/src/nnet_language_identifier.h" | 29 #include "third_party/cld_3/src/src/nnet_language_identifier.h" |
| 29 #else | 30 #else |
| 30 # error "CLD_VERSION must be 2 or 3" | 31 # error "CLD_VERSION must be 2 or 3" |
| 31 #endif | 32 #endif |
| (...skipping 168 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 200 // Ignore unreliable, "unknown", and xx-Latn predictions that are currently | 201 // Ignore unreliable, "unknown", and xx-Latn predictions that are currently |
| 201 // not supported. | 202 // not supported. |
| 202 if (prediction_reliable && | 203 if (prediction_reliable && |
| 203 predicted_language != "bg-Latn" && | 204 predicted_language != "bg-Latn" && |
| 204 predicted_language != "el-Latn" && | 205 predicted_language != "el-Latn" && |
| 205 predicted_language != "ja-Latn" && | 206 predicted_language != "ja-Latn" && |
| 206 predicted_language != "ru-Latn" && | 207 predicted_language != "ru-Latn" && |
| 207 predicted_language != "zh-Latn" && | 208 predicted_language != "zh-Latn" && |
| 208 predicted_language != | 209 predicted_language != |
| 209 chrome_lang_id::NNetLanguageIdentifier::kUnknown) { | 210 chrome_lang_id::NNetLanguageIdentifier::kUnknown) { |
| 210 // CLD3 returns 'zh' for Chinese but Translate doesn't accept it. Thus, | 211 if (predicted_language != "zh") { |
| 211 // analogously to CLD2, 'zh-CN' is returned instead. | 212 language = predicted_language; |
| 212 if (predicted_language == "zh") { | |
| 213 language = "zh-CN"; | |
| 214 } else { | 213 } else { |
| 215 language = predicted_language; | 214 // If prediction is "zh" (Chinese), then we need to determine whether the |
| 215 // text is zh-Hant (Chinese Traditional) or zh-Hans (Chinese Simplified). |
| 216 translate::ChineseScriptClassifier zh_classifier; |
| 217 |
| 218 // The Classify function returns either "zh-Hant" or "zh-Hans". |
| 219 // Convert to the old-style language codes used by the Translate API. |
| 220 const std::string zh_classification = zh_classifier.Classify(utf8_text); |
| 221 if (zh_classification == "zh-Hant") { |
| 222 language = "zh-TW"; |
| 223 } else if (zh_classification == "zh-Hans") { |
| 224 language = "zh-CN"; |
| 225 } else { |
| 226 language = translate::kUnknownLanguageCode; |
| 227 } |
| 216 } | 228 } |
| 217 } | 229 } |
| 218 | |
| 219 #else | 230 #else |
| 220 # error "CLD_VERSION must be 2 or 3" | 231 # error "CLD_VERSION must be 2 or 3" |
| 221 #endif | 232 #endif |
| 222 | 233 |
| 234 VLOG(1) << "Detected language: " << language; |
| 223 return language; | 235 return language; |
| 224 } | 236 } |
| 225 | 237 |
| 226 // Checks if CLD can complement a sub code when the page language doesn't know | 238 // Checks if CLD can complement a sub code when the page language doesn't know |
| 227 // the sub code. | 239 // the sub code. |
| 228 bool CanCLDComplementSubCode( | 240 bool CanCLDComplementSubCode( |
| 229 const std::string& page_language, const std::string& cld_language) { | 241 const std::string& page_language, const std::string& cld_language) { |
| 230 // Translate server cannot treat general Chinese. If Content-Language and | 242 // Translate server cannot treat general Chinese. If Content-Language and |
| 231 // CLD agree that the language is Chinese and Content-Language doesn't know | 243 // CLD agree that the language is Chinese and Content-Language doesn't know |
| 232 // which dialect is used, CLD language has priority. | 244 // which dialect is used, CLD language has priority. |
| (...skipping 192 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 425 // distinguish from English, and the language is one of well-known languages | 437 // distinguish from English, and the language is one of well-known languages |
| 426 // which often provide "en-*" meta information mistakenly. | 438 // which often provide "en-*" meta information mistakenly. |
| 427 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 439 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 428 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 440 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 429 return true; | 441 return true; |
| 430 } | 442 } |
| 431 return false; | 443 return false; |
| 432 } | 444 } |
| 433 | 445 |
| 434 } // namespace translate | 446 } // namespace translate |
| OLD | NEW |