| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/translate/core/language_detection/language_detection_util.h" | 5 #include "components/translate/core/language_detection/language_detection_util.h" |
| 6 | 6 |
| 7 #include <stddef.h> | 7 #include <stddef.h> |
| 8 | 8 |
| 9 #include "base/logging.h" | 9 #include "base/logging.h" |
| 10 #include "base/macros.h" | 10 #include "base/macros.h" |
| 11 #include "base/metrics/histogram_macros.h" | 11 #include "base/metrics/histogram_macros.h" |
| 12 #include "base/strings/string_split.h" | 12 #include "base/strings/string_split.h" |
| 13 #include "base/strings/string_util.h" | 13 #include "base/strings/string_util.h" |
| 14 #include "base/strings/utf_string_conversions.h" | 14 #include "base/strings/utf_string_conversions.h" |
| 15 #include "base/time/time.h" | 15 #include "base/time/time.h" |
| 16 #include "components/translate/core/common/translate_constants.h" | 16 #include "components/translate/core/common/translate_constants.h" |
| 17 #include "components/translate/core/common/translate_metrics.h" | 17 #include "components/translate/core/common/translate_metrics.h" |
| 18 #include "components/translate/core/common/translate_util.h" | 18 #include "components/translate/core/common/translate_util.h" |
| 19 | |
| 20 #if CLD_VERSION==1 | |
| 21 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | |
| 22 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | |
| 23 #endif | |
| 24 | |
| 25 #if CLD_VERSION==2 | |
| 26 #include "third_party/cld_2/src/public/compact_lang_det.h" | 19 #include "third_party/cld_2/src/public/compact_lang_det.h" |
| 27 #include "third_party/cld_2/src/public/encodings.h" | 20 #include "third_party/cld_2/src/public/encodings.h" |
| 28 #endif | |
| 29 | 21 |
| 30 namespace { | 22 namespace { |
| 31 | 23 |
| 32 // Similar language code list. Some languages are very similar and difficult | 24 // Similar language code list. Some languages are very similar and difficult |
| 33 // for CLD to distinguish. | 25 // for CLD to distinguish. |
| 34 struct SimilarLanguageCode { | 26 struct SimilarLanguageCode { |
| 35 const char* const code; | 27 const char* const code; |
| 36 int group; | 28 int group; |
| 37 }; | 29 }; |
| 38 | 30 |
| (...skipping 44 matching lines...) |
| 83 std::string& html_lang) { | 75 std::string& html_lang) { |
| 84 std::string language = translate::kUnknownLanguageCode; | 76 std::string language = translate::kUnknownLanguageCode; |
| 85 int num_bytes_evaluated = 0; | 77 int num_bytes_evaluated = 0; |
| 86 bool is_reliable = false; | 78 bool is_reliable = false; |
| 87 const bool is_plain_text = true; | 79 const bool is_plain_text = true; |
| 88 | 80 |
| 89 // Language or CLD2::Language | 81 // Language or CLD2::Language |
| 90 int cld_language = 0; | 82 int cld_language = 0; |
| 91 bool is_valid_language = false; | 83 bool is_valid_language = false; |
| 92 | 84 |
| 93 #if CLD_VERSION==1 | |
| 94 int num_languages = 0; | |
| 95 cld_language = DetectLanguageOfUnicodeText(NULL, text.c_str(), is_plain_text, | |
| 96 &is_reliable, &num_languages, NULL, | |
| 97 &num_bytes_evaluated); | |
| 98 is_valid_language = cld_language != NUM_LANGUAGES && | |
| 99 cld_language != UNKNOWN_LANGUAGE && | |
| 100 cld_language != TG_UNKNOWN_LANGUAGE; | |
| 101 #elif CLD_VERSION==2 | |
| 102 const std::string utf8_text(base::UTF16ToUTF8(text)); | 85 const std::string utf8_text(base::UTF16ToUTF8(text)); |
| 103 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); | 86 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); |
| 104 const char* raw_utf8_bytes = utf8_text.c_str(); | 87 const char* raw_utf8_bytes = utf8_text.c_str(); |
| 105 | 88 |
| 106 CLD2::Language language3[3]; | 89 CLD2::Language language3[3]; |
| 107 int percent3[3]; | 90 int percent3[3]; |
| 108 int flags = 0; // No flags, see compact_lang_det.h for details. | 91 int flags = 0; // No flags, see compact_lang_det.h for details. |
| 109 int text_bytes; // Amount of non-tag/letters-only text (assumed 0). | 92 int text_bytes; // Amount of non-tag/letters-only text (assumed 0). |
| 110 double normalized_score3[3]; | 93 double normalized_score3[3]; |
| 111 | 94 |
| (...skipping 23 matching lines...) |
| 135 cld_language != CLD2::UNKNOWN_LANGUAGE && | 118 cld_language != CLD2::UNKNOWN_LANGUAGE && |
| 136 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | 119 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
| 137 | 120 |
| 138 // Choose top language. | 121 // Choose top language. |
| 139 cld_language = language3[0]; | 122 cld_language = language3[0]; |
| 140 UMA_HISTOGRAM_ENUMERATION("Translate.CLD2.LanguageDetected", | 123 UMA_HISTOGRAM_ENUMERATION("Translate.CLD2.LanguageDetected", |
| 141 cld_language, CLD2::NUM_LANGUAGES); | 124 cld_language, CLD2::NUM_LANGUAGES); |
| 142 if (is_valid_language) | 125 if (is_valid_language) |
| 143 UMA_HISTOGRAM_PERCENTAGE("Translate.CLD2.LanguageAccuracy", percent3[0]); | 126 UMA_HISTOGRAM_PERCENTAGE("Translate.CLD2.LanguageAccuracy", percent3[0]); |
| 144 | 127 |
| 145 | |
| 146 #else | |
| 147 # error "CLD_VERSION must be 1 or 2" | |
| 148 #endif | |
| 149 | |
| 150 if (is_cld_reliable != NULL) | 128 if (is_cld_reliable != NULL) |
| 151 *is_cld_reliable = is_reliable; | 129 *is_cld_reliable = is_reliable; |
| 152 | 130 |
| 153 // We don't trust the result if the CLD reports that the detection is not | 131 // We don't trust the result if the CLD reports that the detection is not |
| 154 // reliable, or if the actual text used to detect the language was less than | 132 // reliable, or if the actual text used to detect the language was less than |
| 155 // 100 bytes (short texts can often lead to wrong results). | 133 // 100 bytes (short texts can often lead to wrong results). |
| 156 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | 134 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that |
| 157 // the determined language code is correct with 50% confidence. Chrome should | 135 // the determined language code is correct with 50% confidence. Chrome should |
| 158 // handle the real confidence value to judge. | 136 // handle the real confidence value to judge. |
| 159 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { | 137 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { |
| 160 // We should not use LanguageCode_ISO_639_1 because it does not cover all | 138 // We should not use LanguageCode_ISO_639_1 because it does not cover all |
| 161 // the languages CLD can detect. As a result, it'll return the invalid | 139 // the languages CLD can detect. As a result, it'll return the invalid |
| 162 // language code for traditional Chinese among others. | 140 // language code for traditional Chinese among others. |
| 163 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and | 141 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and |
| 164 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 142 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
| 165 // for Simplified Chinese. | 143 // for Simplified Chinese. |
| 166 #if CLD_VERSION==1 | 144 // |
| 167 language = LanguageCodeWithDialects(static_cast<Language>(cld_language)); | |
| 168 #elif CLD_VERSION==2 | |
| 169 // (1) CLD2's LanguageCode returns general Chinese 'zh' for | 145 // (1) CLD2's LanguageCode returns general Chinese 'zh' for |
| 170 // CLD2::CHINESE, but Translate server doesn't accept it. This is | 146 // CLD2::CHINESE, but Translate server doesn't accept it. This is |
| 171 // converted to 'zh-CN' in the same way as CLD1's | 147 // converted to 'zh-CN' in the same way as CLD1's |
| 172 // LanguageCodeWithDialects. | 148 // LanguageCodeWithDialects. |
| 173 // | 149 // |
| 174 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for | 150 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for |
| 175 // CLD2::CHINESE_T. This is technically more precise for the language | 151 // CLD2::CHINESE_T. This is technically more precise for the language |
| 176 // code of traditional Chinese, while Translate server hasn't accepted | 152 // code of traditional Chinese, while Translate server hasn't accepted |
| 177 // zh-Hant yet. | 153 // zh-Hant yet. |
| 178 if (cld_language == CLD2::CHINESE) | 154 if (cld_language == CLD2::CHINESE) |
| 179 language = "zh-CN"; | 155 language = "zh-CN"; |
| 180 else if (cld_language == CLD2::CHINESE_T) | 156 else if (cld_language == CLD2::CHINESE_T) |
| 181 language = "zh-TW"; | 157 language = "zh-TW"; |
| 182 else | 158 else |
| 183 language = CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); | 159 language = CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); |
| 184 #else | |
| 185 # error "CLD_VERSION must be 1 or 2" | |
| 186 #endif | |
| 187 } | 160 } |
| 188 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | 161 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text |
| 189 << "\n*************************************\n"; | 162 << "\n*************************************\n"; |
| 190 return language; | 163 return language; |
| 191 } | 164 } |
| 192 | 165 |
| 193 // Checks if CLD can complement a sub code when the page language doesn't know | 166 // Checks if CLD can complement a sub code when the page language doesn't know |
| 194 // the sub code. | 167 // the sub code. |
| 195 bool CanCLDComplementSubCode( | 168 bool CanCLDComplementSubCode( |
| 196 const std::string& page_language, const std::string& cld_language) { | 169 const std::string& page_language, const std::string& cld_language) { |
| (...skipping 195 matching lines...) |
| 392 // distinguish from English, and the language is one of well-known languages | 365 // distinguish from English, and the language is one of well-known languages |
| 393 // which often provide "en-*" meta information mistakenly. | 366 // which often provide "en-*" meta information mistakenly. |
| 394 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 367 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 395 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 368 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 396 return true; | 369 return true; |
| 397 } | 370 } |
| 398 return false; | 371 return false; |
| 399 } | 372 } |
| 400 | 373 |
| 401 } // namespace translate | 374 } // namespace translate |
| OLD | NEW |