| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/translate/core/language_detection/language_detection_util.h" | 5 #include "components/translate/core/language_detection/language_detection_util.h" |
| 6 | 6 |
| 7 #include <stddef.h> | 7 #include <stddef.h> |
| 8 | 8 |
| 9 #include "base/logging.h" | 9 #include "base/logging.h" |
| 10 #include "base/macros.h" | 10 #include "base/macros.h" |
| 11 #include "base/metrics/histogram_macros.h" |
| 11 #include "base/strings/string_split.h" | 12 #include "base/strings/string_split.h" |
| 12 #include "base/strings/string_util.h" | 13 #include "base/strings/string_util.h" |
| 13 #include "base/strings/utf_string_conversions.h" | 14 #include "base/strings/utf_string_conversions.h" |
| 14 #include "base/time/time.h" | 15 #include "base/time/time.h" |
| 15 #include "components/translate/core/common/translate_constants.h" | 16 #include "components/translate/core/common/translate_constants.h" |
| 16 #include "components/translate/core/common/translate_metrics.h" | 17 #include "components/translate/core/common/translate_metrics.h" |
| 17 #include "components/translate/core/common/translate_util.h" | 18 #include "components/translate/core/common/translate_util.h" |
| 18 | 19 |
| 19 #if CLD_VERSION==1 | 20 #if CLD_VERSION==1 |
| 20 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 21 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
| (...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 129 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, | 130 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, |
| 130 language3, percent3, normalized_score3, | 131 language3, percent3, normalized_score3, |
| 131 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable); | 132 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable); |
| 132 } | 133 } |
| 133 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | 134 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
| 134 cld_language != CLD2::UNKNOWN_LANGUAGE && | 135 cld_language != CLD2::UNKNOWN_LANGUAGE && |
| 135 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | 136 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
| 136 | 137 |
| 137 // Choose top language. | 138 // Choose top language. |
| 138 cld_language = language3[0]; | 139 cld_language = language3[0]; |
| 140 UMA_HISTOGRAM_ENUMERATION("Translate.CLD2.LanguageDetected", |
| 141 cld_language, CLD2::NUM_LANGUAGES); |
| 142 if (is_valid_language) |
| 143 UMA_HISTOGRAM_PERCENTAGE("Translate.CLD2.LanguageAccuracy", percent3[0]); |
| 144 |
| 145 |
| 139 #else | 146 #else |
| 140 # error "CLD_VERSION must be 1 or 2" | 147 # error "CLD_VERSION must be 1 or 2" |
| 141 #endif | 148 #endif |
| 142 | 149 |
| 143 if (is_cld_reliable != NULL) | 150 if (is_cld_reliable != NULL) |
| 144 *is_cld_reliable = is_reliable; | 151 *is_cld_reliable = is_reliable; |
| 145 | 152 |
| 146 // We don't trust the result if the CLD reports that the detection is not | 153 // We don't trust the result if the CLD reports that the detection is not |
| 147 // reliable, or if the actual text used to detect the language was less than | 154 // reliable, or if the actual text used to detect the language was less than |
| 148 // 100 bytes (short texts can often lead to wrong results). | 155 // 100 bytes (short texts can often lead to wrong results). |
| 149 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | 156 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that |
| 150 // the determined language code is correct with 50% confidence. Chrome should | 157 // the determined language code is correct with 50% confidence. Chrome should |
| 151 // handle the real confidence value to judge. | 158 // handle the real confidence value to judge. |
| 152 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { | 159 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { |
| 153 // We should not use LanguageCode_ISO_639_1 because it does not cover all | 160 // We should not use LanguageCode_ISO_639_1 because it does not cover all |
| 154 // the languages CLD can detect. As a result, it'll return the invalid | 161 // the languages CLD can detect. As a result, it'll return the invalid |
| 155 // language code for tradtional Chinese among others. | 162 // language code for traditional Chinese among others. |
| 156 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and | 163 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and |
| 157 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 164 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
| 158 // for Simplified Chinese. | 165 // for Simplified Chinese. |
| 159 #if CLD_VERSION==1 | 166 #if CLD_VERSION==1 |
| 160 language = LanguageCodeWithDialects(static_cast<Language>(cld_language)); | 167 language = LanguageCodeWithDialects(static_cast<Language>(cld_language)); |
| 161 #elif CLD_VERSION==2 | 168 #elif CLD_VERSION==2 |
| 162 // (1) CLD2's LanguageCode returns general Chinese 'zh' for | 169 // (1) CLD2's LanguageCode returns general Chinese 'zh' for |
| 163 // CLD2::CHINESE, but Translate server doesn't accept it. This is | 170 // CLD2::CHINESE, but Translate server doesn't accept it. This is |
| 164 // converted to 'zh-CN' in the same way as CLD1's | 171 // converted to 'zh-CN' in the same way as CLD1's |
| 165 // LanguageCodeWithDialects. | 172 // LanguageCodeWithDialects. |
| (...skipping 219 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 385 // distinguish from English, and the language is one of well-known languages | 392 // distinguish from English, and the language is one of well-known languages |
| 386 // which often provide "en-*" meta information mistakenly. | 393 // which often provide "en-*" meta information mistakenly. |
| 387 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 394 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 388 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 395 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 389 return true; | 396 return true; |
| 390 } | 397 } |
| 391 return false; | 398 return false; |
| 392 } | 399 } |
| 393 | 400 |
| 394 } // namespace translate | 401 } // namespace translate |
| OLD | NEW |