| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/translate/core/language_detection/language_detection_util.h" | 5 #include "components/translate/core/language_detection/language_detection_util.h" |
| 6 | 6 |
| 7 #include "base/logging.h" | 7 #include "base/logging.h" |
| 8 #include "base/metrics/field_trial.h" | 8 #include "base/metrics/field_trial.h" |
| 9 #include "base/strings/string_split.h" | 9 #include "base/strings/string_split.h" |
| 10 #include "base/strings/string_util.h" | 10 #include "base/strings/string_util.h" |
| (...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 81 return CLD_VERSION; | 81 return CLD_VERSION; |
| 82 #endif | 82 #endif |
| 83 } | 83 } |
| 84 | 84 |
| 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
| 86 // failed. | 86 // failed. |
| 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
| 88 std::string DetermineTextLanguage(const base::string16& text, | 88 std::string DetermineTextLanguage(const base::string16& text, |
| 89 bool* is_cld_reliable) { | 89 bool* is_cld_reliable) { |
| 90 std::string language = translate::kUnknownLanguageCode; | 90 std::string language = translate::kUnknownLanguageCode; |
| 91 int text_bytes = 0; | 91 int num_bytes_evaluated = 0; |
| 92 bool is_reliable = false; | 92 bool is_reliable = false; |
| 93 const bool is_plain_text = true; |
| 93 | 94 |
| 94 // Language or CLD2::Language | 95 // Language or CLD2::Language |
| 95 int cld_language = 0; | 96 int cld_language = 0; |
| 96 bool is_valid_language = false; | 97 bool is_valid_language = false; |
| 97 | 98 |
| 98 switch (GetCLDMajorVersion()) { | 99 switch (GetCLDMajorVersion()) { |
| 99 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 100 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
| 100 case 1: { | 101 case 1: { |
| 101 int num_languages = 0; | 102 int num_languages = 0; |
| 102 cld_language = | 103 cld_language = DetectLanguageOfUnicodeText( |
| 103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, | 104 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, |
| 104 &num_languages, NULL, &text_bytes); | 105 &num_bytes_evaluated); |
| 105 is_valid_language = cld_language != NUM_LANGUAGES && | 106 is_valid_language = cld_language != NUM_LANGUAGES && |
| 106 cld_language != UNKNOWN_LANGUAGE && | 107 cld_language != UNKNOWN_LANGUAGE && |
| 107 cld_language != TG_UNKNOWN_LANGUAGE; | 108 cld_language != TG_UNKNOWN_LANGUAGE; |
| 108 break; | 109 break; |
| 109 } | 110 } |
| 110 #endif | 111 #endif |
| 111 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 112 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
| 112 case 2: { | 113 case 2: { |
| 113 std::string utf8_text(base::UTF16ToUTF8(text)); | 114 const std::string utf8_text(base::UTF16ToUTF8(text)); |
| 114 CLD2::Language language3[3]; | 115 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); |
| 115 int percent3[3]; | 116 const char* raw_utf8_bytes = utf8_text.c_str(); |
| 116 CLD2::DetectLanguageSummary( | 117 cld_language = CLD2::DetectLanguageCheckUTF8( |
| 117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3, | 118 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, |
| 118 &text_bytes, &is_reliable); | 119 &num_bytes_evaluated); |
| 119 cld_language = language3[0]; | 120 |
| 121 if (num_bytes_evaluated < num_utf8_bytes && |
| 122 cld_language == CLD2::UNKNOWN_LANGUAGE) { |
| 123 // Invalid UTF8 encountered, see bug http://crbug.com/444258. |
| 124 // Retry using only the valid characters. This time the check for valid |
| 125 // UTF8 can be skipped since the precise number of valid bytes is known. |
| 126 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, |
| 127 is_plain_text, &is_reliable); |
| 128 } |
| 120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | 129 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
| 121 cld_language != CLD2::UNKNOWN_LANGUAGE && | 130 cld_language != CLD2::UNKNOWN_LANGUAGE && |
| 122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | 131 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
| 123 break; | 132 break; |
| 124 } | 133 } |
| 125 #endif | 134 #endif |
| 126 default: | 135 default: |
| 127 NOTREACHED(); | 136 NOTREACHED(); |
| 128 } | 137 } |
| 129 | 138 |
| 130 if (is_cld_reliable != NULL) | 139 if (is_cld_reliable != NULL) |
| 131 *is_cld_reliable = is_reliable; | 140 *is_cld_reliable = is_reliable; |
| 132 | 141 |
| 133 // We don't trust the result if the CLD reports that the detection is not | 142 // We don't trust the result if the CLD reports that the detection is not |
| 134 // reliable, or if the actual text used to detect the language was less than | 143 // reliable, or if the actual text used to detect the language was less than |
| 135 // 100 bytes (short texts can often lead to wrong results). | 144 // 100 bytes (short texts can often lead to wrong results). |
| 136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | 145 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that |
| 137 // the determined language code is correct with 50% confidence. Chrome should | 146 // the determined language code is correct with 50% confidence. Chrome should |
| 138 // handle the real confidence value to judge. | 147 // handle the real confidence value to judge. |
| 139 if (is_reliable && text_bytes >= 100 && is_valid_language) { | 148 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { |
| 140 // We should not use LanguageCode_ISO_639_1 because it does not cover all | 149 // We should not use LanguageCode_ISO_639_1 because it does not cover all |
| 141 // the languages CLD can detect. As a result, it'll return the invalid | 150 // the languages CLD can detect. As a result, it'll return the invalid |
| 142 // language code for traditional Chinese among others. | 151 // language code for traditional Chinese among others. |
| 143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and | 152 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and |
| 144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 153 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
| 145 // for Simplified Chinese. | 154 // for Simplified Chinese. |
| 146 switch (GetCLDMajorVersion()) { | 155 switch (GetCLDMajorVersion()) { |
| 147 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 156 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
| 148 case 1: | 157 case 1: |
| 149 language = | 158 language = |
| (...skipping 230 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 380 // distinguish from English, and the language is one of well-known languages | 389 // distinguish from English, and the language is one of well-known languages |
| 381 // which often provide "en-*" meta information mistakenly. | 390 // which often provide "en-*" meta information mistakenly. |
| 382 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 391 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 383 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 392 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 384 return true; | 393 return true; |
| 385 } | 394 } |
| 386 return false; | 395 return false; |
| 387 } | 396 } |
| 388 | 397 |
| 389 } // namespace translate | 398 } // namespace translate |
| OLD | NEW |