Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/translate/core/language_detection/language_detection_util.h" | 5 #include "components/translate/core/language_detection/language_detection_util.h" |
| 6 | 6 |
| 7 #include "base/logging.h" | 7 #include "base/logging.h" |
| 8 #include "base/metrics/field_trial.h" | |
| 9 #include "base/strings/string_split.h" | 8 #include "base/strings/string_split.h" |
| 10 #include "base/strings/string_util.h" | 9 #include "base/strings/string_util.h" |
| 11 #include "base/strings/utf_string_conversions.h" | 10 #include "base/strings/utf_string_conversions.h" |
| 12 #include "base/time/time.h" | 11 #include "base/time/time.h" |
| 13 #include "components/translate/core/common/translate_constants.h" | 12 #include "components/translate/core/common/translate_constants.h" |
| 14 #include "components/translate/core/common/translate_metrics.h" | 13 #include "components/translate/core/common/translate_metrics.h" |
| 15 #include "components/translate/core/common/translate_util.h" | 14 #include "components/translate/core/common/translate_util.h" |
| 16 | 15 |
| 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 16 #if CLD_VERSION==1 |
| 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 17 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
| 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 18 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
| 20 #endif | 19 #endif |
| 21 | 20 |
| 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 21 #if CLD_VERSION==2 |
| 23 #include "third_party/cld_2/src/public/compact_lang_det.h" | 22 #include "third_party/cld_2/src/public/compact_lang_det.h" |
| 24 #endif | 23 #endif |
| 25 | 24 |
| 26 namespace { | 25 namespace { |
| 27 | 26 |
| 28 // Similar language code list. Some languages are very similar and difficult | 27 // Similar language code list. Some languages are very similar and difficult |
| 29 // for CLD to distinguish. | 28 // for CLD to distinguish. |
| 30 struct SimilarLanguageCode { | 29 struct SimilarLanguageCode { |
| 31 const char* const code; | 30 const char* const code; |
| 32 int group; | 31 int group; |
| (...skipping 30 matching lines...) Expand all Loading... | |
| 63 translate::CorrectLanguageCodeTypo(code); | 62 translate::CorrectLanguageCodeTypo(code); |
| 64 | 63 |
| 65 if (!translate::IsValidLanguageCode(*code)) { | 64 if (!translate::IsValidLanguageCode(*code)) { |
| 66 *code = std::string(); | 65 *code = std::string(); |
| 67 return; | 66 return; |
| 68 } | 67 } |
| 69 | 68 |
| 70 translate::ToTranslateLanguageSynonym(code); | 69 translate::ToTranslateLanguageSynonym(code); |
| 71 } | 70 } |
| 72 | 71 |
| 73 int GetCLDMajorVersion() { | |
| 74 #if !defined(CLD_VERSION) | |
| 75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); | |
| 76 if (group_name == "CLD2") | |
| 77 return 2; | |
| 78 else | |
| 79 return 1; | |
| 80 #else | |
| 81 return CLD_VERSION; | |
| 82 #endif | |
| 83 } | |
| 84 | |
| 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 72 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
| 86 // failed. | 73 // failed. |
| 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | 74 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
| 88 std::string DetermineTextLanguage(const base::string16& text, | 75 std::string DetermineTextLanguage(const base::string16& text, |
| 89 bool* is_cld_reliable) { | 76 bool* is_cld_reliable) { |
| 90 std::string language = translate::kUnknownLanguageCode; | 77 std::string language = translate::kUnknownLanguageCode; |
| 91 int num_bytes_evaluated = 0; | 78 int num_bytes_evaluated = 0; |
| 92 bool is_reliable = false; | 79 bool is_reliable = false; |
| 93 const bool is_plain_text = true; | 80 const bool is_plain_text = true; |
| 94 | 81 |
| 95 // Language or CLD2::Language | 82 // Language or CLD2::Language |
| 96 int cld_language = 0; | 83 int cld_language = 0; |
| 97 bool is_valid_language = false; | 84 bool is_valid_language = false; |
| 98 | 85 |
| 99 switch (GetCLDMajorVersion()) { | 86 #if CLD_VERSION==1 |
| 100 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 87 int num_languages = 0; |
| 101 case 1: { | 88 cld_language = DetectLanguageOfUnicodeText( |
| 102 int num_languages = 0; | 89 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, |
| 103 cld_language = DetectLanguageOfUnicodeText( | 90 &num_bytes_evaluated); |
| 104 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, | 91 is_valid_language = cld_language != NUM_LANGUAGES && |
| 105 &num_bytes_evaluated); | 92 cld_language != UNKNOWN_LANGUAGE && |
|
Takashi Toyoshima
2015/07/30 08:41:05
wrong indent
hajimehoshi
2015/07/30 08:51:29
Done.
| |
| 106 is_valid_language = cld_language != NUM_LANGUAGES && | 93 cld_language != TG_UNKNOWN_LANGUAGE; |
| 107 cld_language != UNKNOWN_LANGUAGE && | |
| 108 cld_language != TG_UNKNOWN_LANGUAGE; | |
| 109 break; | |
| 110 } | |
| 111 #endif | 94 #endif |
| 112 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 95 #if CLD_VERSION==2 |
| 113 case 2: { | 96 const std::string utf8_text(base::UTF16ToUTF8(text)); |
| 114 const std::string utf8_text(base::UTF16ToUTF8(text)); | 97 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); |
| 115 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); | 98 const char* raw_utf8_bytes = utf8_text.c_str(); |
| 116 const char* raw_utf8_bytes = utf8_text.c_str(); | 99 cld_language = CLD2::DetectLanguageCheckUTF8( |
| 117 cld_language = CLD2::DetectLanguageCheckUTF8( | 100 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, |
| 118 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, | 101 &num_bytes_evaluated); |
| 119 &num_bytes_evaluated); | |
| 120 | 102 |
| 121 if (num_bytes_evaluated < num_utf8_bytes && | 103 if (num_bytes_evaluated < num_utf8_bytes && |
| 122 cld_language == CLD2::UNKNOWN_LANGUAGE) { | 104 cld_language == CLD2::UNKNOWN_LANGUAGE) { |
| 123 // Invalid UTF8 encountered, see bug http://crbug.com/444258. | 105 // Invalid UTF8 encountered, see bug http://crbug.com/444258. |
| 124 // Retry using only the valid characters. This time the check for valid | 106 // Retry using only the valid characters. This time the check for valid |
| 125 // UTF8 can be skipped since the precise number of valid bytes is known. | 107 // UTF8 can be skipped since the precise number of valid bytes is known. |
| 126 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, | 108 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, |
| 127 is_plain_text, &is_reliable); | 109 is_plain_text, &is_reliable); |
| 128 } | 110 } |
| 129 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | 111 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
| 130 cld_language != CLD2::UNKNOWN_LANGUAGE && | 112 cld_language != CLD2::UNKNOWN_LANGUAGE && |
|
Takashi Toyoshima
2015/07/30 08:41:05
wrong indent
hajimehoshi
2015/07/30 08:51:29
Done.
| |
| 131 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | 113 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
| 132 break; | |
| 133 } | |
| 134 #endif | 114 #endif |
|
Takashi Toyoshima
2015/07/30 08:41:05
up to you, but could be something like this?
#if
hajimehoshi
2015/07/30 08:51:29
Done.
| |
| 135 default: | |
| 136 NOTREACHED(); | |
| 137 } | |
| 138 | 115 |
| 139 if (is_cld_reliable != NULL) | 116 if (is_cld_reliable != NULL) |
| 140 *is_cld_reliable = is_reliable; | 117 *is_cld_reliable = is_reliable; |
| 141 | 118 |
| 142 // We don't trust the result if the CLD reports that the detection is not | 119 // We don't trust the result if the CLD reports that the detection is not |
| 143 // reliable, or if the actual text used to detect the language was less than | 120 // reliable, or if the actual text used to detect the language was less than |
| 144 // 100 bytes (short texts can often lead to wrong results). | 121 // 100 bytes (short texts can often lead to wrong results). |
| 145 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that | 122 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that |
| 146 // the determined language code is correct with 50% confidence. Chrome should | 123 // the determined language code is correct with 50% confidence. Chrome should |
| 147 // handle the real confidence value to judge. | 124 // handle the real confidence value to judge. |
| 148 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { | 125 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { |
| 149 // We should not use LanguageCode_ISO_639_1 because it does not cover all | 126 // We should not use LanguageCode_ISO_639_1 because it does not cover all |
| 150 // the languages CLD can detect. As a result, it'll return the invalid | 127 // the languages CLD can detect. As a result, it'll return the invalid |
| 151 // language code for traditional Chinese among others. | 128 // language code for traditional Chinese among others. |
| 152 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO-639-2 and | 129 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO-639-2 and |
| 153 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN | 130 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN |
| 154 // for Simplified Chinese. | 131 // for Simplified Chinese. |
| 155 switch (GetCLDMajorVersion()) { | 132 #if CLD_VERSION==1 |
| 156 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 133 language = LanguageCodeWithDialects(static_cast<Language>(cld_language)); |
| 157 case 1: | |
| 158 language = | |
| 159 LanguageCodeWithDialects(static_cast<Language>(cld_language)); | |
| 160 break; | |
| 161 #endif | 134 #endif |
| 162 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 135 #if CLD_VERSION==2 |
| 163 case 2: | 136 // (1) CLD2's LanguageCode returns general Chinese 'zh' for |
| 164 // (1) CLD2's LanguageCode returns general Chinese 'zh' for | 137 // CLD2::CHINESE, but Translate server doesn't accept it. This is |
| 165 // CLD2::CHINESE, but Translate server doesn't accept it. This is | 138 // converted to 'zh-CN' in the same way as CLD1's |
| 166 // converted to 'zh-CN' in the same way as CLD1's | 139 // LanguageCodeWithDialects. |
| 167 // LanguageCodeWithDialects. | 140 // |
| 168 // | 141 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for |
| 169 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for | 142 // CLD2::CHINESE_T. This is technically more precise for the language |
| 170 // CLD2::CHINESE_T. This is technically more precise for the language | 143 // code of traditional Chinese, while Translate server hasn't accepted |
| 171 // code of traditional Chinese, while Translate server hasn't accepted | 144 // zh-Hant yet. |
| 172 // zh-Hant yet. | 145 if (cld_language == CLD2::CHINESE) |
| 173 if (cld_language == CLD2::CHINESE) { | 146 language = "zh-CN"; |
| 174 language = "zh-CN"; | 147 else if (cld_language == CLD2::CHINESE_T) |
| 175 } else if (cld_language == CLD2::CHINESE_T) { | 148 language = "zh-TW"; |
| 176 language = "zh-TW"; | 149 else |
| 177 } else { | 150 language = CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); |
| 178 language = | |
| 179 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); | |
| 180 } | |
| 181 break; | |
| 182 #endif | 151 #endif |
|
Takashi Toyoshima
2015/07/30 08:41:05
ditto
hajimehoshi
2015/07/30 08:51:29
Done.
| |
| 183 default: | |
| 184 NOTREACHED(); | |
| 185 } | |
| 186 } | 152 } |
| 187 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text | 153 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text |
| 188 << "\n*************************************\n"; | 154 << "\n*************************************\n"; |
| 189 return language; | 155 return language; |
| 190 } | 156 } |
| 191 | 157 |
| 192 // Checks if CLD can complement a sub code when the page language doesn't know | 158 // Checks if CLD can complement a sub code when the page language doesn't know |
| 193 // the sub code. | 159 // the sub code. |
| 194 bool CanCLDComplementSubCode( | 160 bool CanCLDComplementSubCode( |
| 195 const std::string& page_language, const std::string& cld_language) { | 161 const std::string& page_language, const std::string& cld_language) { |
| (...skipping 194 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 390 // distinguish from English, and the language is one of well-known languages | 356 // distinguish from English, and the language is one of well-known languages |
| 391 // which often provide "en-*" meta information mistakenly. | 357 // which often provide "en-*" meta information mistakenly. |
| 392 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 358 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 393 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 359 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 394 return true; | 360 return true; |
| 395 } | 361 } |
| 396 return false; | 362 return false; |
| 397 } | 363 } |
| 398 | 364 |
| 399 } // namespace translate | 365 } // namespace translate |
| OLD | NEW |