Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/translate/core/language_detection/language_detection_util.h" | 5 #include "components/translate/core/language_detection/language_detection_util.h" |
| 6 | 6 |
| 7 #include "base/logging.h" | 7 #include "base/logging.h" |
| 8 #include "base/metrics/field_trial.h" | 8 #include "base/metrics/field_trial.h" |
| 9 #include "base/strings/string_split.h" | 9 #include "base/strings/string_split.h" |
| 10 #include "base/strings/string_util.h" | 10 #include "base/strings/string_util.h" |
| 11 #include "base/strings/utf_string_conversions.h" | 11 #include "base/strings/utf_string_conversions.h" |
| 12 #include "base/time/time.h" | 12 #include "base/time/time.h" |
| 13 #include "components/translate/core/common/translate_constants.h" | 13 #include "components/translate/core/common/translate_constants.h" |
| 14 #include "components/translate/core/common/translate_metrics.h" | 14 #include "components/translate/core/common/translate_metrics.h" |
| 15 #include "components/translate/core/common/translate_util.h" | 15 #include "components/translate/core/common/translate_util.h" |
| 16 | 16 |
| 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 17 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
| 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
| 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
| 20 #endif | 20 #endif |
| 21 | 21 |
| 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 22 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
| 23 #include "third_party/cld_2/src/public/compact_lang_det.h" | 23 #include "third_party/cld_2/src/public/compact_lang_det.h" |
| 24 #include "third_party/cld_2/src/public/encodings.h" | |
| 24 #endif | 25 #endif |
| 25 | 26 |
| 26 namespace { | 27 namespace { |
| 27 | 28 |
| 28 // Similar language code list. Some languages are very similar and difficult | 29 // Similar language code list. Some languages are very similar and difficult |
| 29 // for CLD to distinguish. | 30 // for CLD to distinguish. |
| 30 struct SimilarLanguageCode { | 31 struct SimilarLanguageCode { |
| 31 const char* const code; | 32 const char* const code; |
| 32 int group; | 33 int group; |
| 33 }; | 34 }; |
| (...skipping 45 matching lines...) | (...skipping 45 matching lines...) |
| 79 return 1; | 80 return 1; |
| 80 #else | 81 #else |
| 81 return CLD_VERSION; | 82 return CLD_VERSION; |
| 82 #endif | 83 #endif |
| 83 } | 84 } |
| 84 | 85 |
| 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 86 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
| 86 // failed. | 87 // failed. |
| 87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | 88 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
| 88 std::string DetermineTextLanguage(const base::string16& text, | 89 std::string DetermineTextLanguage(const base::string16& text, |
| 89 bool* is_cld_reliable) { | 90 bool* is_cld_reliable, |
| 91 std::string& code, | |
| 92 std::string& html_lang) { | |
| 90 std::string language = translate::kUnknownLanguageCode; | 93 std::string language = translate::kUnknownLanguageCode; |
| 91 int num_bytes_evaluated = 0; | 94 int num_bytes_evaluated = 0; |
| 92 bool is_reliable = false; | 95 bool is_reliable = false; |
| 93 const bool is_plain_text = true; | 96 const bool is_plain_text = true; |
| 94 | 97 |
| 95 // Language or CLD2::Language | 98 // Language or CLD2::Language |
| 96 int cld_language = 0; | 99 int cld_language = 0; |
| 97 bool is_valid_language = false; | 100 bool is_valid_language = false; |
| 98 | 101 |
| 99 switch (GetCLDMajorVersion()) { | 102 switch (GetCLDMajorVersion()) { |
| 100 #if !defined(CLD_VERSION) || CLD_VERSION==1 | 103 #if !defined(CLD_VERSION) || CLD_VERSION==1 |
| 101 case 1: { | 104 case 1: { |
| 102 int num_languages = 0; | 105 int num_languages = 0; |
| 103 cld_language = DetectLanguageOfUnicodeText( | 106 cld_language = DetectLanguageOfUnicodeText( |
| 104 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, | 107 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, |
| 105 &num_bytes_evaluated); | 108 &num_bytes_evaluated); |
| 106 is_valid_language = cld_language != NUM_LANGUAGES && | 109 is_valid_language = cld_language != NUM_LANGUAGES && |
| 107 cld_language != UNKNOWN_LANGUAGE && | 110 cld_language != UNKNOWN_LANGUAGE && |
| 108 cld_language != TG_UNKNOWN_LANGUAGE; | 111 cld_language != TG_UNKNOWN_LANGUAGE; |
| 109 break; | 112 break; |
| 110 } | 113 } |
| 111 #endif | 114 #endif |
| 112 #if !defined(CLD_VERSION) || CLD_VERSION==2 | 115 #if !defined(CLD_VERSION) || CLD_VERSION==2 |
| 113 case 2: { | 116 case 2: { |
| 114 const std::string utf8_text(base::UTF16ToUTF8(text)); | 117 const std::string utf8_text(base::UTF16ToUTF8(text)); |
| 115 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); | 118 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); |
| 116 const char* raw_utf8_bytes = utf8_text.c_str(); | 119 const char* raw_utf8_bytes = utf8_text.c_str(); |
| 117 cld_language = CLD2::DetectLanguageCheckUTF8( | 120 |
| 118 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, | 121 CLD2::Language language3[3]; |
| 119 &num_bytes_evaluated); | 122 int percent3[3]; |
| 123 int flags = 0; // No flags, see compact_lang_det.h for details. | |
| 124 int text_bytes; // Amount of non-tag/letters-only text (assumed 0). | |
| 125 double normalized_score3[3]; | |
| 126 | |
| 127 const char* tld_hint = ""; | |
| 128 int encoding_hint = CLD2::UNKNOWN_ENCODING; | |
| 129 CLD2::Language language_hint = | |
| 130 CLD2::GetLanguageFromName(html_lang.c_str()); | |
| 131 CLD2::CLDHints cldhints = {code.c_str(), tld_hint, encoding_hint, | |
| 132 language_hint}; | |
| 133 | |
| 134 cld_language = CLD2::ExtDetectLanguageSummaryCheckUTF8( | |
| 135 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, | |
| 136 language3, percent3, normalized_score3, nullptr, &text_bytes, | |
|
Andrew Hayden (chromium.org)
2015/08/05 11:22:45
Please comment the meaning of the nullptr here, we
| |
| 137 &is_reliable, &num_bytes_evaluated); | |
| 120 | 138 |
| 121 if (num_bytes_evaluated < num_utf8_bytes && | 139 if (num_bytes_evaluated < num_utf8_bytes && |
| 122 cld_language == CLD2::UNKNOWN_LANGUAGE) { | 140 cld_language == CLD2::UNKNOWN_LANGUAGE) { |
| 123 // Invalid UTF8 encountered, see bug http://crbug.com/444258. | 141 // Invalid UTF8 encountered, see bug http://crbug.com/444258. |
| 124 // Retry using only the valid characters. This time the check for valid | 142 // Retry using only the valid characters. This time the check for valid |
| 125 // UTF8 can be skipped since the precise number of valid bytes is known. | 143 // UTF8 can be skipped since the precise number of valid bytes is known. |
| 126 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, | 144 cld_language = CLD2::ExtDetectLanguageSummary( |
| 127 is_plain_text, &is_reliable); | 145 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, |
| 146 language3, percent3, normalized_score3, nullptr, &text_bytes, | |
|
Andrew Hayden (chromium.org)
2015/08/05 11:22:45
And same here, please
| |
| 147 &is_reliable); | |
| 128 } | 148 } |
| 129 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | 149 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
| 130 cld_language != CLD2::UNKNOWN_LANGUAGE && | 150 cld_language != CLD2::UNKNOWN_LANGUAGE && |
| 131 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | 151 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
| 152 | |
| 153 // Choose top language. | |
| 154 cld_language = language3[0]; | |
| 132 break; | 155 break; |
| 133 } | 156 } |
| 134 #endif | 157 #endif |
| 135 default: | 158 default: |
| 136 NOTREACHED(); | 159 NOTREACHED(); |
| 137 } | 160 } |
| 138 | 161 |
| 139 if (is_cld_reliable != NULL) | 162 if (is_cld_reliable != NULL) |
| 140 *is_cld_reliable = is_reliable; | 163 *is_cld_reliable = is_reliable; |
| 141 | 164 |
| (...skipping 64 matching lines...) | (...skipping 64 matching lines...) |
| 206 | 229 |
| 207 namespace translate { | 230 namespace translate { |
| 208 | 231 |
| 209 std::string DeterminePageLanguage(const std::string& code, | 232 std::string DeterminePageLanguage(const std::string& code, |
| 210 const std::string& html_lang, | 233 const std::string& html_lang, |
| 211 const base::string16& contents, | 234 const base::string16& contents, |
| 212 std::string* cld_language_p, | 235 std::string* cld_language_p, |
| 213 bool* is_cld_reliable_p) { | 236 bool* is_cld_reliable_p) { |
| 214 base::TimeTicks begin_time = base::TimeTicks::Now(); | 237 base::TimeTicks begin_time = base::TimeTicks::Now(); |
| 215 bool is_cld_reliable; | 238 bool is_cld_reliable; |
| 216 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); | |
| 217 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); | |
| 218 | |
| 219 if (cld_language_p != NULL) | |
| 220 *cld_language_p = cld_language; | |
| 221 if (is_cld_reliable_p != NULL) | |
| 222 *is_cld_reliable_p = is_cld_reliable; | |
| 223 translate::ToTranslateLanguageSynonym(&cld_language); | |
| 224 | |
| 225 // Check if html lang attribute is valid. | 239 // Check if html lang attribute is valid. |
| 226 std::string modified_html_lang; | 240 std::string modified_html_lang; |
| 227 if (!html_lang.empty()) { | 241 if (!html_lang.empty()) { |
| 228 modified_html_lang = html_lang; | 242 modified_html_lang = html_lang; |
| 229 ApplyLanguageCodeCorrection(&modified_html_lang); | 243 ApplyLanguageCodeCorrection(&modified_html_lang); |
| 230 translate::ReportHtmlLang(html_lang, modified_html_lang); | 244 translate::ReportHtmlLang(html_lang, modified_html_lang); |
| 231 VLOG(9) << "html lang based language code: " << modified_html_lang; | 245 VLOG(9) << "html lang based language code: " << modified_html_lang; |
| 232 } | 246 } |
| 233 | 247 |
| 234 // Check if Content-Language is valid. | 248 // Check if Content-Language is valid. |
| 235 std::string modified_code; | 249 std::string modified_code; |
| 236 if (!code.empty()) { | 250 if (!code.empty()) { |
| 237 modified_code = code; | 251 modified_code = code; |
| 238 ApplyLanguageCodeCorrection(&modified_code); | 252 ApplyLanguageCodeCorrection(&modified_code); |
| 239 translate::ReportContentLanguage(code, modified_code); | 253 translate::ReportContentLanguage(code, modified_code); |
| 240 } | 254 } |
| 241 | 255 |
| 256 std::string cld_language = DetermineTextLanguage( | |
| 257 contents, &is_cld_reliable, modified_code, modified_html_lang); | |
| 258 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); | |
| 259 | |
| 260 if (cld_language_p != NULL) | |
| 261 *cld_language_p = cld_language; | |
| 262 if (is_cld_reliable_p != NULL) | |
| 263 *is_cld_reliable_p = is_cld_reliable; | |
| 264 translate::ToTranslateLanguageSynonym(&cld_language); | |
| 265 | |
| 242 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt | 266 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt |
| 243 // |modified_code|. | 267 // |modified_code|. |
| 244 std::string language = modified_html_lang.empty() ? modified_code : | 268 std::string language = modified_html_lang.empty() ? modified_code : |
| 245 modified_html_lang; | 269 modified_html_lang; |
| 246 | 270 |
| 247 // If |language| is empty, just use CLD result even though it might be | 271 // If |language| is empty, just use CLD result even though it might be |
| 248 // translate::kUnknownLanguageCode. | 272 // translate::kUnknownLanguageCode. |
| 249 if (language.empty()) { | 273 if (language.empty()) { |
| 250 translate::ReportLanguageVerification( | 274 translate::ReportLanguageVerification( |
| 251 translate::LANGUAGE_VERIFICATION_CLD_ONLY); | 275 translate::LANGUAGE_VERIFICATION_CLD_ONLY); |
| (...skipping 138 matching lines...) | (...skipping 138 matching lines...) |
| 390 // distinguish from English, and the language is one of well-known languages | 414 // distinguish from English, and the language is one of well-known languages |
| 391 // which often provide "en-*" meta information mistakenly. | 415 // which often provide "en-*" meta information mistakenly. |
| 392 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 416 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 393 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 417 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 394 return true; | 418 return true; |
| 395 } | 419 } |
| 396 return false; | 420 return false; |
| 397 } | 421 } |
| 398 | 422 |
| 399 } // namespace translate | 423 } // namespace translate |
| OLD | NEW |