| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/translate/core/language_detection/language_detection_util.h" | 5 #include "components/translate/core/language_detection/language_detection_util.h" |
| 6 | 6 |
| 7 #include "base/logging.h" | 7 #include "base/logging.h" |
| 8 #include "base/strings/string_split.h" | 8 #include "base/strings/string_split.h" |
| 9 #include "base/strings/string_util.h" | 9 #include "base/strings/string_util.h" |
| 10 #include "base/strings/utf_string_conversions.h" | 10 #include "base/strings/utf_string_conversions.h" |
| 11 #include "base/time/time.h" | 11 #include "base/time/time.h" |
| 12 #include "components/translate/core/common/translate_constants.h" | 12 #include "components/translate/core/common/translate_constants.h" |
| 13 #include "components/translate/core/common/translate_metrics.h" | 13 #include "components/translate/core/common/translate_metrics.h" |
| 14 #include "components/translate/core/common/translate_util.h" | 14 #include "components/translate/core/common/translate_util.h" |
| 15 | 15 |
| 16 #if CLD_VERSION==1 | 16 #if CLD_VERSION==1 |
| 17 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" | 17 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" |
| 18 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" | 18 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" |
| 19 #endif | 19 #endif |
| 20 | 20 |
| 21 #if CLD_VERSION==2 | 21 #if CLD_VERSION==2 |
| 22 #include "third_party/cld_2/src/public/compact_lang_det.h" | 22 #include "third_party/cld_2/src/public/compact_lang_det.h" |
| 23 #include "third_party/cld_2/src/public/encodings.h" |
| 23 #endif | 24 #endif |
| 24 | 25 |
| 25 namespace { | 26 namespace { |
| 26 | 27 |
| 27 // Similar language code list. Some languages are very similar and difficult | 28 // Similar language code list. Some languages are very similar and difficult |
| 28 // for CLD to distinguish. | 29 // for CLD to distinguish. |
| 29 struct SimilarLanguageCode { | 30 struct SimilarLanguageCode { |
| 30 const char* const code; | 31 const char* const code; |
| 31 int group; | 32 int group; |
| 32 }; | 33 }; |
| (...skipping 33 matching lines...) |
| 66 return; | 67 return; |
| 67 } | 68 } |
| 68 | 69 |
| 69 translate::ToTranslateLanguageSynonym(code); | 70 translate::ToTranslateLanguageSynonym(code); |
| 70 } | 71 } |
| 71 | 72 |
| 72 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it | 73 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it |
| 73 // failed. | 74 // failed. |
| 74 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. | 75 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
| 75 std::string DetermineTextLanguage(const base::string16& text, | 76 std::string DetermineTextLanguage(const base::string16& text, |
| 76 bool* is_cld_reliable) { | 77 bool* is_cld_reliable, |
| 78 std::string& code, |
| 79 std::string& html_lang) { |
| 77 std::string language = translate::kUnknownLanguageCode; | 80 std::string language = translate::kUnknownLanguageCode; |
| 78 int num_bytes_evaluated = 0; | 81 int num_bytes_evaluated = 0; |
| 79 bool is_reliable = false; | 82 bool is_reliable = false; |
| 80 const bool is_plain_text = true; | 83 const bool is_plain_text = true; |
| 81 | 84 |
| 82 // Language or CLD2::Language | 85 // Language or CLD2::Language |
| 83 int cld_language = 0; | 86 int cld_language = 0; |
| 84 bool is_valid_language = false; | 87 bool is_valid_language = false; |
| 85 | 88 |
| 86 #if CLD_VERSION==1 | 89 #if CLD_VERSION==1 |
| 87 int num_languages = 0; | 90 int num_languages = 0; |
| 88 cld_language = DetectLanguageOfUnicodeText( | 91 cld_language = DetectLanguageOfUnicodeText(NULL, text.c_str(), is_plain_text, |
| 89 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL, | 92 &is_reliable, &num_languages, NULL, |
| 90 &num_bytes_evaluated); | 93 &num_bytes_evaluated); |
| 91 is_valid_language = cld_language != NUM_LANGUAGES && | 94 is_valid_language = cld_language != NUM_LANGUAGES && |
| 92 cld_language != UNKNOWN_LANGUAGE && | 95 cld_language != UNKNOWN_LANGUAGE && |
| 93 cld_language != TG_UNKNOWN_LANGUAGE; | 96 cld_language != TG_UNKNOWN_LANGUAGE; |
| 94 #elif CLD_VERSION==2 | 97 #elif CLD_VERSION==2 |
| 95 const std::string utf8_text(base::UTF16ToUTF8(text)); | 98 const std::string utf8_text(base::UTF16ToUTF8(text)); |
| 96 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); | 99 const int num_utf8_bytes = static_cast<int>(utf8_text.size()); |
| 97 const char* raw_utf8_bytes = utf8_text.c_str(); | 100 const char* raw_utf8_bytes = utf8_text.c_str(); |
| 98 cld_language = CLD2::DetectLanguageCheckUTF8( | 101 |
| 99 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, | 102 CLD2::Language language3[3]; |
| 103 int percent3[3]; |
| 104 int flags = 0; // No flags, see compact_lang_det.h for details. |
| 105 int text_bytes; // Amount of non-tag/letters-only text (assumed 0). |
| 106 double normalized_score3[3]; |
| 107 |
| 108 const char* tld_hint = ""; |
| 109 int encoding_hint = CLD2::UNKNOWN_ENCODING; |
| 110 CLD2::Language language_hint = CLD2::GetLanguageFromName(html_lang.c_str()); |
| 111 CLD2::CLDHints cldhints = {code.c_str(), tld_hint, encoding_hint, |
| 112 language_hint}; |
| 113 |
| 114 cld_language = CLD2::ExtDetectLanguageSummaryCheckUTF8( |
| 115 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, |
| 116 language3, percent3, normalized_score3, |
| 117 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable, |
| 100 &num_bytes_evaluated); | 118 &num_bytes_evaluated); |
| 101 | 119 |
| 102 if (num_bytes_evaluated < num_utf8_bytes && | 120 if (num_bytes_evaluated < num_utf8_bytes && |
| 103 cld_language == CLD2::UNKNOWN_LANGUAGE) { | 121 cld_language == CLD2::UNKNOWN_LANGUAGE) { |
| 104 // Invalid UTF8 encountered, see bug http://crbug.com/444258. | 122 // Invalid UTF8 encountered, see bug http://crbug.com/444258. |
| 105 // Retry using only the valid characters. This time the check for valid | 123 // Retry using only the valid characters. This time the check for valid |
| 106 // UTF8 can be skipped since the precise number of valid bytes is known. | 124 // UTF8 can be skipped since the precise number of valid bytes is known. |
| 107 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, | 125 cld_language = CLD2::ExtDetectLanguageSummary( |
| 108 is_plain_text, &is_reliable); | 126 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, |
| 127 language3, percent3, normalized_score3, |
| 128 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable); |
| 109 } | 129 } |
| 110 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && | 130 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
| 111 cld_language != CLD2::UNKNOWN_LANGUAGE && | 131 cld_language != CLD2::UNKNOWN_LANGUAGE && |
| 112 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; | 132 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
| 133 |
| 134 // Choose top language. |
| 135 cld_language = language3[0]; |
| 113 #else | 136 #else |
| 114 # error "CLD_VERSION must be 1 or 2" | 137 # error "CLD_VERSION must be 1 or 2" |
| 115 #endif | 138 #endif |
| 116 | 139 |
| 117 if (is_cld_reliable != NULL) | 140 if (is_cld_reliable != NULL) |
| 118 *is_cld_reliable = is_reliable; | 141 *is_cld_reliable = is_reliable; |
| 119 | 142 |
| 120 // We don't trust the result if the CLD reports that the detection is not | 143 // We don't trust the result if the CLD reports that the detection is not |
| 121 // reliable, or if the actual text used to detect the language was less than | 144 // reliable, or if the actual text used to detect the language was less than |
| 122 // 100 bytes (short texts can often lead to wrong results). | 145 // 100 bytes (short texts can often lead to wrong results). |
| (...skipping 51 matching lines...) |
| 174 | 197 |
| 175 namespace translate { | 198 namespace translate { |
| 176 | 199 |
| 177 std::string DeterminePageLanguage(const std::string& code, | 200 std::string DeterminePageLanguage(const std::string& code, |
| 178 const std::string& html_lang, | 201 const std::string& html_lang, |
| 179 const base::string16& contents, | 202 const base::string16& contents, |
| 180 std::string* cld_language_p, | 203 std::string* cld_language_p, |
| 181 bool* is_cld_reliable_p) { | 204 bool* is_cld_reliable_p) { |
| 182 base::TimeTicks begin_time = base::TimeTicks::Now(); | 205 base::TimeTicks begin_time = base::TimeTicks::Now(); |
| 183 bool is_cld_reliable; | 206 bool is_cld_reliable; |
| 184 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); | |
| 185 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); | |
| 186 | |
| 187 if (cld_language_p != NULL) | |
| 188 *cld_language_p = cld_language; | |
| 189 if (is_cld_reliable_p != NULL) | |
| 190 *is_cld_reliable_p = is_cld_reliable; | |
| 191 translate::ToTranslateLanguageSynonym(&cld_language); | |
| 192 | |
| 193 // Check if html lang attribute is valid. | 207 // Check if html lang attribute is valid. |
| 194 std::string modified_html_lang; | 208 std::string modified_html_lang; |
| 195 if (!html_lang.empty()) { | 209 if (!html_lang.empty()) { |
| 196 modified_html_lang = html_lang; | 210 modified_html_lang = html_lang; |
| 197 ApplyLanguageCodeCorrection(&modified_html_lang); | 211 ApplyLanguageCodeCorrection(&modified_html_lang); |
| 198 translate::ReportHtmlLang(html_lang, modified_html_lang); | 212 translate::ReportHtmlLang(html_lang, modified_html_lang); |
| 199 VLOG(9) << "html lang based language code: " << modified_html_lang; | 213 VLOG(9) << "html lang based language code: " << modified_html_lang; |
| 200 } | 214 } |
| 201 | 215 |
| 202 // Check if Content-Language is valid. | 216 // Check if Content-Language is valid. |
| 203 std::string modified_code; | 217 std::string modified_code; |
| 204 if (!code.empty()) { | 218 if (!code.empty()) { |
| 205 modified_code = code; | 219 modified_code = code; |
| 206 ApplyLanguageCodeCorrection(&modified_code); | 220 ApplyLanguageCodeCorrection(&modified_code); |
| 207 translate::ReportContentLanguage(code, modified_code); | 221 translate::ReportContentLanguage(code, modified_code); |
| 208 } | 222 } |
| 209 | 223 |
| 224 std::string cld_language = DetermineTextLanguage( |
| 225 contents, &is_cld_reliable, modified_code, modified_html_lang); |
| 226 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); |
| 227 |
| 228 if (cld_language_p != NULL) |
| 229 *cld_language_p = cld_language; |
| 230 if (is_cld_reliable_p != NULL) |
| 231 *is_cld_reliable_p = is_cld_reliable; |
| 232 translate::ToTranslateLanguageSynonym(&cld_language); |
| 233 |
| 210 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt | 234 // Adopt |modified_html_lang| if it is valid. Otherwise, adopt |
| 211 // |modified_code|. | 235 // |modified_code|. |
| 212 std::string language = modified_html_lang.empty() ? modified_code : | 236 std::string language = modified_html_lang.empty() ? modified_code : |
| 213 modified_html_lang; | 237 modified_html_lang; |
| 214 | 238 |
| 215 // If |language| is empty, just use CLD result even though it might be | 239 // If |language| is empty, just use CLD result even though it might be |
| 216 // translate::kUnknownLanguageCode. | 240 // translate::kUnknownLanguageCode. |
| 217 if (language.empty()) { | 241 if (language.empty()) { |
| 218 translate::ReportLanguageVerification( | 242 translate::ReportLanguageVerification( |
| 219 translate::LANGUAGE_VERIFICATION_CLD_ONLY); | 243 translate::LANGUAGE_VERIFICATION_CLD_ONLY); |
| (...skipping 138 matching lines...) |
| 358 // distinguish from English, and the language is one of well-known languages | 382 // distinguish from English, and the language is one of well-known languages |
| 359 // which often provide "en-*" meta information mistakenly. | 383 // which often provide "en-*" meta information mistakenly. |
| 360 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { | 384 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { |
| 361 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) | 385 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) |
| 362 return true; | 386 return true; |
| 363 } | 387 } |
| 364 return false; | 388 return false; |
| 365 } | 389 } |
| 366 | 390 |
| 367 } // namespace translate | 391 } // namespace translate |
| OLD | NEW |