Chromium Code Reviews| Index: components/translate/core/language_detection/language_detection_util.cc |
| diff --git a/components/translate/core/language_detection/language_detection_util.cc b/components/translate/core/language_detection/language_detection_util.cc |
| index 5c751a7ffd59fb1427bb7597a306bfb2ba9a0d3c..1697fbed5ba4bd6a0c7b5dd54cc4ea747e11df5c 100644 |
| --- a/components/translate/core/language_detection/language_detection_util.cc |
| +++ b/components/translate/core/language_detection/language_detection_util.cc |
| @@ -21,6 +21,7 @@ |
| #if !defined(CLD_VERSION) || CLD_VERSION==2 |
| #include "third_party/cld_2/src/public/compact_lang_det.h" |
| +#include "third_party/cld_2/src/public/encodings.h" |
| #endif |
| namespace { |
| @@ -86,7 +87,9 @@ int GetCLDMajorVersion() { |
| // failed. |
| // |is_cld_reliable| will be set as true if CLD says the detection is reliable. |
| std::string DetermineTextLanguage(const base::string16& text, |
| - bool* is_cld_reliable) { |
| + bool* is_cld_reliable, |
| + std::string& code, |
| + std::string& html_lang) { |
| std::string language = translate::kUnknownLanguageCode; |
| int num_bytes_evaluated = 0; |
| bool is_reliable = false; |
| @@ -114,21 +117,41 @@ std::string DetermineTextLanguage(const base::string16& text, |
| const std::string utf8_text(base::UTF16ToUTF8(text)); |
| const int num_utf8_bytes = static_cast<int>(utf8_text.size()); |
| const char* raw_utf8_bytes = utf8_text.c_str(); |
| - cld_language = CLD2::DetectLanguageCheckUTF8( |
| - raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable, |
| - &num_bytes_evaluated); |
| + |
| + CLD2::Language language3[3]; |
| + int percent3[3]; |
| + int flags = 0; // No flags, see compact_lang_det.h for details. |
| + int text_bytes; // Amount of non-tag/letters-only text (assumed 0). |
| + double normalized_score3[3]; |
| + |
| + const char* tld_hint = ""; |
| + int encoding_hint = CLD2::UNKNOWN_ENCODING; |
| + CLD2::Language language_hint = |
| + CLD2::GetLanguageFromName(html_lang.c_str()); |
| + CLD2::CLDHints cldhints = {code.c_str(), tld_hint, encoding_hint, |
| + language_hint}; |
| + |
| + cld_language = CLD2::ExtDetectLanguageSummaryCheckUTF8( |
| + raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, |
| + language3, percent3, normalized_score3, nullptr, &text_bytes, |
|
Andrew Hayden (chromium.org)
2015/08/05 11:22:45
Please comment the meaning of the nullptr here, we…
|
| + &is_reliable, &num_bytes_evaluated); |
| if (num_bytes_evaluated < num_utf8_bytes && |
| cld_language == CLD2::UNKNOWN_LANGUAGE) { |
| // Invalid UTF8 encountered, see bug http://crbug.com/444258. |
| // Retry using only the valid characters. This time the check for valid |
| // UTF8 can be skipped since the precise number of valid bytes is known. |
| - cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated, |
| - is_plain_text, &is_reliable); |
| + cld_language = CLD2::ExtDetectLanguageSummary( |
| + raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, |
| + language3, percent3, normalized_score3, nullptr, &text_bytes, |
|
Andrew Hayden (chromium.org)
2015/08/05 11:22:45
And the same here, please.
|
| + &is_reliable); |
| } |
| is_valid_language = cld_language != CLD2::NUM_LANGUAGES && |
| cld_language != CLD2::UNKNOWN_LANGUAGE && |
| cld_language != CLD2::TG_UNKNOWN_LANGUAGE; |
| + |
| + // Choose top language. |
| + cld_language = language3[0]; |
| break; |
| } |
| #endif |
| @@ -213,15 +236,6 @@ std::string DeterminePageLanguage(const std::string& code, |
| bool* is_cld_reliable_p) { |
| base::TimeTicks begin_time = base::TimeTicks::Now(); |
| bool is_cld_reliable; |
| - std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); |
| - translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); |
| - |
| - if (cld_language_p != NULL) |
| - *cld_language_p = cld_language; |
| - if (is_cld_reliable_p != NULL) |
| - *is_cld_reliable_p = is_cld_reliable; |
| - translate::ToTranslateLanguageSynonym(&cld_language); |
| - |
| // Check if html lang attribute is valid. |
| std::string modified_html_lang; |
| if (!html_lang.empty()) { |
| @@ -239,6 +253,16 @@ std::string DeterminePageLanguage(const std::string& code, |
| translate::ReportContentLanguage(code, modified_code); |
| } |
| + std::string cld_language = DetermineTextLanguage( |
| + contents, &is_cld_reliable, modified_code, modified_html_lang); |
| + translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); |
| + |
| + if (cld_language_p != NULL) |
| + *cld_language_p = cld_language; |
| + if (is_cld_reliable_p != NULL) |
| + *is_cld_reliable_p = is_cld_reliable; |
| + translate::ToTranslateLanguageSynonym(&cld_language); |
| + |
| // Adopt |modified_html_lang| if it is valid. Otherwise, adopt |
| // |modified_code|. |
| std::string language = modified_html_lang.empty() ? modified_code : |