components/translate/core/language_detection/language_detection_util.cc - Issue 1125403004: Switch language detection to use CLD2's DetectLanguageCheckUTF8 method.

Unified Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1125403004: Switch language detection to use CLD2's DetectLanguageCheckUTF8 method. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: git cl format Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: components/translate/core/language_detection/language_detection_util.cc

diff --git a/components/translate/core/language_detection/language_detection_util.cc b/components/translate/core/language_detection/language_detection_util.cc

index 35a21b9019e007035260299103fcf259c355070c..ab25a7530b4cee704fbfc96ac2fecec387a42374 100644

--- a/components/translate/core/language_detection/language_detection_util.cc

+++ b/components/translate/core/language_detection/language_detection_util.cc

@@ -88,8 +88,9 @@ int GetCLDMajorVersion() {

std::string DetermineTextLanguage(const base::string16& text,

bool* is_cld_reliable) {

std::string language = translate::kUnknownLanguageCode;

- int text_bytes = 0;

+ int num_bytes_evaluated = 0;

bool is_reliable = false;

+ const bool is_plain_text = true;

// Language or CLD2::Language

int cld_language = 0;

@@ -99,9 +100,9 @@ std::string DetermineTextLanguage(const base::string16& text,

#if !defined(CLD_VERSION) || CLD_VERSION==1

case 1: {

int num_languages = 0;

- cld_language =

- DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,

- &num_languages, NULL, &text_bytes);

+ cld_language = DetectLanguageOfUnicodeText(

+ NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL,

+ &num_bytes_evaluated);

is_valid_language = cld_language != NUM_LANGUAGES &&

cld_language != UNKNOWN_LANGUAGE &&

cld_language != TG_UNKNOWN_LANGUAGE;

@@ -110,13 +111,21 @@ std::string DetermineTextLanguage(const base::string16& text,

#endif

#if !defined(CLD_VERSION) || CLD_VERSION==2

case 2: {

- std::string utf8_text(base::UTF16ToUTF8(text));

- CLD2::Language language3[3];

- int percent3[3];

- CLD2::DetectLanguageSummary(

- utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3,

- &text_bytes, &is_reliable);

- cld_language = language3[0];

+ const std::string utf8_text(base::UTF16ToUTF8(text));

+ const int num_utf8_bytes = static_cast<int>(utf8_text.size());

+ const char* raw_utf8_bytes = utf8_text.c_str();

+ cld_language = CLD2::DetectLanguageCheckUTF8(

+ raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable,

+ &num_bytes_evaluated);

+ if (num_bytes_evaluated < num_utf8_bytes &&

+ cld_language == CLD2::UNKNOWN_LANGUAGE) {

+ // Invalid UTF8 encountered, see bug http://crbug.com/444258.

+ // Retry using only the valid characters. This time the check for valid

+ // UTF8 can be skipped since the precise number of valid bytes is known.

+ cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated,

+ is_plain_text, &is_reliable);

+ }

is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&

cld_language != CLD2::UNKNOWN_LANGUAGE &&

cld_language != CLD2::TG_UNKNOWN_LANGUAGE;

@@ -136,7 +145,7 @@ std::string DetermineTextLanguage(const base::string16& text,

// TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that

// the determined language code is correct with 50% confidence. Chrome should

// handle the real confidence value to judge.

- if (is_reliable && text_bytes >= 100 && is_valid_language) {

+ if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) {

// We should not use LanguageCode_ISO_639_1 because it does not cover all

// the languages CLD can detect. As a result, it'll return the invalid

// language code for traditional Chinese among others.

« no previous file with comments | « no previous file | no next file » | no next file with comments »