Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1007)

Unified Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1125403004: Switch language detection to use CLD2's DetectLanguageCheckUTF8 method. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: git cl format Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: components/translate/core/language_detection/language_detection_util.cc
diff --git a/components/translate/core/language_detection/language_detection_util.cc b/components/translate/core/language_detection/language_detection_util.cc
index 35a21b9019e007035260299103fcf259c355070c..ab25a7530b4cee704fbfc96ac2fecec387a42374 100644
--- a/components/translate/core/language_detection/language_detection_util.cc
+++ b/components/translate/core/language_detection/language_detection_util.cc
@@ -88,8 +88,9 @@ int GetCLDMajorVersion() {
std::string DetermineTextLanguage(const base::string16& text,
bool* is_cld_reliable) {
std::string language = translate::kUnknownLanguageCode;
- int text_bytes = 0;
+ int num_bytes_evaluated = 0;
bool is_reliable = false;
+ const bool is_plain_text = true;
// Language or CLD2::Language
int cld_language = 0;
@@ -99,9 +100,9 @@ std::string DetermineTextLanguage(const base::string16& text,
#if !defined(CLD_VERSION) || CLD_VERSION==1
case 1: {
int num_languages = 0;
- cld_language =
- DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
- &num_languages, NULL, &text_bytes);
+ cld_language = DetectLanguageOfUnicodeText(
+ NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL,
+ &num_bytes_evaluated);
is_valid_language = cld_language != NUM_LANGUAGES &&
cld_language != UNKNOWN_LANGUAGE &&
cld_language != TG_UNKNOWN_LANGUAGE;
@@ -110,13 +111,21 @@ std::string DetermineTextLanguage(const base::string16& text,
#endif
#if !defined(CLD_VERSION) || CLD_VERSION==2
case 2: {
- std::string utf8_text(base::UTF16ToUTF8(text));
- CLD2::Language language3[3];
- int percent3[3];
- CLD2::DetectLanguageSummary(
- utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3,
- &text_bytes, &is_reliable);
- cld_language = language3[0];
+ const std::string utf8_text(base::UTF16ToUTF8(text));
+ const int num_utf8_bytes = static_cast<int>(utf8_text.size());
+ const char* raw_utf8_bytes = utf8_text.c_str();
+ cld_language = CLD2::DetectLanguageCheckUTF8(
+ raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable,
+ &num_bytes_evaluated);
+
+ if (num_bytes_evaluated < num_utf8_bytes &&
+ cld_language == CLD2::UNKNOWN_LANGUAGE) {
+ // Invalid UTF8 encountered, see bug http://crbug.com/444258.
+ // Retry using only the valid characters. This time the check for valid
+ // UTF8 can be skipped since the precise number of valid bytes is known.
+ cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated,
+ is_plain_text, &is_reliable);
+ }
is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
cld_language != CLD2::UNKNOWN_LANGUAGE &&
cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
@@ -136,7 +145,7 @@ std::string DetermineTextLanguage(const base::string16& text,
// TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
// the determined language code is correct with 50% confidence. Chrome should
// handle the real confidence value to judge.
- if (is_reliable && text_bytes >= 100 && is_valid_language) {
+ if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) {
// We should not use LanguageCode_ISO_639_1 because it does not cover all
// the languages CLD can detect. As a result, it'll return the invalid
// language code for tradtional Chinese among others.
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698