Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(153)

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1125403004: Switch language detection to use CLD2's DetectLanguageCheckUTF8 method. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: git cl format Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/translate/core/language_detection/language_detection_util.h" 5 #include "components/translate/core/language_detection/language_detection_util.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h" 8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_split.h" 9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h" 10 #include "base/strings/string_util.h"
(...skipping 70 matching lines...) Expand 10 before | Expand all | Expand 10 after
81 return CLD_VERSION; 81 return CLD_VERSION;
82 #endif 82 #endif
83 } 83 }
84 84
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
86 // failed. 86 // failed.
87 // |is_cld_reliable| will be set to true if CLD says the detection is reliable. 87 // |is_cld_reliable| will be set to true if CLD says the detection is reliable.
88 std::string DetermineTextLanguage(const base::string16& text, 88 std::string DetermineTextLanguage(const base::string16& text,
89 bool* is_cld_reliable) { 89 bool* is_cld_reliable) {
90 std::string language = translate::kUnknownLanguageCode; 90 std::string language = translate::kUnknownLanguageCode;
91 int text_bytes = 0; 91 int num_bytes_evaluated = 0;
92 bool is_reliable = false; 92 bool is_reliable = false;
93 const bool is_plain_text = true;
93 94
94 // Language or CLD2::Language 95 // Language or CLD2::Language
95 int cld_language = 0; 96 int cld_language = 0;
96 bool is_valid_language = false; 97 bool is_valid_language = false;
97 98
98 switch (GetCLDMajorVersion()) { 99 switch (GetCLDMajorVersion()) {
99 #if !defined(CLD_VERSION) || CLD_VERSION==1 100 #if !defined(CLD_VERSION) || CLD_VERSION==1
100 case 1: { 101 case 1: {
101 int num_languages = 0; 102 int num_languages = 0;
102 cld_language = 103 cld_language = DetectLanguageOfUnicodeText(
103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, 104 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL,
104 &num_languages, NULL, &text_bytes); 105 &num_bytes_evaluated);
105 is_valid_language = cld_language != NUM_LANGUAGES && 106 is_valid_language = cld_language != NUM_LANGUAGES &&
106 cld_language != UNKNOWN_LANGUAGE && 107 cld_language != UNKNOWN_LANGUAGE &&
107 cld_language != TG_UNKNOWN_LANGUAGE; 108 cld_language != TG_UNKNOWN_LANGUAGE;
108 break; 109 break;
109 } 110 }
110 #endif 111 #endif
111 #if !defined(CLD_VERSION) || CLD_VERSION==2 112 #if !defined(CLD_VERSION) || CLD_VERSION==2
112 case 2: { 113 case 2: {
113 std::string utf8_text(base::UTF16ToUTF8(text)); 114 const std::string utf8_text(base::UTF16ToUTF8(text));
114 CLD2::Language language3[3]; 115 const int num_utf8_bytes = static_cast<int>(utf8_text.size());
115 int percent3[3]; 116 const char* raw_utf8_bytes = utf8_text.c_str();
116 CLD2::DetectLanguageSummary( 117 cld_language = CLD2::DetectLanguageCheckUTF8(
117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3, 118 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable,
118 &text_bytes, &is_reliable); 119 &num_bytes_evaluated);
119 cld_language = language3[0]; 120
121 if (num_bytes_evaluated < num_utf8_bytes &&
122 cld_language == CLD2::UNKNOWN_LANGUAGE) {
123 // Invalid UTF8 encountered, see bug http://crbug.com/444258.
124 // Retry using only the valid characters. This time the check for valid
125 // UTF8 can be skipped since the precise number of valid bytes is known.
126 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated,
127 is_plain_text, &is_reliable);
128 }
120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && 129 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
121 cld_language != CLD2::UNKNOWN_LANGUAGE && 130 cld_language != CLD2::UNKNOWN_LANGUAGE &&
122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 131 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
123 break; 132 break;
124 } 133 }
125 #endif 134 #endif
126 default: 135 default:
127 NOTREACHED(); 136 NOTREACHED();
128 } 137 }
129 138
130 if (is_cld_reliable != NULL) 139 if (is_cld_reliable != NULL)
131 *is_cld_reliable = is_reliable; 140 *is_cld_reliable = is_reliable;
132 141
133 // We don't trust the result if the CLD reports that the detection is not 142 // We don't trust the result if the CLD reports that the detection is not
134 // reliable, or if the actual text used to detect the language was less than 143 // reliable, or if the actual text used to detect the language was less than
135 // 100 bytes (short texts can often lead to wrong results). 144 // 100 bytes (short texts can often lead to wrong results).
136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that 145 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
137 // the determined language code is correct with 50% confidence. Chrome should 146 // the determined language code is correct with 50% confidence. Chrome should
138 // handle the real confidence value to judge. 147 // handle the real confidence value to judge.
139 if (is_reliable && text_bytes >= 100 && is_valid_language) { 148 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) {
140 // We should not use LanguageCode_ISO_639_1 because it does not cover all 149 // We should not use LanguageCode_ISO_639_1 because it does not cover all
141 // the languages CLD can detect. As a result, it'll return the invalid 150 // the languages CLD can detect. As a result, it'll return the invalid
142 // language code for traditional Chinese among others. 151 // language code for traditional Chinese among others.
143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and 152 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 153 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
145 // for Simplified Chinese. 154 // for Simplified Chinese.
146 switch (GetCLDMajorVersion()) { 155 switch (GetCLDMajorVersion()) {
147 #if !defined(CLD_VERSION) || CLD_VERSION==1 156 #if !defined(CLD_VERSION) || CLD_VERSION==1
148 case 1: 157 case 1:
149 language = 158 language =
(...skipping 230 matching lines...) Expand 10 before | Expand all | Expand 10 after
380 // distinguish from English, and the language is one of well-known languages 389 // distinguish from English, and the language is one of well-known languages
381 // which often provide "en-*" meta information mistakenly. 390 // which often provide "en-*" meta information mistakenly.
382 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 391 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
383 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 392 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
384 return true; 393 return true;
385 } 394 }
386 return false; 395 return false;
387 } 396 }
388 397
389 } // namespace translate 398 } // namespace translate
OLD NEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698