chrome/common/translate/language_detection_util.cc - Issue 25531002: Move language detection to a component

Side by Side Diff: chrome/common/translate/language_detection_util.cc

Issue 25531002: Move language detection to a component (Closed) Base URL: http://git.chromium.org/chromium/src.git@master

Patch Set: Run translate unittests on iOS Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "chrome/common/translate/language_detection_util.h"

6

7 #include "base/logging.h"

8 #include "base/metrics/field_trial.h"

9 #include "base/strings/string_split.h"

10 #include "base/strings/string_util.h"

11 #include "base/strings/utf_string_conversions.h"

12 #include "base/time/time.h"

13 #include "chrome/common/chrome_constants.h"

14 #include "chrome/common/translate/translate_common_metrics.h"

15 #include "chrome/common/translate/translate_util.h"

16

17 #if !defined(CLD_VERSION) \|\| CLD_VERSION==1

18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"

19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"

20 #endif

21

22 #if !defined(CLD_VERSION) \|\| CLD_VERSION==2

23 #include "third_party/cld_2/src/public/compact_lang_det.h"

24 #endif

25

26 namespace {

27

// Table of language codes that are very similar to each other and therefore
// difficult for CLD to distinguish. Entries carrying the same |group| value
// belong to the same group of similar languages.
struct SimilarLanguageCode {
  const char* const code;  // Language code, e.g. "bs".
  int group;               // Identifier of the similar-language group.
};

const SimilarLanguageCode kSimilarLanguageCodes[] = {
  {"bs", 1}, {"hr", 1},  // Group 1.
  {"hi", 2}, {"ne", 2},  // Group 2.
};

41

42 // Checks \|kSimilarLanguageCodes\| and returns group code.

43 int GetSimilarLanguageGroupCode(const std::string& language) {

44 for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {

45 if (language.find(kSimilarLanguageCodes[i].code) != 0)

46 continue;

47 return kSimilarLanguageCodes[i].group;

48 }

49 return 0;

50 }

51

// Well-known languages which often have wrong server configuration of
// Content-Language: en.
// The table is read-only, so both the strings and the pointers stored in the
// array are const.
// TODO(toyoshim): Move these static tables and caller functions to
// chrome/common/translate, and implement them as std::set<>.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};

59

60 // Applies a series of language code modification in proper order.

61 void ApplyLanguageCodeCorrection(std::string* code) {

62 // Correct well-known format errors.

63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code);

64

65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) {

66 *code = std::string();

67 return;

68 }

69

70 TranslateUtil::ToTranslateLanguageSynonym(code);

71 }

72

73 int GetCLDMajorVersion() {

74 #if !defined(CLD_VERSION)

75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");

76 if (group_name == "CLD2")

77 return 2;

78 else

79 return 1;

80 #else

81 return CLD_VERSION;

82 #endif

83 }

84

85 // Returns the ISO 639 language code of the specified \|text\|, or 'unknown' if it

86 // failed.

87 // \|is_cld_reliable\| will be set as true if CLD says the detection is reliable.

88 std::string DetermineTextLanguage(const base::string16& text,

89 bool* is_cld_reliable) {

90 std::string language = chrome::kUnknownLanguageCode;

91 int text_bytes = 0;

92 bool is_reliable = false;

93

94 // Language or CLD2::Language

95 int cld_language = 0;

96 bool is_valid_language = false;

97

98 switch (GetCLDMajorVersion()) {

99 #if !defined(CLD_VERSION) \|\| CLD_VERSION==1

100 case 1: {

101 int num_languages = 0;

102 cld_language =

103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,

104 &num_languages, NULL, &text_bytes);

105 is_valid_language = cld_language != NUM_LANGUAGES &&

106 cld_language != UNKNOWN_LANGUAGE &&

107 cld_language != TG_UNKNOWN_LANGUAGE;

108 break;

109 }

110 #endif

111 #if !defined(CLD_VERSION) \|\| CLD_VERSION==2

112 case 2: {

113 std::string utf8_text(UTF16ToUTF8(text));

114 CLD2::Language language3[3];

115 int percent3[3];

116 cld_language =

117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true,

118 language3, percent3,

119 &text_bytes, &is_reliable);

120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&

121 cld_language != CLD2::UNKNOWN_LANGUAGE &&

122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;

123 break;

124 }

125 #endif

126 default:

127 NOTREACHED();

128 }

129

130 if (is_cld_reliable != NULL)

131 *is_cld_reliable = is_reliable;

132

133 // We don't trust the result if the CLD reports that the detection is not

134 // reliable, or if the actual text used to detect the language was less than

135 // 100 bytes (short texts can often lead to wrong results).

136 // TODO(toyoshim): CLD provides \|is_reliable\| flag. But, it just says that

137 // the determined language code is correct with 50% confidence. Chrome should

138 // handle the real confidence value to judge.

139 if (is_reliable && text_bytes >= 100 && is_valid_language) {

140 // We should not use LanguageCode_ISO_639_1 because it does not cover all

141 // the languages CLD can detect. As a result, it'll return the invalid

142 // language code for tradtional Chinese among others.

143 // \|LanguageCodeWithDialect\| will go through ISO 639-1, ISO-639-2 and

144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN

145 // for Simplified Chinese.

146 switch (GetCLDMajorVersion()) {

147 #if !defined(CLD_VERSION) \|\| CLD_VERSION==1

148 case 1:

149 language =

150 LanguageCodeWithDialects(static_cast<Language>(cld_language));

151 break;

152 #endif

153 #if !defined(CLD_VERSION) \|\| CLD_VERSION==2

154 case 2:

155 // (1) CLD2's LanguageCode returns general Chinese 'zh' for

156 // CLD2::CHINESE, but Translate server doesn't accept it. This is

157 // converted to 'zh-CN' in the same way as CLD1's

158 // LanguageCodeWithDialects.

159 //

160 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for

161 // CLD2::CHINESE_T. This is technically more precise for the language

162 // code of traditional Chinese, while Translate server hasn't accepted

163 // zh-Hant yet.

164 if (cld_language == CLD2::CHINESE) {

165 language = "zh-CN";

166 } else if (cld_language == CLD2::CHINESE_T) {

167 language = "zh-TW";

168 } else {

169 language =

170 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));

171 }

172 break;

173 #endif

174 default:

175 NOTREACHED();

176 }

177 }

178 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text

179 << "\n*************************************\n";

180 return language;

181 }

182

183 // Checks if CLD can complement a sub code when the page language doesn't know

184 // the sub code.

185 bool CanCLDComplementSubCode(

186 const std::string& page_language, const std::string& cld_language) {

187 // Translate server cannot treat general Chinese. If Content-Language and

188 // CLD agree that the language is Chinese and Content-Language doesn't know

189 // which dialect is used, CLD language has priority.

190 // TODO(hajimehoshi): How about the other dialects like zh-MO?

191 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);

192 }

193

194 } // namespace

195

196 namespace LanguageDetectionUtil {

197

198 std::string DeterminePageLanguage(const std::string& code,

199 const std::string& html_lang,

200 const base::string16& contents,

201 std::string* cld_language_p,

202 bool* is_cld_reliable_p) {

203 base::TimeTicks begin_time = base::TimeTicks::Now();

204 bool is_cld_reliable;

205 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);

206 TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time,

207 base::TimeTicks::Now());

208

209 if (cld_language_p != NULL)

210 *cld_language_p = cld_language;

211 if (is_cld_reliable_p != NULL)

212 *is_cld_reliable_p = is_cld_reliable;

213 TranslateUtil::ToTranslateLanguageSynonym(&cld_language);

214

215 // Check if html lang attribute is valid.

216 std::string modified_html_lang;

217 if (!html_lang.empty()) {

218 modified_html_lang = html_lang;

219 ApplyLanguageCodeCorrection(&modified_html_lang);

220 TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang);

221 VLOG(9) << "html lang based language code: " << modified_html_lang;

222 }

223

224 // Check if Content-Language is valid.

225 std::string modified_code;

226 if (!code.empty()) {

227 modified_code = code;

228 ApplyLanguageCodeCorrection(&modified_code);

229 TranslateCommonMetrics::ReportContentLanguage(code, modified_code);

230 }

231

232 // Adopt \|modified_html_lang\| if it is valid. Otherwise, adopt

233 // \|modified_code\|.

234 std::string language = modified_html_lang.empty() ? modified_code :

235 modified_html_lang;

236

237 // If \|language\| is empty, just use CLD result even though it might be

238 // chrome::kUnknownLanguageCode.

239 if (language.empty()) {

240 TranslateCommonMetrics::ReportLanguageVerification(

241 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY);

242 return cld_language;

243 }

244

245 if (cld_language == chrome::kUnknownLanguageCode) {

246 TranslateCommonMetrics::ReportLanguageVerification(

247 TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN);

248 return language;

249 } else if (CanCLDComplementSubCode(language, cld_language)) {

250 TranslateCommonMetrics::ReportLanguageVerification(

251 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);

252 return cld_language;

253 } else if (IsSameOrSimilarLanguages(language, cld_language)) {

254 TranslateCommonMetrics::ReportLanguageVerification(

255 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE);

256 return language;

257 } else if (MaybeServerWrongConfiguration(language, cld_language)) {

258 TranslateCommonMetrics::ReportLanguageVerification(

259 TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD);

260 return cld_language;

261 } else {

262 TranslateCommonMetrics::ReportLanguageVerification(

263 TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE);

264 // Content-Language value might be wrong because CLD says that this page

265 // is written in another language with confidence.

266 // In this case, Chrome doesn't rely on any of the language codes, and

267 // gives up suggesting a translation.

268 return std::string(chrome::kUnknownLanguageCode);

269 }

270

271 return language;

272 }

273

274 void CorrectLanguageCodeTypo(std::string* code) {

275 DCHECK(code);

276

277 size_t coma_index = code->find(',');

278 if (coma_index != std::string::npos) {

279 // There are more than 1 language specified, just keep the first one.

280 *code = code->substr(0, coma_index);

281 }

282 TrimWhitespaceASCII(*code, TRIM_ALL, code);

283

284 // An underscore instead of a dash is a frequent mistake.

285 size_t underscore_index = code->find('_');

286 if (underscore_index != std::string::npos)

287 (*code)[underscore_index] = '-';

288

289 // Change everything up to a dash to lower-case and everything after to upper.

290 size_t dash_index = code->find('-');

291 if (dash_index != std::string::npos) {

292 *code = StringToLowerASCII(code->substr(0, dash_index)) +

293 StringToUpperASCII(code->substr(dash_index));

294 } else {

295 code = StringToLowerASCII(code);

296 }

297 }

298

299 bool IsValidLanguageCode(const std::string& code) {

300 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.

301 // TODO(hajimehoshi): How about es-419, which is used as an Accept language?

302 std::vector<std::string> chunks;

303 base::SplitString(code, '-', &chunks);

304

305 if (chunks.size() < 1 \|\| 2 < chunks.size())

306 return false;

307

308 const std::string& main_code = chunks[0];

309

310 if (main_code.size() < 1 \|\| 3 < main_code.size())

311 return false;

312

313 for (std::string::const_iterator it = main_code.begin();

314 it != main_code.end(); ++it) {

315 if (!IsAsciiAlpha(*it))

316 return false;

317 }

318

319 if (chunks.size() == 1)

320 return true;

321

322 const std::string& sub_code = chunks[1];

323

324 if (sub_code.size() != 2)

325 return false;

326

327 for (std::string::const_iterator it = sub_code.begin();

328 it != sub_code.end(); ++it) {

329 if (!IsAsciiAlpha(*it))

330 return false;

331 }

332

333 return true;

334 }

335

336 bool IsSameOrSimilarLanguages(const std::string& page_language,

337 const std::string& cld_language) {

338 std::vector<std::string> chunks;

339

340 base::SplitString(page_language, '-', &chunks);

341 if (chunks.size() == 0)

342 return false;

343 std::string page_language_main_part = chunks[0];

344

345 base::SplitString(cld_language, '-', &chunks);

346 if (chunks.size() == 0)

347 return false;

348 std::string cld_language_main_part = chunks[0];

349

350 // Language code part of \|page_language\| is matched to one of \|cld_language\|.

351 // Country code is ignored here.

352 if (page_language_main_part == cld_language_main_part) {

353 // Languages are matched strictly. Reports false to metrics, but returns

354 // true.

355 TranslateCommonMetrics::ReportSimilarLanguageMatch(false);

356 return true;

357 }

358

359 // Check if \|page_language\| and \|cld_language\| are in the similar language

360 // list and belong to the same language group.

361 int page_code = GetSimilarLanguageGroupCode(page_language);

362 bool match = page_code != 0 &&

363 page_code == GetSimilarLanguageGroupCode(cld_language);

364

365 TranslateCommonMetrics::ReportSimilarLanguageMatch(match);

366 return match;

367 }

368

369 bool MaybeServerWrongConfiguration(const std::string& page_language,

370 const std::string& cld_language) {

371 // If \|page_language\| is not "en-*", respect it and just return false here.

372 if (!StartsWithASCII(page_language, "en", false))

373 return false;

374

375 // A server provides a language meta information representing "en-*". But it

376 // might be just a default value due to missing user configuration.

377 // Let's trust \|cld_language\| if the determined language is not difficult to

378 // distinguish from English, and the language is one of well-known languages

379 // which often provide "en-*" meta information mistakenly.

380 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {

381 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])

382 return true;

383 }

384 return false;

385 }

386

387 std::string GetCLDVersion() {

388 switch (GetCLDMajorVersion()) {

389 #if !defined(CLD_VERSION) \|\| CLD_VERSION==1

390 case 1:

391 return CompactLangDet::DetectLanguageVersion();

392 #endif

393 #if !defined(CLD_VERSION) \|\| CLD_VERSION==2

394 case 2:

395 return CLD2::DetectLanguageVersion();

396 #endif

397 default:

398 NOTREACHED();

399 }

400 return "";

401 }

402

403 } // namespace LanguageDetectionUtil

OLD	NEW

« no previous file with comments | « chrome/common/translate/language_detection_util.h ('k') | chrome/common/translate/language_detection_util_unittest.cc » ('j') | no next file with comments »