components/translate/language_detection/language_detection_util.cc - Issue 275233004: Move language_detection to core

Side by Side Diff: components/translate/language_detection/language_detection_util.cc

Issue 275233004: Move language_detection to core (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Adding TBR=jam@ Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « components/translate/language_detection/language_detection_util.h ('k') | components/translate/language_detection/language_detection_util_unittest.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright 2013 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 #include "components/translate/language_detection/language_detection_util.h"

6

7 #include "base/logging.h"

8 #include "base/metrics/field_trial.h"

9 #include "base/strings/string_split.h"

10 #include "base/strings/string_util.h"

11 #include "base/strings/utf_string_conversions.h"

12 #include "base/time/time.h"

13 #include "components/translate/core/common/translate_constants.h"

14 #include "components/translate/core/common/translate_metrics.h"

15 #include "components/translate/core/common/translate_util.h"

16

17 #if !defined(CLD_VERSION) \|\| CLD_VERSION==1

18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"

19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"

20 #endif

21

22 #if !defined(CLD_VERSION) \|\| CLD_VERSION==2

23 #include "third_party/cld_2/src/public/compact_lang_det.h"

24 #endif

25

26 namespace {

27

28 // Similar language code list. Some languages are very similar and difficult

29 // for CLD to distinguish.

30 struct SimilarLanguageCode {

31 const char* const code;

32 int group;

33 };

34

35 const SimilarLanguageCode kSimilarLanguageCodes[] = {

36 {"bs", 1},

37 {"hr", 1},

38 {"hi", 2},

39 {"ne", 2},

40 };

41

42 // Checks \|kSimilarLanguageCodes\| and returns group code.

43 int GetSimilarLanguageGroupCode(const std::string& language) {

44 for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {

45 if (language.find(kSimilarLanguageCodes[i].code) != 0)

46 continue;

47 return kSimilarLanguageCodes[i].group;

48 }

49 return 0;

50 }

51

// Well-known languages which often have wrong server configuration of
// Content-Language: en.
// The table is read-only: both the characters and the pointers are const so
// that an entry cannot be re-pointed by accident.
// TODO(toyoshim): Move these static tables and caller functions to
// translate/common, and implement them as std::set<>.
const char* const kWellKnownCodesOnWrongConfiguration[] = {
  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
};

59

60 // Applies a series of language code modification in proper order.

61 void ApplyLanguageCodeCorrection(std::string* code) {

62 // Correct well-known format errors.

63 translate::CorrectLanguageCodeTypo(code);

64

65 if (!translate::IsValidLanguageCode(*code)) {

66 *code = std::string();

67 return;

68 }

69

70 translate::ToTranslateLanguageSynonym(code);

71 }

72

73 int GetCLDMajorVersion() {

74 #if !defined(CLD_VERSION)

75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");

76 if (group_name == "CLD2")

77 return 2;

78 else

79 return 1;

80 #else

81 return CLD_VERSION;

82 #endif

83 }

84

85 // Returns the ISO 639 language code of the specified \|text\|, or 'unknown' if it

86 // failed.

87 // \|is_cld_reliable\| will be set as true if CLD says the detection is reliable.

88 std::string DetermineTextLanguage(const base::string16& text,

89 bool* is_cld_reliable) {

90 std::string language = translate::kUnknownLanguageCode;

91 int text_bytes = 0;

92 bool is_reliable = false;

93

94 // Language or CLD2::Language

95 int cld_language = 0;

96 bool is_valid_language = false;

97

98 switch (GetCLDMajorVersion()) {

99 #if !defined(CLD_VERSION) \|\| CLD_VERSION==1

100 case 1: {

101 int num_languages = 0;

102 cld_language =

103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,

104 &num_languages, NULL, &text_bytes);

105 is_valid_language = cld_language != NUM_LANGUAGES &&

106 cld_language != UNKNOWN_LANGUAGE &&

107 cld_language != TG_UNKNOWN_LANGUAGE;

108 break;

109 }

110 #endif

111 #if !defined(CLD_VERSION) \|\| CLD_VERSION==2

112 case 2: {

113 std::string utf8_text(base::UTF16ToUTF8(text));

114 CLD2::Language language3[3];

115 int percent3[3];

116 CLD2::DetectLanguageSummary(

117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3,

118 &text_bytes, &is_reliable);

119 cld_language = language3[0];

120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&

121 cld_language != CLD2::UNKNOWN_LANGUAGE &&

122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;

123 break;

124 }

125 #endif

126 default:

127 NOTREACHED();

128 }

129

130 if (is_cld_reliable != NULL)

131 *is_cld_reliable = is_reliable;

132

133 // We don't trust the result if the CLD reports that the detection is not

134 // reliable, or if the actual text used to detect the language was less than

135 // 100 bytes (short texts can often lead to wrong results).

136 // TODO(toyoshim): CLD provides \|is_reliable\| flag. But, it just says that

137 // the determined language code is correct with 50% confidence. Chrome should

138 // handle the real confidence value to judge.

139 if (is_reliable && text_bytes >= 100 && is_valid_language) {

140 // We should not use LanguageCode_ISO_639_1 because it does not cover all

141 // the languages CLD can detect. As a result, it'll return the invalid

142 // language code for tradtional Chinese among others.

143 // \|LanguageCodeWithDialect\| will go through ISO 639-1, ISO-639-2 and

144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN

145 // for Simplified Chinese.

146 switch (GetCLDMajorVersion()) {

147 #if !defined(CLD_VERSION) \|\| CLD_VERSION==1

148 case 1:

149 language =

150 LanguageCodeWithDialects(static_cast<Language>(cld_language));

151 break;

152 #endif

153 #if !defined(CLD_VERSION) \|\| CLD_VERSION==2

154 case 2:

155 // (1) CLD2's LanguageCode returns general Chinese 'zh' for

156 // CLD2::CHINESE, but Translate server doesn't accept it. This is

157 // converted to 'zh-CN' in the same way as CLD1's

158 // LanguageCodeWithDialects.

159 //

160 // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for

161 // CLD2::CHINESE_T. This is technically more precise for the language

162 // code of traditional Chinese, while Translate server hasn't accepted

163 // zh-Hant yet.

164 if (cld_language == CLD2::CHINESE) {

165 language = "zh-CN";

166 } else if (cld_language == CLD2::CHINESE_T) {

167 language = "zh-TW";

168 } else {

169 language =

170 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));

171 }

172 break;

173 #endif

174 default:

175 NOTREACHED();

176 }

177 }

178 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text

179 << "\n*************************************\n";

180 return language;

181 }

182

183 // Checks if CLD can complement a sub code when the page language doesn't know

184 // the sub code.

185 bool CanCLDComplementSubCode(

186 const std::string& page_language, const std::string& cld_language) {

187 // Translate server cannot treat general Chinese. If Content-Language and

188 // CLD agree that the language is Chinese and Content-Language doesn't know

189 // which dialect is used, CLD language has priority.

190 // TODO(hajimehoshi): How about the other dialects like zh-MO?

191 return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false);

192 }

193

194 } // namespace

195

196 namespace translate {

197

198 std::string DeterminePageLanguage(const std::string& code,

199 const std::string& html_lang,

200 const base::string16& contents,

201 std::string* cld_language_p,

202 bool* is_cld_reliable_p) {

203 base::TimeTicks begin_time = base::TimeTicks::Now();

204 bool is_cld_reliable;

205 std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);

206 translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());

207

208 if (cld_language_p != NULL)

209 *cld_language_p = cld_language;

210 if (is_cld_reliable_p != NULL)

211 *is_cld_reliable_p = is_cld_reliable;

212 translate::ToTranslateLanguageSynonym(&cld_language);

213

214 // Check if html lang attribute is valid.

215 std::string modified_html_lang;

216 if (!html_lang.empty()) {

217 modified_html_lang = html_lang;

218 ApplyLanguageCodeCorrection(&modified_html_lang);

219 translate::ReportHtmlLang(html_lang, modified_html_lang);

220 VLOG(9) << "html lang based language code: " << modified_html_lang;

221 }

222

223 // Check if Content-Language is valid.

224 std::string modified_code;

225 if (!code.empty()) {

226 modified_code = code;

227 ApplyLanguageCodeCorrection(&modified_code);

228 translate::ReportContentLanguage(code, modified_code);

229 }

230

231 // Adopt \|modified_html_lang\| if it is valid. Otherwise, adopt

232 // \|modified_code\|.

233 std::string language = modified_html_lang.empty() ? modified_code :

234 modified_html_lang;

235

236 // If \|language\| is empty, just use CLD result even though it might be

237 // translate::kUnknownLanguageCode.

238 if (language.empty()) {

239 translate::ReportLanguageVerification(

240 translate::LANGUAGE_VERIFICATION_CLD_ONLY);

241 return cld_language;

242 }

243

244 if (cld_language == kUnknownLanguageCode) {

245 translate::ReportLanguageVerification(

246 translate::LANGUAGE_VERIFICATION_UNKNOWN);

247 return language;

248 }

249

250 if (CanCLDComplementSubCode(language, cld_language)) {

251 translate::ReportLanguageVerification(

252 translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);

253 return cld_language;

254 }

255

256 if (IsSameOrSimilarLanguages(language, cld_language)) {

257 translate::ReportLanguageVerification(

258 translate::LANGUAGE_VERIFICATION_CLD_AGREE);

259 return language;

260 }

261

262 if (MaybeServerWrongConfiguration(language, cld_language)) {

263 translate::ReportLanguageVerification(

264 translate::LANGUAGE_VERIFICATION_TRUST_CLD);

265 return cld_language;

266 }

267

268 // Content-Language value might be wrong because CLD says that this page is

269 // written in another language with confidence. In this case, Chrome doesn't

270 // rely on any of the language codes, and gives up suggesting a translation.

271 translate::ReportLanguageVerification(

272 translate::LANGUAGE_VERIFICATION_CLD_DISAGREE);

273 return kUnknownLanguageCode;

274 }

275

276 void CorrectLanguageCodeTypo(std::string* code) {

277 DCHECK(code);

278

279 size_t coma_index = code->find(',');

280 if (coma_index != std::string::npos) {

281 // There are more than 1 language specified, just keep the first one.

282 *code = code->substr(0, coma_index);

283 }

284 base::TrimWhitespaceASCII(*code, base::TRIM_ALL, code);

285

286 // An underscore instead of a dash is a frequent mistake.

287 size_t underscore_index = code->find('_');

288 if (underscore_index != std::string::npos)

289 (*code)[underscore_index] = '-';

290

291 // Change everything up to a dash to lower-case and everything after to upper.

292 size_t dash_index = code->find('-');

293 if (dash_index != std::string::npos) {

294 *code = StringToLowerASCII(code->substr(0, dash_index)) +

295 StringToUpperASCII(code->substr(dash_index));

296 } else {

297 code = StringToLowerASCII(code);

298 }

299 }

300

301 bool IsValidLanguageCode(const std::string& code) {

302 // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/.

303 // TODO(hajimehoshi): How about es-419, which is used as an Accept language?

304 std::vector<std::string> chunks;

305 base::SplitString(code, '-', &chunks);

306

307 if (chunks.size() < 1 \|\| 2 < chunks.size())

308 return false;

309

310 const std::string& main_code = chunks[0];

311

312 if (main_code.size() < 1 \|\| 3 < main_code.size())

313 return false;

314

315 for (std::string::const_iterator it = main_code.begin();

316 it != main_code.end(); ++it) {

317 if (!IsAsciiAlpha(*it))

318 return false;

319 }

320

321 if (chunks.size() == 1)

322 return true;

323

324 const std::string& sub_code = chunks[1];

325

326 if (sub_code.size() != 2)

327 return false;

328

329 for (std::string::const_iterator it = sub_code.begin();

330 it != sub_code.end(); ++it) {

331 if (!IsAsciiAlpha(*it))

332 return false;

333 }

334

335 return true;

336 }

337

338 bool IsSameOrSimilarLanguages(const std::string& page_language,

339 const std::string& cld_language) {

340 std::vector<std::string> chunks;

341

342 base::SplitString(page_language, '-', &chunks);

343 if (chunks.size() == 0)

344 return false;

345 std::string page_language_main_part = chunks[0];

346

347 base::SplitString(cld_language, '-', &chunks);

348 if (chunks.size() == 0)

349 return false;

350 std::string cld_language_main_part = chunks[0];

351

352 // Language code part of \|page_language\| is matched to one of \|cld_language\|.

353 // Country code is ignored here.

354 if (page_language_main_part == cld_language_main_part) {

355 // Languages are matched strictly. Reports false to metrics, but returns

356 // true.

357 translate::ReportSimilarLanguageMatch(false);

358 return true;

359 }

360

361 // Check if \|page_language\| and \|cld_language\| are in the similar language

362 // list and belong to the same language group.

363 int page_code = GetSimilarLanguageGroupCode(page_language);

364 bool match = page_code != 0 &&

365 page_code == GetSimilarLanguageGroupCode(cld_language);

366

367 translate::ReportSimilarLanguageMatch(match);

368 return match;

369 }

370

371 bool MaybeServerWrongConfiguration(const std::string& page_language,

372 const std::string& cld_language) {

373 // If \|page_language\| is not "en-*", respect it and just return false here.

374 if (!StartsWithASCII(page_language, "en", false))

375 return false;

376

377 // A server provides a language meta information representing "en-*". But it

378 // might be just a default value due to missing user configuration.

379 // Let's trust \|cld_language\| if the determined language is not difficult to

380 // distinguish from English, and the language is one of well-known languages

381 // which often provide "en-*" meta information mistakenly.

382 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {

383 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])

384 return true;

385 }

386 return false;

387 }

388

389 } // namespace translate

OLD	NEW