Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(138)

Side by Side Diff: trunk/src/chrome/common/translate/language_detection_util.cc

Issue 23766011: Revert 221380 "Use Finch to compare the performances of CLD1 and..." (Closed) Base URL: svn://svn.chromium.org/chrome/
Patch Set: Created 7 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « trunk/src/chrome/common/DEPS ('k') | trunk/src/third_party/cld/README.chromium » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2013 The Chromium Authors. All rights reserved. 1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "chrome/common/translate/language_detection_util.h" 5 #include "chrome/common/translate/language_detection_util.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_split.h" 8 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h" 9 #include "base/strings/string_util.h"
11 #include "base/strings/utf_string_conversions.h"
12 #include "base/time/time.h" 10 #include "base/time/time.h"
13 #include "chrome/common/chrome_constants.h" 11 #include "chrome/common/chrome_constants.h"
14 #include "chrome/common/translate/translate_common_metrics.h" 12 #include "chrome/common/translate/translate_common_metrics.h"
15 #include "chrome/common/translate/translate_util.h" 13 #include "chrome/common/translate/translate_util.h"
16
17 #if !defined(CLD_VERSION) || CLD_VERSION==1
18 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 14 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
19 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" 15 #include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
20 #endif
21
22 #if !defined(CLD_VERSION) || CLD_VERSION==2
23 #include "third_party/cld_2/src/public/compact_lang_det.h"
24 #endif
25 16
26 namespace { 17 namespace {
27 18
28 // Similar language code list. Some languages are very similar and difficult 19 // Similar language code list. Some languages are very similar and difficult
29 // for CLD to distinguish. 20 // for CLD to distinguish.
30 struct SimilarLanguageCode { 21 struct SimilarLanguageCode {
31 const char* const code; 22 const char* const code;
32 int group; 23 int group;
33 }; 24 };
34 25
(...skipping 28 matching lines...) Expand all
63 LanguageDetectionUtil::CorrectLanguageCodeTypo(code); 54 LanguageDetectionUtil::CorrectLanguageCodeTypo(code);
64 55
65 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { 56 if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) {
66 *code = std::string(); 57 *code = std::string();
67 return; 58 return;
68 } 59 }
69 60
70 TranslateUtil::ToTranslateLanguageSynonym(code); 61 TranslateUtil::ToTranslateLanguageSynonym(code);
71 } 62 }
72 63
73 int GetCLDMajorVersion() {
74 #if !defined(CLD_VERSION)
75 std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2");
76 if (group_name == "CLD2")
77 return 2;
78 else
79 return 1;
80 #else
81 return CLD_VERSION;
82 #endif
83 }
84
85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it 64 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it
86 // failed. 65 // failed.
87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable. 66 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.
88 std::string DetermineTextLanguage(const base::string16& text, 67 std::string DetermineTextLanguage(const base::string16& text,
89 bool* is_cld_reliable) { 68 bool* is_cld_reliable) {
90 std::string language = chrome::kUnknownLanguageCode; 69 std::string language = chrome::kUnknownLanguageCode;
70 int num_languages = 0;
91 int text_bytes = 0; 71 int text_bytes = 0;
92 bool is_reliable = false; 72 bool is_reliable = false;
93 73 Language cld_language =
94 // Language or CLD2::Language 74 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
95 int cld_language = 0; 75 &num_languages, NULL, &text_bytes);
96 bool is_valid_language = false;
97
98 switch (GetCLDMajorVersion()) {
99 #if !defined(CLD_VERSION) || CLD_VERSION==1
100 case 1: {
101 int num_languages = 0;
102 cld_language =
103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
104 &num_languages, NULL, &text_bytes);
105 is_valid_language = cld_language != NUM_LANGUAGES &&
106 cld_language != UNKNOWN_LANGUAGE &&
107 cld_language != TG_UNKNOWN_LANGUAGE;
108 break;
109 }
110 #endif
111 #if !defined(CLD_VERSION) || CLD_VERSION==2
112 case 2: {
113 std::string utf8_text(UTF16ToUTF8(text));
114 CLD2::Language language3[3];
115 int percent3[3];
116 cld_language =
117 CLD2::DetectLanguageSummary(utf8_text.c_str(), utf8_text.size(), true,
118 language3, percent3,
119 &text_bytes, &is_reliable);
120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
121 cld_language != CLD2::UNKNOWN_LANGUAGE &&
122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
123 break;
124 }
125 #endif
126 default:
127 NOTREACHED();
128 }
129
130 if (is_cld_reliable != NULL) 76 if (is_cld_reliable != NULL)
131 *is_cld_reliable = is_reliable; 77 *is_cld_reliable = is_reliable;
132 78
133 // We don't trust the result if the CLD reports that the detection is not 79 // We don't trust the result if the CLD reports that the detection is not
134 // reliable, or if the actual text used to detect the language was less than 80 // reliable, or if the actual text used to detect the language was less than
135 // 100 bytes (short texts can often lead to wrong results). 81 // 100 bytes (short texts can often lead to wrong results).
136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that 82 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
137 // the determined language code is correct with 50% confidence. Chrome should 83 // the determined language code is correct with 50% confidence. Chrome should
138 // handle the real confidence value to judge. 84 // handle the real confidence value to judge.
139 if (is_reliable && text_bytes >= 100 && is_valid_language) { 85 if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES &&
86 cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
140 // We should not use LanguageCode_ISO_639_1 because it does not cover all 87 // We should not use LanguageCode_ISO_639_1 because it does not cover all
141 // the languages CLD can detect. As a result, it'll return the invalid 88 // the languages CLD can detect. As a result, it'll return the invalid
142 // language code for traditional Chinese among others. 89 // language code for traditional Chinese among others.
143 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and 90 // |LanguageCodeWithDialects| will go through ISO 639-1, ISO 639-2 and
144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 91 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
145 // for Simplified Chinese. 92 // for Simplified Chinese.
146 switch (GetCLDMajorVersion()) { 93 language = LanguageCodeWithDialects(cld_language);
147 #if !defined(CLD_VERSION) || CLD_VERSION==1
148 case 1:
149 language =
150 LanguageCodeWithDialects(static_cast<Language>(cld_language));
151 break;
152 #endif
153 #if !defined(CLD_VERSION) || CLD_VERSION==2
154 case 2:
155 if (cld_language == CLD2::CHINESE) {
156 language = "zh-CN";
157 } else {
158 language =
159 CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
160 }
161 break;
162 #endif
163 default:
164 NOTREACHED();
165 }
166 } 94 }
167 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text 95 VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
168 << "\n*************************************\n"; 96 << "\n*************************************\n";
169 return language; 97 return language;
170 } 98 }
171 99
172 // Checks if CLD can complement a sub code when the page language doesn't know 100 // Checks if CLD can complement a sub code when the page language doesn't know
173 // the sub code. 101 // the sub code.
174 bool CanCLDComplementSubCode( 102 bool CanCLDComplementSubCode(
175 const std::string& page_language, const std::string& cld_language) { 103 const std::string& page_language, const std::string& cld_language) {
(...skipping 180 matching lines...) Expand 10 before | Expand all | Expand 10 after
356 // distinguish from English, and the language is one of well-known languages 284 // distinguish from English, and the language is one of well-known languages
357 // which often provide "en-*" meta information mistakenly. 285 // which often provide "en-*" meta information mistakenly.
358 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 286 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
359 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 287 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
360 return true; 288 return true;
361 } 289 }
362 return false; 290 return false;
363 } 291 }
364 292
365 std::string GetCLDVersion() { 293 std::string GetCLDVersion() {
366 switch (GetCLDMajorVersion()) { 294 return CompactLangDet::DetectLanguageVersion();
367 #if !defined(CLD_VERSION) || CLD_VERSION==1
368 case 1:
369 return CompactLangDet::DetectLanguageVersion();
370 #endif
371 #if !defined(CLD_VERSION) || CLD_VERSION==2
372 case 2:
373 return CLD2::DetectLanguageVersion();
374 #endif
375 default:
376 NOTREACHED();
377 }
378 return "";
379 } 295 }
380 296
381 } // namespace LanguageDetectionUtil 297 } // namespace LanguageDetectionUtil
OLDNEW
« no previous file with comments | « trunk/src/chrome/common/DEPS ('k') | trunk/src/third_party/cld/README.chromium » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698