Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(9)

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1847713002: Add histograms to measure CLD2 language detection as well as the accuracy of the chosen detection. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Remove the summary language bugfix Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/translate/core/language_detection/language_detection_util.h" 5 #include "components/translate/core/language_detection/language_detection_util.h"
6 6
7 #include <stddef.h> 7 #include <stddef.h>
8 8
9 #include "base/logging.h" 9 #include "base/logging.h"
10 #include "base/macros.h" 10 #include "base/macros.h"
11 #include "base/metrics/histogram_macros.h"
11 #include "base/strings/string_split.h" 12 #include "base/strings/string_split.h"
12 #include "base/strings/string_util.h" 13 #include "base/strings/string_util.h"
13 #include "base/strings/utf_string_conversions.h" 14 #include "base/strings/utf_string_conversions.h"
14 #include "base/time/time.h" 15 #include "base/time/time.h"
15 #include "components/translate/core/common/translate_constants.h" 16 #include "components/translate/core/common/translate_constants.h"
16 #include "components/translate/core/common/translate_metrics.h" 17 #include "components/translate/core/common/translate_metrics.h"
17 #include "components/translate/core/common/translate_util.h" 18 #include "components/translate/core/common/translate_util.h"
18 19
19 #if CLD_VERSION==1 20 #if CLD_VERSION==1
20 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" 21 #include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
(...skipping 108 matching lines...) Expand 10 before | Expand all | Expand 10 after
129 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags, 130 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
130 language3, percent3, normalized_score3, 131 language3, percent3, normalized_score3,
131 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable); 132 nullptr /* No ResultChunkVector used */, &text_bytes, &is_reliable);
132 } 133 }
133 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && 134 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
134 cld_language != CLD2::UNKNOWN_LANGUAGE && 135 cld_language != CLD2::UNKNOWN_LANGUAGE &&
135 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 136 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
136 137
137 // Choose top language. 138 // Choose top language.
138 cld_language = language3[0]; 139 cld_language = language3[0];
140 UMA_HISTOGRAM_ENUMERATION("Translate.CLD2LanguageDetected",
141 cld_language, CLD2::NUM_LANGUAGES);
Alexei Svitkine (slow) 2016/03/31 15:10:21 Does CLD guarantee that its language enums won't
rkaplow 2016/03/31 15:26:14 I suspect only the unused space (i.e. x_nnn) would
142 if (is_valid_language) {
Alexei Svitkine (slow) 2016/03/31 15:10:22 Nit: No {}'s
rkaplow 2016/03/31 15:26:14 Done.
143 UMA_HISTOGRAM_PERCENTAGE("Translate.CLD2LanguageAccuracy", percent3[0]);
144 }
145
139 #else 146 #else
140 # error "CLD_VERSION must be 1 or 2" 147 # error "CLD_VERSION must be 1 or 2"
141 #endif 148 #endif
142 149
143 if (is_cld_reliable != NULL) 150 if (is_cld_reliable != NULL)
144 *is_cld_reliable = is_reliable; 151 *is_cld_reliable = is_reliable;
145 152
146 // We don't trust the result if the CLD reports that the detection is not 153 // We don't trust the result if the CLD reports that the detection is not
147 // reliable, or if the actual text used to detect the language was less than 154 // reliable, or if the actual text used to detect the language was less than
148 // 100 bytes (short texts can often lead to wrong results). 155 // 100 bytes (short texts can often lead to wrong results).
149 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that 156 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
150 // the determined language code is correct with 50% confidence. Chrome should 157 // the determined language code is correct with 50% confidence. Chrome should
151 // handle the real confidence value to judge. 158 // handle the real confidence value to judge.
152 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) { 159 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) {
153 // We should not use LanguageCode_ISO_639_1 because it does not cover all 160 // We should not use LanguageCode_ISO_639_1 because it does not cover all
154 // the languages CLD can detect. As a result, it'll return the invalid 161 // the languages CLD can detect. As a result, it'll return the invalid
155 // language code for tradtional Chinese among others. 162 // language code for traditional Chinese among others.
156 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and 163 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
157 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN 164 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
158 // for Simplified Chinese. 165 // for Simplified Chinese.
159 #if CLD_VERSION==1 166 #if CLD_VERSION==1
160 language = LanguageCodeWithDialects(static_cast<Language>(cld_language)); 167 language = LanguageCodeWithDialects(static_cast<Language>(cld_language));
161 #elif CLD_VERSION==2 168 #elif CLD_VERSION==2
162 // (1) CLD2's LanguageCode returns general Chinese 'zh' for 169 // (1) CLD2's LanguageCode returns general Chinese 'zh' for
163 // CLD2::CHINESE, but Translate server doesn't accept it. This is 170 // CLD2::CHINESE, but Translate server doesn't accept it. This is
164 // converted to 'zh-CN' in the same way as CLD1's 171 // converted to 'zh-CN' in the same way as CLD1's
165 // LanguageCodeWithDialects. 172 // LanguageCodeWithDialects.
(...skipping 219 matching lines...) Expand 10 before | Expand all | Expand 10 after
385 // distinguish from English, and the language is one of well-known languages 392 // distinguish from English, and the language is one of well-known languages
386 // which often provide "en-*" meta information mistakenly. 393 // which often provide "en-*" meta information mistakenly.
387 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 394 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
388 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 395 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
389 return true; 396 return true;
390 } 397 }
391 return false; 398 return false;
392 } 399 }
393 400
394 } // namespace translate 401 } // namespace translate
OLDNEW
« no previous file with comments | « no previous file | tools/metrics/histograms/histograms.xml » ('j') | tools/metrics/histograms/histograms.xml » ('J')

Powered by Google App Engine
This is Rietveld 408576698