Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(353)

Unified Diff: components/translate/core/language_detection/chinese_script_classifier.cc

Issue 2732023003: Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh. (Closed)
Patch Set: Fixes broken test Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/translate/core/language_detection/chinese_script_classifier.cc
diff --git a/components/translate/core/language_detection/chinese_script_classifier.cc b/components/translate/core/language_detection/chinese_script_classifier.cc
new file mode 100644
index 0000000000000000000000000000000000000000..816beb28366d9eaace0c9f5296aaf63661a8bcff
--- /dev/null
+++ b/components/translate/core/language_detection/chinese_script_classifier.cc
@@ -0,0 +1,118 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/core/language_detection/chinese_script_classifier.h"
+
+#include <algorithm>
+#include <cctype>
+#include <memory>
+#include <string>
+#include "base/logging.h"
+#include "base/strings/string_util.h"
+#include "third_party/icu/source/common/unicode/unistr.h"
+#include "third_party/icu/source/i18n/unicode/translit.h"
+
+namespace translate {
+
+const char ChineseScriptClassifier::kChineseSimplifiedCode[] = "zh-Hans";  // Returned by Classify() when Simplified wins, and on ties.
+const char ChineseScriptClassifier::kChineseTraditionalCode[] = "zh-Hant";  // Returned by Classify() when Traditional wins.
+
+// Creates the two ICU transliterators (Hans->Hant and Hant->Hans) that
+// Classify() relies on. If either one cannot be created, IsInitialized()
+// reports false.
+ChineseScriptClassifier::ChineseScriptClassifier() {
+  // The Transliterator IDs are defined in:
+  // third_party/icu/source/data/translit/root.txt.
+  //
+  // Chromium keeps only a subset of these, defined in:
+  // third_party/icu/source/data/translit/root_subset.txt
+  //
+  // Each creation gets its own UErrorCode. ICU functions return immediately
+  // when handed a status that already indicates failure, so sharing a single
+  // status would force the second creation to fail (and would log the first
+  // error under the second message) whenever the first creation failed.
+  const auto create_transliterator = [](const char* id) {
+    UParseError parse_status;
+    UErrorCode status = U_ZERO_ERROR;
+    icu::Transliterator* transliterator = icu::Transliterator::createInstance(
+        icu::UnicodeString(id), UTRANS_FORWARD, parse_status, status);
+    VLOG(1) << id << " Transliterator initialization status: "
+            << u_errorName(status);
+    // Ownership passes to the caller, which stores it in a member right away.
+    return transliterator;
+  };
+  hans2hant_.reset(create_transliterator("Hans-Hant"));
+  hant2hans_.reset(create_transliterator("Hant-Hans"));
+}
+
+// Reports whether both transliterators were created successfully by the
+// constructor.
+bool ChineseScriptClassifier::IsInitialized() const {
+  const bool has_hans_to_hant = hans2hant_ != nullptr;
+  const bool has_hant_to_hans = hant2hans_ != nullptr;
+  return has_hans_to_hant && has_hant_to_hans;
+}
+
+// Nothing to release by hand; members clean up after themselves.
+ChineseScriptClassifier::~ChineseScriptClassifier() = default;
+
+// Predicts whether |input| is written in Traditional or Simplified Chinese.
+//
+// The (truncated, whitespace-free) input is transliterated once to Hant and
+// once to Hans. Positions where the two results differ are script-specific;
+// at each such position the original character is credited to whichever
+// script it matches.
+//
+// Returns kChineseTraditionalCode when more Hant-only characters are found,
+// kChineseSimplifiedCode otherwise (including ties), or the empty string if
+// the transliterators could not be created.
+std::string ChineseScriptClassifier::Classify(const std::string& input) const {
+  // If there was a problem with initialization, return the empty string.
+  if (!IsInitialized()) {
+    return "";
+  }
+
+  // Operate only on first 500 bytes.
+  std::string input_subset;
+  base::TruncateUTF8ToByteSize(input, 500, &input_subset);
+
+  // Remove whitespace since transliterators may not preserve it. The filter
+  // runs on decoded UTF-16 code units rather than on raw UTF-8 bytes: a
+  // byte-wise filter cannot see multi-byte whitespace such as U+3000, and
+  // where char is unsigned it would delete the continuation bytes 0x85 and
+  // 0xA0 from valid multi-byte sequences because they compare equal to NEL
+  // and NBSP.
+  const icu::UnicodeString decoded_input =
+      icu::UnicodeString::fromUTF8(input_subset);
+  icu::UnicodeString original_input;
+  for (int32_t index = 0; index < decoded_input.length(); ++index) {
+    const auto code_unit = decoded_input.charAt(index);
+    if (!base::IsUnicodeWhitespace(code_unit)) {
+      original_input.append(code_unit);
+    }
+  }
+
+  // Two working copies are necessary because transliteration happens in place
+  // only.
+  icu::UnicodeString hant_input(original_input);
+  icu::UnicodeString hans_input(original_input);
+
+  // Get the zh-Hant version of this input.
+  hans2hant_->transliterate(hant_input);
+  // Get the zh-Hans version of this input.
+  hant2hans_->transliterate(hans_input);
+
+  // Debugging only: show the input, the Hant version, and the Hans version.
+  if (VLOG_IS_ON(1)) {
+    std::string original_string;
+    std::string hant_string;
+    std::string hans_string;
+    original_input.toUTF8String(original_string);
+    hans_input.toUTF8String(hans_string);
+    hant_input.toUTF8String(hant_string);
+    VLOG(1) << "Original input:\n" << original_string;
groby-ooo-7-16 2017/03/08 00:41:54 Are you planning to keep the VLOG forever, or is t
riesa 2017/03/08 01:47:42 Yes, I was planning to keep it in case of bug repo
groby1 2017/03/08 02:00:57 I'm torn. Each VLOG increases binary size, which a
+    VLOG(1) << "zh-Hant output:\n" << hant_string;
+    VLOG(1) << "zh-Hans output:\n" << hans_string;
+  }
+
+  // Count matches between the original input chars and the Hant and Hans
+  // versions of the input. Only the prefix common to all three strings is
+  // compared; its length is computed once, up front.
+  int hant_count = 0;
+  int hans_count = 0;
+  const int32_t compare_length =
+      std::min(original_input.length(),
+               std::min(hans_input.length(), hant_input.length()));
groby-ooo-7-16 2017/03/08 00:41:54 nit: please do compute min outside loop
riesa 2017/03/08 01:47:42 Done.
+  for (int32_t index = 0; index < compare_length; ++index) {
+    const auto original_char = original_input.charAt(index);
+    const auto hans_char = hans_input.charAt(index);
groby-ooo-7-16 2017/03/08 00:41:54 Bit concerned by the fact that charAt needs to rep
riesa 2017/03/08 01:47:42 Hm, from what I can tell charAt is just indexing r
groby1 2017/03/08 02:00:57 Gah. I misread the types to be still UTF8. Never m
+    const auto hant_char = hant_input.charAt(index);
+    if (hans_char == hant_char) {
groby-ooo-7-16 2017/03/08 00:41:54 Possible simplification (no braces, less branches,
riesa 2017/03/08 01:47:42 I originally had it this way in an earlier patch,
groby1 2017/03/08 02:00:57 I don't think there's a performance penalty worth
+      // Same character in both scripts: carries no signal.
+      continue;
+    } else if (original_char == hans_char) {
+      // Input matches a Hans-only char.
+      ++hans_count;
+    } else if (original_char == hant_char) {
+      // Input matches a Hant-only char.
+      ++hant_count;
+    }
+  }
+  VLOG(1) << "Found " << hans_count << " zh-Hans chars in input";
+  VLOG(1) << "Found " << hant_count << " zh-Hant chars in input";
+
+  if (hant_count > hans_count) {
+    return kChineseTraditionalCode;
+  }
+  // hans_count >= hant_count. A tie means either that no script-specific
+  // characters were found or that both scripts matched equally often; in both
+  // cases we fall back to Simplified.
+  return kChineseSimplifiedCode;
+}
+
+} // namespace translate

Powered by Google App Engine
This is Rietveld 408576698