components/translate/core/language_detection/chinese_script_classifier.cc - Issue 2732023003: Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh.

Unified Diff: components/translate/core/language_detection/chinese_script_classifier.cc

Issue 2732023003: Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh. (Closed)

Patch Set: Fixes broken test. Created 3 years, 9 months ago.

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« components/translate/core/language_detection/chinese_script_classifier.h ('K') | « components/translate/core/language_detection/chinese_script_classifier.h ('k') | components/translate/core/language_detection/chinese_script_classifier_test.cc » ('j') | components/translate/core/language_detection/language_detection_util.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: components/translate/core/language_detection/chinese_script_classifier.cc

diff --git a/components/translate/core/language_detection/chinese_script_classifier.cc b/components/translate/core/language_detection/chinese_script_classifier.cc

new file mode 100644

index 0000000000000000000000000000000000000000..816beb28366d9eaace0c9f5296aaf63661a8bcff

--- /dev/null

+++ b/components/translate/core/language_detection/chinese_script_classifier.cc

@@ -0,0 +1,118 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "components/translate/core/language_detection/chinese_script_classifier.h"

+#include <algorithm>

+#include <cctype>

+#include <memory>

+#include <string>

+#include "base/logging.h"

+#include "base/strings/string_util.h"

+#include "third_party/icu/source/common/unicode/unistr.h"

+#include "third_party/icu/source/i18n/unicode/translit.h"

+namespace translate {

+const char ChineseScriptClassifier::kChineseSimplifiedCode[] = "zh-Hans";

+const char ChineseScriptClassifier::kChineseTraditionalCode[] = "zh-Hant";

+ChineseScriptClassifier::ChineseScriptClassifier() {

+ UParseError parse_status;

+ UErrorCode status = U_ZERO_ERROR;

+ // The Transliterator IDs are defined in:

+ // third_party/icu/source/data/translit/root.txt.

+ //

+ // Chromium keeps only a subset of these, defined in:

+ // third_party/icu/source/data/translit/root_subset.txt

+ hans2hant_.reset(icu::Transliterator::createInstance(

+ icu::UnicodeString("Hans-Hant"), UTRANS_FORWARD, parse_status, status));

+ VLOG(1) << "Hans-Hant Transliterator initialization status: "

+ << u_errorName(status);

+ hant2hans_.reset(icu::Transliterator::createInstance(

+ icu::UnicodeString("Hant-Hans"), UTRANS_FORWARD, parse_status, status));

+ VLOG(1) << "Hant-Hans Transliterator initialization status: "

+ << u_errorName(status);

+bool ChineseScriptClassifier::IsInitialized() const {

+ return hans2hant_ && hant2hans_;

+ChineseScriptClassifier::~ChineseScriptClassifier() {}

+std::string ChineseScriptClassifier::Classify(const std::string& input) const {

+ // If there was a problem with initialization, return the empty string.

+ if (!IsInitialized()) {

+ return "";

+ }

+ // Operate only on first 500 bytes.

+ std::string input_subset;

+ base::TruncateUTF8ToByteSize(input, 500, &input_subset);

+ // Remove whitespace since transliterators may not preserve it.

+ input_subset.erase(std::remove_if(input_subset.begin(), input_subset.end(),

+ base::IsUnicodeWhitespace),

+ input_subset.end());

+ // Convert the input to icu::UnicodeString three times: an untouched

+ // original plus one working copy per transliterator, since transliteration

+ // happens in place only.

+ icu::UnicodeString original_input =

+ icu::UnicodeString::fromUTF8(input_subset);

+ icu::UnicodeString hant_input = icu::UnicodeString::fromUTF8(input_subset);

+ icu::UnicodeString hans_input = icu::UnicodeString::fromUTF8(input_subset);

+ // Get the zh-Hant version of this input.

+ hans2hant_->transliterate(hant_input);

+ // Get the zh-Hans version of this input.

+ hant2hans_->transliterate(hans_input);

+ // Debugging only: show the input, the Hant version, and the Hans version.

+ if (VLOG_IS_ON(1)) {

+ std::string hant_string;

+ std::string hans_string;

+ hans_input.toUTF8String(hans_string);

+ hant_input.toUTF8String(hant_string);

+ VLOG(1) << "Original input:\n" << input_subset;

groby-ooo-7-16 2017/03/08 00:41:54 Are you planning to keep the VLOG forever, or is t

riesa 2017/03/08 01:47:42 Yes, I was planning to keep it in case of bug repo

groby1 2017/03/08 02:00:57 I'm torn. Each VLOG increases binary size, which a

+ VLOG(1) << "zh-Hant output:\n" << hant_string;

+ VLOG(1) << "zh-Hans output:\n" << hans_string;

+ }

+ // Count the positions where the original char matches the Hans version or

+ // the Hant version of the input. Positions where the two versions agree

+ // carry no signal and are skipped.

+ int hant_count = 0;

+ int hans_count = 0;

+ for (int index = 0;

+ index < original_input.length() && index < hans_input.length() &&

groby-ooo-7-16 2017/03/08 00:41:54 nit: please do compute min outside loop

riesa 2017/03/08 01:47:42 Done.

+ index < hant_input.length();

+ ++index) {

+ const auto original_char = original_input.charAt(index);

+ const auto hans_char = hans_input.charAt(index);

groby-ooo-7-16 2017/03/08 00:41:54 Bit concerned by the fact that charAt needs to rep

riesa 2017/03/08 01:47:42 Hm, from what I can tell charAt is just indexing r

groby1 2017/03/08 02:00:57 Gah. I misread the types to be still UTF8. Never m

+ const auto hant_char = hant_input.charAt(index);

+ if (hans_char == hant_char) {

groby-ooo-7-16 2017/03/08 00:41:54 Possible simplification (no braces, less branches,

riesa 2017/03/08 01:47:42 I originally had it this way in an earlier patch,

groby1 2017/03/08 02:00:57 I don't think there's a performance penalty worth

+ continue;

+ } else if (original_char == hans_char) {

+ // Input matches a Hans-only char.

+ ++hans_count;

+ } else if (original_char == hant_char) {

+ // Input matches a Hant-only char.

+ ++hant_count;

+ }

+ VLOG(1) << "Found " << hans_count << " zh-Hans chars in input";

+ VLOG(1) << "Found " << hant_count << " zh-Hant chars in input";

+ if (hant_count > hans_count) {

+ return kChineseTraditionalCode;

+ } else if (hans_count > hant_count) {

+ return kChineseSimplifiedCode;

+ } else { // hans_count == hant_count

+ // The script-specific counts tie (for example, every character is shared

+ // by both scripts). In this case, default to Simplified Chinese.

+ return kChineseSimplifiedCode;

+ }

+} // namespace translate