components/translate/core/language_detection/chinese_script_classifier.cc - Issue 2756313002: [Merge M-58] Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh.

Unified Diff: components/translate/core/language_detection/chinese_script_classifier.cc

Issue 2756313002: [Merge M-58] Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh. (Closed)

Patch Set: Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « components/translate/core/language_detection/chinese_script_classifier.h ('k') | components/translate/core/language_detection/chinese_script_classifier_test.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: components/translate/core/language_detection/chinese_script_classifier.cc

diff --git a/components/translate/core/language_detection/chinese_script_classifier.cc b/components/translate/core/language_detection/chinese_script_classifier.cc

new file mode 100644

index 0000000000000000000000000000000000000000..f37cfe475e2f4f51f1d5526da0e06bcbc3b8c166

--- /dev/null

+++ b/components/translate/core/language_detection/chinese_script_classifier.cc

@@ -0,0 +1,129 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "components/translate/core/language_detection/chinese_script_classifier.h"

+#include <algorithm>

+#include <memory>

+#include <string>

+#include "base/logging.h"

+#include "base/strings/string_util.h"

+#include "third_party/icu/source/common/unicode/unistr.h"

+#include "third_party/icu/source/i18n/unicode/translit.h"

+namespace translate {

+namespace {

+// BCP 47 language code representing Chinese in Han Simplified script.

+const char kChineseSimplifiedCode[] = "zh-Hans";

+// BCP 47 language code representing Chinese in Han Traditional script.

+const char kChineseTraditionalCode[] = "zh-Hant";

+} // namespace

+ChineseScriptClassifier::ChineseScriptClassifier() {

+ UParseError parse_status;

+ UErrorCode status = U_ZERO_ERROR;

+ // The Transliterator IDs are defined in:

+ // third_party/icu/source/data/translit/root.txt.

+ //

+ // Chromium keeps only a subset of these, defined in:

+ // third_party/icu/source/data/translit/root_subset.txt

+ hans2hant_.reset(icu::Transliterator::createInstance(

+ icu::UnicodeString("Hans-Hant"), UTRANS_FORWARD, parse_status, status));

+ DVLOG(1) << "Hans-Hant Transliterator initialization status: "

+ << u_errorName(status);

+ hant2hans_.reset(icu::Transliterator::createInstance(

+ icu::UnicodeString("Hant-Hans"), UTRANS_FORWARD, parse_status, status));

+ DVLOG(1) << "Hant-Hans Transliterator initialization status: "

+ << u_errorName(status);

+bool ChineseScriptClassifier::IsInitialized() const {

+ return hans2hant_ && hant2hans_;

+ChineseScriptClassifier::~ChineseScriptClassifier() {}

+std::string ChineseScriptClassifier::Classify(const std::string& input) const {

+ // If there was a problem with initialization, return the empty string.

+ if (!IsInitialized()) {

+ return "";

+ }

+ // Operate only on first 500 bytes.

+ std::string input_subset;

+ base::TruncateUTF8ToByteSize(input, 500, &input_subset);

+ // Remove whitespace since transliterators may not preserve it.

+ input_subset.erase(std::remove_if(input_subset.begin(), input_subset.end(),

+ base::IsUnicodeWhitespace),

+ input_subset.end());

+ // Convert two copies of the input to icu::UnicodeString. Two copies are

+ // necessary because transliteration happens in place only.

+ icu::UnicodeString original_input =

+ icu::UnicodeString::fromUTF8(input_subset);

+ icu::UnicodeString hant_input = icu::UnicodeString::fromUTF8(input_subset);

+ icu::UnicodeString hans_input = icu::UnicodeString::fromUTF8(input_subset);

+ // Get the zh-Hant version of this input.

+ hans2hant_->transliterate(hant_input);

+ // Get the zh-Hans version of this input.

+ hant2hans_->transliterate(hans_input);

+ // Debugging only: show the input, the Hant version, and the Hans version.

+ if (VLOG_IS_ON(1)) {

+ std::string hant_string;

+ std::string hans_string;

+ hans_input.toUTF8String(hans_string);

+ hant_input.toUTF8String(hant_string);

+ DVLOG(1) << "Original input:\n" << input_subset;

+ DVLOG(1) << "zh-Hant output:\n" << hant_string;

+ DVLOG(1) << "zh-Hans output:\n" << hans_string;

+ }

+ // Count matches between the original input chars and the Hant and Hans

+ // versions of the input.

+ int hant_count = 0;

+ int hans_count = 0;

+ // Iterate over all chars in the original input and compute matches between

+ // the Hant version and the Hans version.

+ //

+ // All segments (original, Hant, and Hans) should have the same length, but

+ // in case of some corner case or bug in which they turn out not to be,

+ // we compute the minimum length we are allowed to traverse.

+ const int min_length =

+ std::min(original_input.length(),

+ std::min(hans_input.length(), hant_input.length()));

+ for (int index = 0; index < min_length; ++index) {

+ const auto original_char = original_input.charAt(index);

+ const auto hans_char = hans_input.charAt(index);

+ const auto hant_char = hant_input.charAt(index);

+ if (hans_char == hant_char) {

+ continue;

+ } else if (original_char == hans_char) {

+ // Hans-specific char found.

+ ++hans_count;

+ } else if (original_char == hant_char) {

+ // Hant-specific char found.

+ ++hant_count;

+ }

+ DVLOG(1) << "Found " << hans_count << " zh-Hans chars in input";

+ DVLOG(1) << "Found " << hant_count << " zh-Hant chars in input";

+ if (hant_count > hans_count) {

+ return kChineseTraditionalCode;

+ } else if (hans_count > hant_count) {

+ return kChineseSimplifiedCode;

+ } else { // hans_count == hant_count

+ // All characters are the same in both scripts. In this case, we return the

+ // following code.

+ return kChineseSimplifiedCode;

+ }

+} // namespace translate