components/translate/core/language_detection/chinese_script_classifier.cc - Issue 2756313002: [Merge M-58] Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh.

Side by Side Diff: components/translate/core/language_detection/chinese_script_classifier.cc

Issue 2756313002: [Merge M-58] Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh. (Closed)

Patch Set: Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « components/translate/core/language_detection/chinese_script_classifier.h ('k') | components/translate/core/language_detection/chinese_script_classifier_test.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2017 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "components/translate/core/language_detection/chinese_script_classifier.h"

	6

	7 #include <algorithm>

	8 #include <memory>

	9 #include <string>

	10 #include "base/logging.h"

	11 #include "base/strings/string_util.h"

	12 #include "third_party/icu/source/common/unicode/unistr.h"

	13 #include "third_party/icu/source/i18n/unicode/translit.h"

	14

	15 namespace translate {

	16

	namespace {

	// Language code returned for text judged to be written in Simplified Han
	// script (BCP 47).
	constexpr char kChineseSimplifiedCode[] = "zh-Hans";

	// Language code returned for text judged to be written in Traditional Han
	// script (BCP 47).
	constexpr char kChineseTraditionalCode[] = "zh-Hant";

	}  // namespace

	24

	25 ChineseScriptClassifier::ChineseScriptClassifier() {

	26 UParseError parse_status;

	27 UErrorCode status = U_ZERO_ERROR;

	28 // The Transliterator IDs are defined in:

	29 // third_party/icu/source/data/translit/root.txt.

	30 //

	31 // Chromium keeps only a subset of these, defined in:

	32 // third_party/icu/source/data/translit/root_subset.txt

	33 hans2hant_.reset(icu::Transliterator::createInstance(

	34 icu::UnicodeString("Hans-Hant"), UTRANS_FORWARD, parse_status, status));

	35 DVLOG(1) << "Hans-Hant Transliterator initialization status: "

	36 << u_errorName(status);

	37 hant2hans_.reset(icu::Transliterator::createInstance(

	38 icu::UnicodeString("Hant-Hans"), UTRANS_FORWARD, parse_status, status));

	39 DVLOG(1) << "Hant-Hans Transliterator initialization status: "

	40 << u_errorName(status);

	41 }

	42

	43 bool ChineseScriptClassifier::IsInitialized() const {

	44 return hans2hant_ && hant2hans_;

	45 }

	46

	47 ChineseScriptClassifier::~ChineseScriptClassifier() {}

	48

	49 std::string ChineseScriptClassifier::Classify(const std::string& input) const {

	50 // If there was a problem with initialization, return the empty string.

	51 if (!IsInitialized()) {

	52 return "";

	53 }

	54

	55 // Operate only on first 500 bytes.

	56 std::string input_subset;

	57 base::TruncateUTF8ToByteSize(input, 500, &input_subset);

	58

	59 // Remove whitespace since transliterators may not preserve it.

	60 input_subset.erase(std::remove_if(input_subset.begin(), input_subset.end(),

	61 base::IsUnicodeWhitespace),

	62 input_subset.end());

	63

	64 // Convert two copies of the input to icu::UnicodeString. Two copies are

	65 // necessary because transliteration happens in place only.

	66 icu::UnicodeString original_input =

	67 icu::UnicodeString::fromUTF8(input_subset);

	68 icu::UnicodeString hant_input = icu::UnicodeString::fromUTF8(input_subset);

	69 icu::UnicodeString hans_input = icu::UnicodeString::fromUTF8(input_subset);

	70

	71 // Get the zh-Hant version of this input.

	72 hans2hant_->transliterate(hant_input);

	73 // Get the zh-Hans version of this input.

	74 hant2hans_->transliterate(hans_input);

	75

	76 // Debugging only: show the input, the Hant version, and the Hans version.

	77 if (VLOG_IS_ON(1)) {

	78 std::string hant_string;

	79 std::string hans_string;

	80 hans_input.toUTF8String(hans_string);

	81 hant_input.toUTF8String(hant_string);

	82 DVLOG(1) << "Original input:\n" << input_subset;

	83 DVLOG(1) << "zh-Hant output:\n" << hant_string;

	84 DVLOG(1) << "zh-Hans output:\n" << hans_string;

	85 }

	86

	87 // Count matches between the original input chars and the Hant and Hans

	88 // versions of the input.

	89 int hant_count = 0;

	90 int hans_count = 0;

	91

	92 // Iterate over all chars in the original input and compute matches between

	93 // the Hant version and the Hans version.

	94 //

	95 // All segments (original, Hant, and Hans) should have the same length, but

	96 // in case of some corner case or bug in which they turn out not to be,

	97 // we compute the minimum length we are allowed to traverse.

	98 const int min_length =

	99 std::min(original_input.length(),

	100 std::min(hans_input.length(), hant_input.length()));

	101 for (int index = 0; index < min_length; ++index) {

	102 const auto original_char = original_input.charAt(index);

	103 const auto hans_char = hans_input.charAt(index);

	104 const auto hant_char = hant_input.charAt(index);

	105 if (hans_char == hant_char) {

	106 continue;

	107 } else if (original_char == hans_char) {

	108 // Hans-specific char found.

	109 ++hans_count;

	110 } else if (original_char == hant_char) {

	111 // Hant-specific char found.

	112 ++hant_count;

	113 }

	114 }

	115 DVLOG(1) << "Found " << hans_count << " zh-Hans chars in input";

	116 DVLOG(1) << "Found " << hant_count << " zh-Hant chars in input";

	117

	118 if (hant_count > hans_count) {

	119 return kChineseTraditionalCode;

	120 } else if (hans_count > hant_count) {

	121 return kChineseSimplifiedCode;

	122 } else { // hans_count == hant_count

	123 // All characters are the same in both scripts. In this case, we return the

	124 // following code.

	125 return kChineseSimplifiedCode;

	126 }

	127 }

	128

	129 } // namespace translate

OLD	NEW