Chromium Code Reviews| Index: components/translate/core/language_detection/chinese_script_classifier.cc |
| diff --git a/components/translate/core/language_detection/chinese_script_classifier.cc b/components/translate/core/language_detection/chinese_script_classifier.cc |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..816beb28366d9eaace0c9f5296aaf63661a8bcff |
| --- /dev/null |
| +++ b/components/translate/core/language_detection/chinese_script_classifier.cc |
| @@ -0,0 +1,118 @@ |
| +// Copyright 2017 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "components/translate/core/language_detection/chinese_script_classifier.h" |
| + |
| +#include <algorithm> |
| +#include <cctype> |
| +#include <memory> |
| +#include <string> |
| +#include "base/logging.h" |
| +#include "base/strings/string_util.h" |
| +#include "third_party/icu/source/common/unicode/unistr.h" |
| +#include "third_party/icu/source/i18n/unicode/translit.h" |
| + |
| +namespace translate { |
| + |
| +// Language codes returned by Classify(): one for text judged to be written in |
| +// Simplified Chinese (Hans) and one for Traditional Chinese (Hant). |
| +const char ChineseScriptClassifier::kChineseSimplifiedCode[] = "zh-Hans"; |
| +const char ChineseScriptClassifier::kChineseTraditionalCode[] = "zh-Hant"; |
| + |
| +// Creates the Hans->Hant and Hant->Hans ICU transliterators used by |
| +// Classify(). Whatever createInstance() returns is stored as-is; callers must |
| +// check IsInitialized() to learn whether both transliterators are available. |
| +ChineseScriptClassifier::ChineseScriptClassifier() { |
| + UParseError parse_status; |
| + // The Transliterator IDs are defined in: |
| + // third_party/icu/source/data/translit/root.txt. |
| + // |
| + // Chromium keeps only a subset of these, defined in: |
| + // third_party/icu/source/data/translit/root_subset.txt |
| + // |
| + // Each creation gets its own UErrorCode. ICU functions return immediately |
| + // when handed a status that already indicates failure, so sharing a single |
| + // status would make the second creation fail (and log the first call's |
| + // error) whenever the first creation failed. |
| + UErrorCode hans2hant_status = U_ZERO_ERROR; |
| + hans2hant_.reset(icu::Transliterator::createInstance( |
| + icu::UnicodeString("Hans-Hant"), UTRANS_FORWARD, parse_status, |
| + hans2hant_status)); |
| + VLOG(1) << "Hans-Hant Transliterator initialization status: " |
| + << u_errorName(hans2hant_status); |
| + |
| + UErrorCode hant2hans_status = U_ZERO_ERROR; |
| + hant2hans_.reset(icu::Transliterator::createInstance( |
| + icu::UnicodeString("Hant-Hans"), UTRANS_FORWARD, parse_status, |
| + hant2hans_status)); |
| + VLOG(1) << "Hant-Hans Transliterator initialization status: " |
| + << u_errorName(hant2hans_status); |
| +} |
| + |
| +// Reports whether both transliterators were created by the constructor. |
| +// Classify() returns the empty string when this is false. |
| +bool ChineseScriptClassifier::IsInitialized() const { |
| + if (!hans2hant_) { |
| + return false; |
| + } |
| + if (!hant2hans_) { |
| + return false; |
| + } |
| + return true; |
| +} |
| + |
| +// Nothing to release by hand; members clean up after themselves. |
| +ChineseScriptClassifier::~ChineseScriptClassifier() = default; |
| + |
| +// Guesses whether |input| is written in Simplified or Traditional Chinese. |
| +// |
| +// The first 500 bytes of |input| (truncated on a UTF-8 boundary) are stripped |
| +// of whitespace and transliterated both to Hant and to Hans. Positions where |
| +// the two transliterations agree carry no signal and are skipped; at every |
| +// other position the original character is counted as Hans if it equals the |
| +// Hans output, or as Hant if it equals the Hant output. |
| +// |
| +// Returns kChineseTraditionalCode when the Hant count is strictly larger, |
| +// kChineseSimplifiedCode otherwise (including ties), or the empty string if |
| +// the transliterators could not be created. |
| +std::string ChineseScriptClassifier::Classify(const std::string& input) const { |
| + // If there was a problem with initialization, return the empty string. |
| + if (!IsInitialized()) { |
| + return ""; |
| + } |
| + |
| + // Operate only on first 500 bytes. |
| + std::string input_subset; |
| + base::TruncateUTF8ToByteSize(input, 500, &input_subset); |
| + |
| + // Remove whitespace since transliterators may not preserve it. This is done |
| + // on the decoded UTF-16 text rather than on the raw UTF-8 bytes: |
| + // base::IsUnicodeWhitespace classifies a single character value, so feeding |
| + // it individual bytes can never recognize multi-byte whitespace such as |
| + // U+3000 IDEOGRAPHIC SPACE, and may mistake one byte of a multi-byte |
| + // character (e.g. 0x85 or 0xA0 where plain char is unsigned) for whitespace |
| + // and delete it, corrupting the text. |
| + const icu::UnicodeString decoded_input = |
| + icu::UnicodeString::fromUTF8(input_subset); |
| + icu::UnicodeString original_input; |
| + for (int index = 0; index < decoded_input.length(); ++index) { |
| + const auto code_unit = decoded_input.charAt(index); |
| + if (!base::IsUnicodeWhitespace(code_unit)) { |
| + original_input.append(code_unit); |
| + } |
| + } |
| + |
| + // Transliteration happens in place only, so each transliterator gets its |
| + // own copy of the whitespace-free input. |
| + icu::UnicodeString hant_input(original_input); |
| + icu::UnicodeString hans_input(original_input); |
| + |
| + // Get the zh-Hant version of this input. |
| + hans2hant_->transliterate(hant_input); |
| + // Get the zh-Hans version of this input. |
| + hant2hans_->transliterate(hans_input); |
| + |
| + // Debugging only: show the input, the Hant version, and the Hans version. |
| + if (VLOG_IS_ON(1)) { |
| + std::string hant_string; |
| + std::string hans_string; |
| + hans_input.toUTF8String(hans_string); |
| + hant_input.toUTF8String(hant_string); |
| + VLOG(1) << "Original input:\n" << input_subset; |

groby-ooo-7-16
2017/03/08 00:41:54
Are you planning to keep the VLOG forever, or is t
riesa
2017/03/08 01:47:42
Yes, I was planning to keep it in case of bug repo
groby1
2017/03/08 02:00:57
I'm torn. Each VLOG increases binary size, which a
|
| + VLOG(1) << "zh-Hant output:\n" << hant_string; |
| + VLOG(1) << "zh-Hans output:\n" << hans_string; |
| + } |
| + |
| + // Count matches between the original input chars and the Hant and Hans |
| + // versions of the input. |
| + int hant_count = 0; |
| + int hans_count = 0; |
| + for (int index = 0; |
| + index < original_input.length() && index < hans_input.length() && |

groby-ooo-7-16
2017/03/08 00:41:54
nit: please do compute min outside loop
riesa
2017/03/08 01:47:42
Done.
|
| + index < hant_input.length(); |
| + ++index) { |
| + const auto original_char = original_input.charAt(index); |
| + const auto hans_char = hans_input.charAt(index); |

groby-ooo-7-16
2017/03/08 00:41:54
Bit concerned by the fact that charAt needs to rep
riesa
2017/03/08 01:47:42
Hm, from what I can tell charAt is just indexing r
groby1
2017/03/08 02:00:57
Gah. I misread the types to be still UTF8. Never m
|
| + const auto hant_char = hant_input.charAt(index); |
| + if (hans_char == hant_char) { |

groby-ooo-7-16
2017/03/08 00:41:54
Possible simplification (no braces, less branches,
riesa
2017/03/08 01:47:42
I originally had it this way in an earlier patch,
groby1
2017/03/08 02:00:57
I don't think there's a performance penalty worth
|
| + continue; |
| + } else if (original_char == hans_char) { |
| + // Input matches a Hans-only char. |
| + ++hans_count; |
| + } else if (original_char == hant_char) { |
| + // Input matches a Hant-only char. |
| + ++hant_count; |
| + } |
| + } |
| + VLOG(1) << "Found " << hans_count << " zh-Hans chars in input"; |
| + VLOG(1) << "Found " << hant_count << " zh-Hant chars in input"; |
| + |
| + if (hant_count > hans_count) { |
| + return kChineseTraditionalCode; |
| + } else if (hans_count > hant_count) { |
| + return kChineseSimplifiedCode; |
| + } else { // hans_count == hant_count |
| + // All characters are the same in both scripts. In this case, we return the |
| + // following code. |
| + return kChineseSimplifiedCode; |
| + } |
| +} |
| + |
| +} // namespace translate |