| OLD | NEW |
| (Empty) | |
| 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "components/translate/core/language_detection/chinese_script_classifier
.h" |
| 6 |
| 7 #include <algorithm> |
| 8 #include <memory> |
| 9 #include <string> |
| 10 #include "base/logging.h" |
| 11 #include "base/strings/string_util.h" |
| 12 #include "third_party/icu/source/common/unicode/unistr.h" |
| 13 #include "third_party/icu/source/i18n/unicode/translit.h" |
| 14 |
| 15 namespace translate { |
| 16 |
namespace {

// Language codes (BCP 47) handed back to callers by the classifier.
// "zh-Hans": Chinese written in the Han Simplified script.
constexpr char kChineseSimplifiedCode[] = "zh-Hans";
// "zh-Hant": Chinese written in the Han Traditional script.
constexpr char kChineseTraditionalCode[] = "zh-Hant";

}  // namespace
| 24 |
| 25 ChineseScriptClassifier::ChineseScriptClassifier() { |
| 26 UParseError parse_status; |
| 27 UErrorCode status = U_ZERO_ERROR; |
| 28 // The Transliterator IDs are defined in: |
| 29 // third_party/icu/source/data/translit/root.txt. |
| 30 // |
| 31 // Chromium keeps only a subset of these, defined in: |
| 32 // third_party/icu/source/data/translit/root_subset.txt |
| 33 hans2hant_.reset(icu::Transliterator::createInstance( |
| 34 icu::UnicodeString("Hans-Hant"), UTRANS_FORWARD, parse_status, status)); |
| 35 DVLOG(1) << "Hans-Hant Transliterator initialization status: " |
| 36 << u_errorName(status); |
| 37 hant2hans_.reset(icu::Transliterator::createInstance( |
| 38 icu::UnicodeString("Hant-Hans"), UTRANS_FORWARD, parse_status, status)); |
| 39 DVLOG(1) << "Hant-Hans Transliterator initialization status: " |
| 40 << u_errorName(status); |
| 41 } |
| 42 |
| 43 bool ChineseScriptClassifier::IsInitialized() const { |
| 44 return hans2hant_ && hant2hans_; |
| 45 } |
| 46 |
| 47 ChineseScriptClassifier::~ChineseScriptClassifier() {} |
| 48 |
| 49 std::string ChineseScriptClassifier::Classify(const std::string& input) const { |
| 50 // If there was a problem with initialization, return the empty string. |
| 51 if (!IsInitialized()) { |
| 52 return ""; |
| 53 } |
| 54 |
| 55 // Operate only on first 500 bytes. |
| 56 std::string input_subset; |
| 57 base::TruncateUTF8ToByteSize(input, 500, &input_subset); |
| 58 |
| 59 // Remove whitespace since transliterators may not preserve it. |
| 60 input_subset.erase(std::remove_if(input_subset.begin(), input_subset.end(), |
| 61 base::IsUnicodeWhitespace), |
| 62 input_subset.end()); |
| 63 |
| 64 // Convert two copies of the input to icu::UnicodeString. Two copies are |
| 65 // necessary because transliteration happens in place only. |
| 66 icu::UnicodeString original_input = |
| 67 icu::UnicodeString::fromUTF8(input_subset); |
| 68 icu::UnicodeString hant_input = icu::UnicodeString::fromUTF8(input_subset); |
| 69 icu::UnicodeString hans_input = icu::UnicodeString::fromUTF8(input_subset); |
| 70 |
| 71 // Get the zh-Hant version of this input. |
| 72 hans2hant_->transliterate(hant_input); |
| 73 // Get the zh-Hans version of this input. |
| 74 hant2hans_->transliterate(hans_input); |
| 75 |
| 76 // Debugging only: show the input, the Hant version, and the Hans version. |
| 77 if (VLOG_IS_ON(1)) { |
| 78 std::string hant_string; |
| 79 std::string hans_string; |
| 80 hans_input.toUTF8String(hans_string); |
| 81 hant_input.toUTF8String(hant_string); |
| 82 DVLOG(1) << "Original input:\n" << input_subset; |
| 83 DVLOG(1) << "zh-Hant output:\n" << hant_string; |
| 84 DVLOG(1) << "zh-Hans output:\n" << hans_string; |
| 85 } |
| 86 |
| 87 // Count matches between the original input chars and the Hant and Hans |
| 88 // versions of the input. |
| 89 int hant_count = 0; |
| 90 int hans_count = 0; |
| 91 |
| 92 // Iterate over all chars in the original input and compute matches between |
| 93 // the Hant version and the Hans version. |
| 94 // |
| 95 // All segments (original, Hant, and Hans) should have the same length, but |
| 96 // in case of some corner case or bug in which they turn out not to be, |
| 97 // we compute the minimum length we are allowed to traverse. |
| 98 const int min_length = |
| 99 std::min(original_input.length(), |
| 100 std::min(hans_input.length(), hant_input.length())); |
| 101 for (int index = 0; index < min_length; ++index) { |
| 102 const auto original_char = original_input.charAt(index); |
| 103 const auto hans_char = hans_input.charAt(index); |
| 104 const auto hant_char = hant_input.charAt(index); |
| 105 if (hans_char == hant_char) { |
| 106 continue; |
| 107 } else if (original_char == hans_char) { |
| 108 // Hans-specific char found. |
| 109 ++hans_count; |
| 110 } else if (original_char == hant_char) { |
| 111 // Hant-specific char found. |
| 112 ++hant_count; |
| 113 } |
| 114 } |
| 115 DVLOG(1) << "Found " << hans_count << " zh-Hans chars in input"; |
| 116 DVLOG(1) << "Found " << hant_count << " zh-Hant chars in input"; |
| 117 |
| 118 if (hant_count > hans_count) { |
| 119 return kChineseTraditionalCode; |
| 120 } else if (hans_count > hant_count) { |
| 121 return kChineseSimplifiedCode; |
| 122 } else { // hans_count == hant_count |
| 123 // All characters are the same in both scripts. In this case, we return the |
| 124 // following code. |
| 125 return kChineseSimplifiedCode; |
| 126 } |
| 127 } |
| 128 |
| 129 } // namespace translate |
| OLD | NEW |