Chromium Code Reviews

| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2017 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "components/translate/core/language_detection/chinese_script_classifier.h" | |
| 6 | |
| 7 #include <algorithm> | |
| 8 #include <cctype> | |
| 9 #include <memory> | |
| 10 #include <string> | |
| 11 #include "base/logging.h" | |
| 12 #include "base/strings/string_util.h" | |
| 13 #include "third_party/icu/source/common/unicode/unistr.h" | |
| 14 #include "third_party/icu/source/i18n/unicode/translit.h" | |
| 15 | |
| 16 namespace translate { | |
| 17 | |
| 18 const char ChineseScriptClassifier::kChineseSimplifiedCode[] = "zh-Hans"; | |
| 19 const char ChineseScriptClassifier::kChineseTraditionalCode[] = "zh-Hant"; | |
| 20 | |
| 21 ChineseScriptClassifier::ChineseScriptClassifier() { | |
| 22 UParseError parse_status; | |
| 23 UErrorCode status = U_ZERO_ERROR; | |
| 24 // The Transliterator IDs are defined in: | |
| 25 // third_party/icu/source/data/translit/root.txt. | |
| 26 // | |
| 27 // Chromium keeps only a subset of these, defined in: | |
| 28 // third_party/icu/source/data/translit/root_subset.txt | |
| 29 hans2hant_.reset(icu::Transliterator::createInstance( | |
| 30 icu::UnicodeString("Hans-Hant"), UTRANS_FORWARD, parse_status, status)); | |
| 31 VLOG(1) << "Hans-Hant Transliterator initialization status: " | |
| 32 << u_errorName(status); | |
| 33 hant2hans_.reset(icu::Transliterator::createInstance( | |
| 34 icu::UnicodeString("Hant-Hans"), UTRANS_FORWARD, parse_status, status)); | |
| 35 VLOG(1) << "Hant-Hans Transliterator initialization status: " | |
| 36 << u_errorName(status); | |
| 37 } | |
| 38 | |
| 39 bool ChineseScriptClassifier::IsInitialized() const { | |
| 40 return hans2hant_ && hant2hans_; | |
| 41 } | |
| 42 | |
| 43 ChineseScriptClassifier::~ChineseScriptClassifier() {} | |
| 44 | |
| 45 std::string ChineseScriptClassifier::Classify(const std::string& input) const { | |
| 46 // If there was a problem with initialization, return the empty string. | |
| 47 if (!IsInitialized()) { | |
| 48 return ""; | |
| 49 } | |
| 50 | |
| 51 // Operate only on first 500 bytes. | |
| 52 std::string input_subset; | |
| 53 base::TruncateUTF8ToByteSize(input, 500, &input_subset); | |
| 54 | |
| 55 // Remove whitespace since transliterators may not preserve it. | |
| 56 input_subset.erase(std::remove_if(input_subset.begin(), input_subset.end(), | |
| 57 base::IsUnicodeWhitespace), | |
| 58 input_subset.end()); | |
| 59 | |
| 60 // Convert two copies of the input to icu::UnicodeString. Two copies are | |
| 61 // necessary because transliteration happens in place only. | |
| 62 icu::UnicodeString original_input = | |
| 63 icu::UnicodeString::fromUTF8(input_subset); | |
| 64 icu::UnicodeString hant_input = icu::UnicodeString::fromUTF8(input_subset); | |
| 65 icu::UnicodeString hans_input = icu::UnicodeString::fromUTF8(input_subset); | |
| 66 | |
| 67 // Get the zh-Hant version of this input. | |
| 68 hans2hant_->transliterate(hant_input); | |
| 69 // Get the zh-Hans version of this input. | |
| 70 hant2hans_->transliterate(hans_input); | |
| 71 | |
| 72 // Debugging only: show the input, the Hant version, and the Hans version. | |
| 73 if (VLOG_IS_ON(1)) { | |
| 74 std::string hant_string; | |
| 75 std::string hans_string; | |
| 76 hans_input.toUTF8String(hans_string); | |
| 77 hant_input.toUTF8String(hant_string); | |
| 78 VLOG(1) << "Original input:\n" << input_subset; | |
|
groby-ooo-7-16
2017/03/08 00:41:54
Are you planning to keep the VLOG forever, or is t
riesa
2017/03/08 01:47:42
Yes, I was planning to keep it in case of bug repo
groby1
2017/03/08 02:00:57
I'm torn. Each VLOG increases binary size, which a
| |
| 79 VLOG(1) << "zh-Hant output:\n" << hant_string; | |
| 80 VLOG(1) << "zh-Hans output:\n" << hans_string; | |
| 81 } | |
| 82 | |
| 83 // Count matches between the original input chars and the Hant and Hans | |
| 84 // versions of the input. | |
| 85 int hant_count = 0; | |
| 86 int hans_count = 0; | |
| 87 for (int index = 0; | |
| 88 index < original_input.length() && index < hans_input.length() && | |
|
groby-ooo-7-16
2017/03/08 00:41:54
nit: please do compute min outside loop
riesa
2017/03/08 01:47:42
Done.
| |
| 89 index < hant_input.length(); | |
| 90 ++index) { | |
| 91 const auto original_char = original_input.charAt(index); | |
| 92 const auto hans_char = hans_input.charAt(index); | |
|
groby-ooo-7-16
2017/03/08 00:41:54
Bit concerned by the fact that charAt needs to rep
riesa
2017/03/08 01:47:42
Hm, from what I can tell charAt is just indexing r
groby1
2017/03/08 02:00:57
Gah. I misread the types to be still UTF8. Never m
| |
| 93 const auto hant_char = hant_input.charAt(index); | |
| 94 if (hans_char == hant_char) { | |
|
groby-ooo-7-16
2017/03/08 00:41:54
Possible simplification (no braces, less branches,
riesa
2017/03/08 01:47:42
I originally had it this way in an earlier patch,
groby1
2017/03/08 02:00:57
I don't think there's a performance penalty worth
| |
| 95 continue; | |
| 96 } else if (original_char == hans_char) { | |
| 97 // Input matches a Hans-only char. | |
| 98 ++hans_count; | |
| 99 } else if (original_char == hant_char) { | |
| 100 // Input matches a Hant-only char. | |
| 101 ++hant_count; | |
| 102 } | |
| 103 } | |
| 104 VLOG(1) << "Found " << hans_count << " zh-Hans chars in input"; | |
| 105 VLOG(1) << "Found " << hant_count << " zh-Hant chars in input"; | |
| 106 | |
| 107 if (hant_count > hans_count) { | |
| 108 return kChineseTraditionalCode; | |
| 109 } else if (hans_count > hant_count) { | |
| 110 return kChineseSimplifiedCode; | |
| 111 } else { // hans_count == hant_count | |
| 112 // All characters are the same in both scripts. In this case, we return the | |
| 113 // following code. | |
| 114 return kChineseSimplifiedCode; | |
| 115 } | |
| 116 } | |
| 117 | |
| 118 } // namespace translate | |
| OLD | NEW |