Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(236)

Side by Side Diff: components/translate/core/language_detection/chinese_script_classifier.cc

Issue 2756313002: [Merge M-58] Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh. (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/translate/core/language_detection/chinese_script_classifier.h"
6
7 #include <algorithm>
8 #include <memory>
9 #include <string>
10 #include "base/logging.h"
11 #include "base/strings/string_util.h"
12 #include "third_party/icu/source/common/unicode/unistr.h"
13 #include "third_party/icu/source/i18n/unicode/translit.h"
14
15 namespace translate {
16
namespace {

// Language code returned for Chinese written in the Simplified Han script
// (BCP 47).
constexpr char kChineseSimplifiedCode[] = "zh-Hans";

// Language code returned for Chinese written in the Traditional Han script
// (BCP 47).
constexpr char kChineseTraditionalCode[] = "zh-Hant";

}  // namespace
24
25 ChineseScriptClassifier::ChineseScriptClassifier() {
26 UParseError parse_status;
27 UErrorCode status = U_ZERO_ERROR;
28 // The Transliterator IDs are defined in:
29 // third_party/icu/source/data/translit/root.txt.
30 //
31 // Chromium keeps only a subset of these, defined in:
32 // third_party/icu/source/data/translit/root_subset.txt
33 hans2hant_.reset(icu::Transliterator::createInstance(
34 icu::UnicodeString("Hans-Hant"), UTRANS_FORWARD, parse_status, status));
35 DVLOG(1) << "Hans-Hant Transliterator initialization status: "
36 << u_errorName(status);
37 hant2hans_.reset(icu::Transliterator::createInstance(
38 icu::UnicodeString("Hant-Hans"), UTRANS_FORWARD, parse_status, status));
39 DVLOG(1) << "Hant-Hans Transliterator initialization status: "
40 << u_errorName(status);
41 }
42
43 bool ChineseScriptClassifier::IsInitialized() const {
44 return hans2hant_ && hant2hans_;
45 }
46
47 ChineseScriptClassifier::~ChineseScriptClassifier() {}
48
49 std::string ChineseScriptClassifier::Classify(const std::string& input) const {
50 // If there was a problem with initialization, return the empty string.
51 if (!IsInitialized()) {
52 return "";
53 }
54
55 // Operate only on first 500 bytes.
56 std::string input_subset;
57 base::TruncateUTF8ToByteSize(input, 500, &input_subset);
58
59 // Remove whitespace since transliterators may not preserve it.
60 input_subset.erase(std::remove_if(input_subset.begin(), input_subset.end(),
61 base::IsUnicodeWhitespace),
62 input_subset.end());
63
64 // Convert two copies of the input to icu::UnicodeString. Two copies are
65 // necessary because transliteration happens in place only.
66 icu::UnicodeString original_input =
67 icu::UnicodeString::fromUTF8(input_subset);
68 icu::UnicodeString hant_input = icu::UnicodeString::fromUTF8(input_subset);
69 icu::UnicodeString hans_input = icu::UnicodeString::fromUTF8(input_subset);
70
71 // Get the zh-Hant version of this input.
72 hans2hant_->transliterate(hant_input);
73 // Get the zh-Hans version of this input.
74 hant2hans_->transliterate(hans_input);
75
76 // Debugging only: show the input, the Hant version, and the Hans version.
77 if (VLOG_IS_ON(1)) {
78 std::string hant_string;
79 std::string hans_string;
80 hans_input.toUTF8String(hans_string);
81 hant_input.toUTF8String(hant_string);
82 DVLOG(1) << "Original input:\n" << input_subset;
83 DVLOG(1) << "zh-Hant output:\n" << hant_string;
84 DVLOG(1) << "zh-Hans output:\n" << hans_string;
85 }
86
87 // Count matches between the original input chars and the Hant and Hans
88 // versions of the input.
89 int hant_count = 0;
90 int hans_count = 0;
91
92 // Iterate over all chars in the original input and compute matches between
93 // the Hant version and the Hans version.
94 //
95 // All segments (original, Hant, and Hans) should have the same length, but
96 // in case of some corner case or bug in which they turn out not to be,
97 // we compute the minimum length we are allowed to traverse.
98 const int min_length =
99 std::min(original_input.length(),
100 std::min(hans_input.length(), hant_input.length()));
101 for (int index = 0; index < min_length; ++index) {
102 const auto original_char = original_input.charAt(index);
103 const auto hans_char = hans_input.charAt(index);
104 const auto hant_char = hant_input.charAt(index);
105 if (hans_char == hant_char) {
106 continue;
107 } else if (original_char == hans_char) {
108 // Hans-specific char found.
109 ++hans_count;
110 } else if (original_char == hant_char) {
111 // Hant-specific char found.
112 ++hant_count;
113 }
114 }
115 DVLOG(1) << "Found " << hans_count << " zh-Hans chars in input";
116 DVLOG(1) << "Found " << hant_count << " zh-Hant chars in input";
117
118 if (hant_count > hans_count) {
119 return kChineseTraditionalCode;
120 } else if (hans_count > hant_count) {
121 return kChineseSimplifiedCode;
122 } else { // hans_count == hant_count
123 // All characters are the same in both scripts. In this case, we return the
124 // following code.
125 return kChineseSimplifiedCode;
126 }
127 }
128
129 } // namespace translate
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698