Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(209)

Side by Side Diff: components/translate/core/language_detection/chinese_script_classifier.cc

Issue 2732023003: Adds ChineseScriptClassifier to predict zh-Hant or zh-Hans for input detected as zh. (Closed)
Patch Set: Fixes broken test Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "components/translate/core/language_detection/chinese_script_classifier.h"
6
7 #include <algorithm>
8 #include <cctype>
9 #include <memory>
10 #include <string>
11 #include "base/logging.h"
12 #include "base/strings/string_util.h"
13 #include "third_party/icu/source/common/unicode/unistr.h"
14 #include "third_party/icu/source/i18n/unicode/translit.h"
15
16 namespace translate {
17
// Language code Classify() returns for input judged to be Simplified Chinese
// (also its result when neither script has more matches).
18 const char ChineseScriptClassifier::kChineseSimplifiedCode[] = "zh-Hans";
// Language code Classify() returns for input judged to be Traditional Chinese.
19 const char ChineseScriptClassifier::kChineseTraditionalCode[] = "zh-Hant";
20
21 ChineseScriptClassifier::ChineseScriptClassifier() {
22 UParseError parse_status;
23 UErrorCode status = U_ZERO_ERROR;
24 // The Transliterator IDs are defined in:
25 // third_party/icu/source/data/translit/root.txt.
26 //
27 // Chromium keeps only a subset of these, defined in:
28 // third_party/icu/source/data/translit/root_subset.txt
29 hans2hant_.reset(icu::Transliterator::createInstance(
30 icu::UnicodeString("Hans-Hant"), UTRANS_FORWARD, parse_status, status));
31 VLOG(1) << "Hans-Hant Transliterator initialization status: "
32 << u_errorName(status);
33 hant2hans_.reset(icu::Transliterator::createInstance(
34 icu::UnicodeString("Hant-Hans"), UTRANS_FORWARD, parse_status, status));
35 VLOG(1) << "Hant-Hans Transliterator initialization status: "
36 << u_errorName(status);
37 }
38
39 bool ChineseScriptClassifier::IsInitialized() const {
40 return hans2hant_ && hant2hans_;
41 }
42
43 ChineseScriptClassifier::~ChineseScriptClassifier() {}
44
45 std::string ChineseScriptClassifier::Classify(const std::string& input) const {
46 // If there was a problem with initialization, return the empty string.
47 if (!IsInitialized()) {
48 return "";
49 }
50
51 // Operate only on first 500 bytes.
52 std::string input_subset;
53 base::TruncateUTF8ToByteSize(input, 500, &input_subset);
54
55 // Remove whitespace since transliterators may not preserve it.
56 input_subset.erase(std::remove_if(input_subset.begin(), input_subset.end(),
57 base::IsUnicodeWhitespace),
58 input_subset.end());
59
60 // Convert two copies of the input to icu::UnicodeString. Two copies are
61 // necessary because transliteration happens in place only.
62 icu::UnicodeString original_input =
63 icu::UnicodeString::fromUTF8(input_subset);
64 icu::UnicodeString hant_input = icu::UnicodeString::fromUTF8(input_subset);
65 icu::UnicodeString hans_input = icu::UnicodeString::fromUTF8(input_subset);
66
67 // Get the zh-Hant version of this input.
68 hans2hant_->transliterate(hant_input);
69 // Get the zh-Hans version of this input.
70 hant2hans_->transliterate(hans_input);
71
72 // Debugging only: show the input, the Hant version, and the Hans version.
73 if (VLOG_IS_ON(1)) {
74 std::string hant_string;
75 std::string hans_string;
76 hans_input.toUTF8String(hans_string);
77 hant_input.toUTF8String(hant_string);
78 VLOG(1) << "Original input:\n" << input_subset;
groby-ooo-7-16 2017/03/08 00:41:54 Are you planning to keep the VLOG forever, or is t
riesa 2017/03/08 01:47:42 Yes, I was planning to keep it in case of bug repo
groby1 2017/03/08 02:00:57 I'm torn. Each VLOG increases binary size, which a
79 VLOG(1) << "zh-Hant output:\n" << hant_string;
80 VLOG(1) << "zh-Hans output:\n" << hans_string;
81 }
82
83 // Count matches between the original input chars and the Hant and Hans
84 // versions of the input.
85 int hant_count = 0;
86 int hans_count = 0;
87 for (int index = 0;
88 index < original_input.length() && index < hans_input.length() &&
groby-ooo-7-16 2017/03/08 00:41:54 nit: please do compute min outside loop
riesa 2017/03/08 01:47:42 Done.
89 index < hant_input.length();
90 ++index) {
91 const auto original_char = original_input.charAt(index);
92 const auto hans_char = hans_input.charAt(index);
groby-ooo-7-16 2017/03/08 00:41:54 Bit concerned by the fact that charAt needs to rep
riesa 2017/03/08 01:47:42 Hm, from what I can tell charAt is just indexing r
groby1 2017/03/08 02:00:57 Gah. I misread the types to be still UTF8. Never m
93 const auto hant_char = hant_input.charAt(index);
94 if (hans_char == hant_char) {
groby-ooo-7-16 2017/03/08 00:41:54 Possible simplification (no braces, less branches,
riesa 2017/03/08 01:47:42 I originally had it this way in an earlier patch,
groby1 2017/03/08 02:00:57 I don't think there's a performance penalty worth
95 continue;
96 } else if (original_char == hans_char) {
97 // Input matches a Hans-only char.
98 ++hans_count;
99 } else if (original_char == hant_char) {
100 // Input matches a Hant-only char.
101 ++hant_count;
102 }
103 }
104 VLOG(1) << "Found " << hans_count << " zh-Hans chars in input";
105 VLOG(1) << "Found " << hant_count << " zh-Hant chars in input";
106
107 if (hant_count > hans_count) {
108 return kChineseTraditionalCode;
109 } else if (hans_count > hant_count) {
110 return kChineseSimplifiedCode;
111 } else { // hans_count == hant_count
112 // All characters are the same in both scripts. In this case, we return the
113 // following code.
114 return kChineseSimplifiedCode;
115 }
116 }
117
118 } // namespace translate
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698