components/translate/core/language_detection/language_detection_util.cc - Issue 1125403004: Switch language detection to use CLD2's DetectLanguageCheckUTF8 method.

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1125403004: Switch language detection to use CLD2's DetectLanguageCheckUTF8 method. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: git cl format Created 5 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "components/translate/core/language_detection/language_detection_util.h"	5 #include "components/translate/core/language_detection/language_detection_util.h"

6	6

7 #include "base/logging.h"	7 #include "base/logging.h"

8 #include "base/metrics/field_trial.h"	8 #include "base/metrics/field_trial.h"

9 #include "base/strings/string_split.h"	9 #include "base/strings/string_split.h"

10 #include "base/strings/string_util.h"	10 #include "base/strings/string_util.h"

(...skipping 70 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
81 return CLD_VERSION;	81 return CLD_VERSION;

82 #endif	82 #endif

83 }	83 }

84	84

85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it	85 // Returns the ISO 639 language code of the specified |text|, or 'unknown' if it

86 // failed.	86 // failed.

87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.	87 // |is_cld_reliable| will be set as true if CLD says the detection is reliable.

88 std::string DetermineTextLanguage(const base::string16& text,	88 std::string DetermineTextLanguage(const base::string16& text,

89 bool* is_cld_reliable) {	89 bool* is_cld_reliable) {

90 std::string language = translate::kUnknownLanguageCode;	90 std::string language = translate::kUnknownLanguageCode;

91 int text_bytes = 0;	91 int num_bytes_evaluated = 0;

92 bool is_reliable = false;	92 bool is_reliable = false;

	93 const bool is_plain_text = true;

93	94

94 // Language or CLD2::Language	95 // Language or CLD2::Language

95 int cld_language = 0;	96 int cld_language = 0;

96 bool is_valid_language = false;	97 bool is_valid_language = false;

97	98

98 switch (GetCLDMajorVersion()) {	99 switch (GetCLDMajorVersion()) {

99 #if !defined(CLD_VERSION) || CLD_VERSION==1	100 #if !defined(CLD_VERSION) || CLD_VERSION==1

100 case 1: {	101 case 1: {

101 int num_languages = 0;	102 int num_languages = 0;

102 cld_language =	103 cld_language = DetectLanguageOfUnicodeText(

103 DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,	104 NULL, text.c_str(), is_plain_text, &is_reliable, &num_languages, NULL,

104 &num_languages, NULL, &text_bytes);	105 &num_bytes_evaluated);

105 is_valid_language = cld_language != NUM_LANGUAGES &&	106 is_valid_language = cld_language != NUM_LANGUAGES &&

106 cld_language != UNKNOWN_LANGUAGE &&	107 cld_language != UNKNOWN_LANGUAGE &&

107 cld_language != TG_UNKNOWN_LANGUAGE;	108 cld_language != TG_UNKNOWN_LANGUAGE;

108 break;	109 break;

109 }	110 }

110 #endif	111 #endif

111 #if !defined(CLD_VERSION) || CLD_VERSION==2	112 #if !defined(CLD_VERSION) || CLD_VERSION==2

112 case 2: {	113 case 2: {

113 std::string utf8_text(base::UTF16ToUTF8(text));	114 const std::string utf8_text(base::UTF16ToUTF8(text));

114 CLD2::Language language3[3];	115 const int num_utf8_bytes = static_cast<int>(utf8_text.size());

115 int percent3[3];	116 const char* raw_utf8_bytes = utf8_text.c_str();

116 CLD2::DetectLanguageSummary(	117 cld_language = CLD2::DetectLanguageCheckUTF8(

117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3,	118 raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable,

118 &text_bytes, &is_reliable);	119 &num_bytes_evaluated);

119 cld_language = language3[0];	120

	121 if (num_bytes_evaluated < num_utf8_bytes &&

	122 cld_language == CLD2::UNKNOWN_LANGUAGE) {

	123 // Invalid UTF8 encountered, see bug http://crbug.com/444258.

	124 // Retry using only the valid characters. This time the check for valid

	125 // UTF8 can be skipped since the precise number of valid bytes is known.

	126 cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated,

	127 is_plain_text, &is_reliable);

	128 }

120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&	129 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&

121 cld_language != CLD2::UNKNOWN_LANGUAGE &&	130 cld_language != CLD2::UNKNOWN_LANGUAGE &&

122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;	131 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;

123 break;	132 break;

124 }	133 }

125 #endif	134 #endif

126 default:	135 default:

127 NOTREACHED();	136 NOTREACHED();

128 }	137 }

129	138

130 if (is_cld_reliable != NULL)	139 if (is_cld_reliable != NULL)

131 *is_cld_reliable = is_reliable;	140 *is_cld_reliable = is_reliable;

132	141

133 // We don't trust the result if the CLD reports that the detection is not	142 // We don't trust the result if the CLD reports that the detection is not

134 // reliable, or if the actual text used to detect the language was less than	143 // reliable, or if the actual text used to detect the language was less than

135 // 100 bytes (short texts can often lead to wrong results).	144 // 100 bytes (short texts can often lead to wrong results).

136 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that	145 // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that

137 // the determined language code is correct with 50% confidence. Chrome should	146 // the determined language code is correct with 50% confidence. Chrome should

138 // handle the real confidence value to judge.	147 // handle the real confidence value to judge.

139 if (is_reliable && text_bytes >= 100 && is_valid_language) {	148 if (is_reliable && num_bytes_evaluated >= 100 && is_valid_language) {

140 // We should not use LanguageCode_ISO_639_1 because it does not cover all	149 // We should not use LanguageCode_ISO_639_1 because it does not cover all

141 // the languages CLD can detect. As a result, it'll return the invalid	150 // the languages CLD can detect. As a result, it'll return the invalid

142 // language code for traditional Chinese among others.	151 // language code for traditional Chinese among others.

143 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and	152 // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and

144 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN	153 // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN

145 // for Simplified Chinese.	154 // for Simplified Chinese.

146 switch (GetCLDMajorVersion()) {	155 switch (GetCLDMajorVersion()) {

147 #if !defined(CLD_VERSION) || CLD_VERSION==1	156 #if !defined(CLD_VERSION) || CLD_VERSION==1

148 case 1:	157 case 1:

149 language =	158 language =

(...skipping 230 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
380 // distinguish from English, and the language is one of well-known languages	389 // distinguish from English, and the language is one of well-known languages

381 // which often provide "en-*" meta information mistakenly.	390 // which often provide "en-*" meta information mistakenly.

382 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {	391 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {

383 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])	392 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])

384 return true;	393 return true;

385 }	394 }

386 return false;	395 return false;

387 }	396 }

388	397

389 } // namespace translate	398 } // namespace translate

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »