Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(143)

Side by Side Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1125403004: Switch language detection to use CLD2's DetectLanguageCheckUTF8 method. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "components/translate/core/language_detection/language_detection_util.h" 5 #include "components/translate/core/language_detection/language_detection_util.h"
6 6
7 #include "base/logging.h" 7 #include "base/logging.h"
8 #include "base/metrics/field_trial.h" 8 #include "base/metrics/field_trial.h"
9 #include "base/strings/string_split.h" 9 #include "base/strings/string_split.h"
10 #include "base/strings/string_util.h" 10 #include "base/strings/string_util.h"
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
104 &num_languages, NULL, &text_bytes); 104 &num_languages, NULL, &text_bytes);
105 is_valid_language = cld_language != NUM_LANGUAGES && 105 is_valid_language = cld_language != NUM_LANGUAGES &&
106 cld_language != UNKNOWN_LANGUAGE && 106 cld_language != UNKNOWN_LANGUAGE &&
107 cld_language != TG_UNKNOWN_LANGUAGE; 107 cld_language != TG_UNKNOWN_LANGUAGE;
108 break; 108 break;
109 } 109 }
110 #endif 110 #endif
111 #if !defined(CLD_VERSION) || CLD_VERSION==2 111 #if !defined(CLD_VERSION) || CLD_VERSION==2
112 case 2: { 112 case 2: {
113 std::string utf8_text(base::UTF16ToUTF8(text)); 113 std::string utf8_text(base::UTF16ToUTF8(text));
114 CLD2::Language language3[3]; 114 cld_language = CLD2::DetectLanguageCheckUTF8(
115 int percent3[3]; 115 utf8_text.c_str(), (int)utf8_text.size(), true /* is_plain_text */,
droger 2015/05/07 11:56:57 Could you use static_cast here instead of a C-style cast?
Andrew Hayden (chromium.org) 2015/05/07 12:24:05 Done.
116 CLD2::DetectLanguageSummary( 116 &is_reliable, &text_bytes);
117 utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3, 117 if (text_bytes < (int)utf8_text.size() &&
droger 2015/05/07 11:56:56 same here.
Andrew Hayden (chromium.org) 2015/05/07 12:24:05 Done.
118 &text_bytes, &is_reliable); 118 cld_language != CLD2::UNKNOWN_LANGUAGE) {
Andrew Hayden (chromium.org) 2015/05/07 12:24:05 That should be a ==, not a !=.
119 cld_language = language3[0]; 119 // Invalid UTF8 encountered, retry using just text_bytes of data.
120 // In practice this shouldn't happen, as Chromium should sanitize the
droger 2015/05/07 11:56:56 Do you want to add a NOTREACHED here?
Andrew Hayden (chromium.org) 2015/05/07 12:24:05 Done.
121 // text data prior to exposing it to any processing internally.
122 cld_language = CLD2::DetectLanguageCheckUTF8(
123 utf8_text.c_str(), text_bytes, true /* is_plain_text */,
124 &is_reliable, &text_bytes);
125 }
120 is_valid_language = cld_language != CLD2::NUM_LANGUAGES && 126 is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
121 cld_language != CLD2::UNKNOWN_LANGUAGE && 127 cld_language != CLD2::UNKNOWN_LANGUAGE &&
122 cld_language != CLD2::TG_UNKNOWN_LANGUAGE; 128 cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
123 break; 129 break;
124 } 130 }
125 #endif 131 #endif
126 default: 132 default:
127 NOTREACHED(); 133 NOTREACHED();
128 } 134 }
129 135
(...skipping 250 matching lines...) Expand 10 before | Expand all | Expand 10 after
380 // distinguish from English, and the language is one of well-known languages 386 // distinguish from English, and the language is one of well-known languages
381 // which often provide "en-*" meta information mistakenly. 387 // which often provide "en-*" meta information mistakenly.
382 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { 388 for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) {
383 if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) 389 if (cld_language == kWellKnownCodesOnWrongConfiguration[i])
384 return true; 390 return true;
385 } 391 }
386 return false; 392 return false;
387 } 393 }
388 394
389 } // namespace translate 395 } // namespace translate
OLD | NEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698