Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(456)

Unified Diff: components/translate/core/language_detection/language_detection_util.cc

Issue 1263613002: Implement CLD hints to CLD2 calls. Edit CLD2 result to return top language instead of summary language… (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: DetectLanguageSummaryV2 function call change Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: components/translate/core/language_detection/language_detection_util.cc
diff --git a/components/translate/core/language_detection/language_detection_util.cc b/components/translate/core/language_detection/language_detection_util.cc
index 5c751a7ffd59fb1427bb7597a306bfb2ba9a0d3c..1697fbed5ba4bd6a0c7b5dd54cc4ea747e11df5c 100644
--- a/components/translate/core/language_detection/language_detection_util.cc
+++ b/components/translate/core/language_detection/language_detection_util.cc
@@ -21,6 +21,7 @@
#if !defined(CLD_VERSION) || CLD_VERSION==2
#include "third_party/cld_2/src/public/compact_lang_det.h"
+#include "third_party/cld_2/src/public/encodings.h"
#endif
namespace {
@@ -86,7 +87,9 @@ int GetCLDMajorVersion() {
// failed.
// |is_cld_reliable| will be set as true if CLD says the detection is reliable.
std::string DetermineTextLanguage(const base::string16& text,
- bool* is_cld_reliable) {
+ bool* is_cld_reliable,
+ std::string& code,
+ std::string& html_lang) {
std::string language = translate::kUnknownLanguageCode;
int num_bytes_evaluated = 0;
bool is_reliable = false;
@@ -114,21 +117,41 @@ std::string DetermineTextLanguage(const base::string16& text,
const std::string utf8_text(base::UTF16ToUTF8(text));
const int num_utf8_bytes = static_cast<int>(utf8_text.size());
const char* raw_utf8_bytes = utf8_text.c_str();
- cld_language = CLD2::DetectLanguageCheckUTF8(
- raw_utf8_bytes, num_utf8_bytes, is_plain_text, &is_reliable,
- &num_bytes_evaluated);
+
+ CLD2::Language language3[3];
+ int percent3[3];
+ int flags = 0; // No flags, see compact_lang_det.h for details.
+ int text_bytes; // Amount of non-tag/letters-only text (assumed 0).
+ double normalized_score3[3];
+
+ const char* tld_hint = "";
+ int encoding_hint = CLD2::UNKNOWN_ENCODING;
+ CLD2::Language language_hint =
+ CLD2::GetLanguageFromName(html_lang.c_str());
+ CLD2::CLDHints cldhints = {code.c_str(), tld_hint, encoding_hint,
+ language_hint};
+
+ cld_language = CLD2::ExtDetectLanguageSummaryCheckUTF8(
+ raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
+ language3, percent3, normalized_score3, nullptr, &text_bytes,
Andrew Hayden (chromium.org) 2015/08/05 11:22:45 Please comment the meaning of the nullptr here, we
+ &is_reliable, &num_bytes_evaluated);
if (num_bytes_evaluated < num_utf8_bytes &&
cld_language == CLD2::UNKNOWN_LANGUAGE) {
// Invalid UTF8 encountered, see bug http://crbug.com/444258.
// Retry using only the valid characters. This time the check for valid
// UTF8 can be skipped since the precise number of valid bytes is known.
- cld_language = CLD2::DetectLanguage(raw_utf8_bytes, num_bytes_evaluated,
- is_plain_text, &is_reliable);
+ cld_language = CLD2::ExtDetectLanguageSummary(
+ raw_utf8_bytes, num_utf8_bytes, is_plain_text, &cldhints, flags,
+ language3, percent3, normalized_score3, nullptr, &text_bytes,
Andrew Hayden (chromium.org) 2015/08/05 11:22:45 And same here, please
+ &is_reliable);
}
is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
cld_language != CLD2::UNKNOWN_LANGUAGE &&
cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
+
+ // Choose top language.
+ cld_language = language3[0];
break;
}
#endif
@@ -213,15 +236,6 @@ std::string DeterminePageLanguage(const std::string& code,
bool* is_cld_reliable_p) {
base::TimeTicks begin_time = base::TimeTicks::Now();
bool is_cld_reliable;
- std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
- translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
-
- if (cld_language_p != NULL)
- *cld_language_p = cld_language;
- if (is_cld_reliable_p != NULL)
- *is_cld_reliable_p = is_cld_reliable;
- translate::ToTranslateLanguageSynonym(&cld_language);
-
// Check if html lang attribute is valid.
std::string modified_html_lang;
if (!html_lang.empty()) {
@@ -239,6 +253,16 @@ std::string DeterminePageLanguage(const std::string& code,
translate::ReportContentLanguage(code, modified_code);
}
+ std::string cld_language = DetermineTextLanguage(
+ contents, &is_cld_reliable, modified_code, modified_html_lang);
+ translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
+
+ if (cld_language_p != NULL)
+ *cld_language_p = cld_language;
+ if (is_cld_reliable_p != NULL)
+ *is_cld_reliable_p = is_cld_reliable;
+ translate::ToTranslateLanguageSynonym(&cld_language);
+
// Adopt |modified_html_lang| if it is valid. Otherwise, adopt
// |modified_code|.
std::string language = modified_html_lang.empty() ? modified_code :
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698