| Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
|
| ===================================================================
|
| --- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc (revision 0)
|
| +++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc (revision 0)
|
| @@ -0,0 +1,130 @@
|
| +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h"
|
| +
|
| +#include <tchar.h>
|
| +#include <windows.h>
|
| +
|
| +#include <vector> // to compile bar/common/component.h
|
| +
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h"
|
| +#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h"
|
| +
|
| +
|
| +// Detects the language of the given UTF-16 encoded, zero-terminated text.
|
| +// Returns: the detected Language enum value (NUM_LANGUAGES on failure).
|
| +// TODO: Make this function reuse already allocated buffers to avoid excessive
|
| +// allocate/free call pairs. The idea is to have two buffers allocated and to
|
| +// alternate their use for every Windows API call.
|
| +// Let's leave it as is for now, simple and working, and optimize it as a next
|
| +// step if it turns out to consume too many resources (after careful
|
| +// measurement, of course).
|
| +Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
|
| + bool* is_reliable, int* num_languages,
|
| + DWORD* error_code) {
|
| + if (!text || !num_languages) {
|
| + if (error_code)
|
| + *error_code = ERROR_INVALID_PARAMETER;
|
| + return NUM_LANGUAGES;
|
| + }
|
| +
|
| + // Normalize the text first. We do not check the return value here since
|
| + // there is no meaningful recovery we could do in case of failure anyway.
|
| + // Since the vast majority of text on the Internet is already normalized,
|
| + // and languages that require normalization are easy for CLD to recognize
|
| + // anyway, we benefit more from trying to detect the language in
|
| + // non-normalized text (and, with some probability, failing to recognize it)
|
| + // than from giving up right away and returning the unknown language here.
|
| + NormalizedUnicodeText normalized_text;
|
| + normalized_text.Normalize(NormalizationC, text);
|
| +
|
| + // Determine the size of the buffer required to store the lowercased text.
|
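| + // With a NULL output buffer and zero size, LCMapString returns the required
|
| + // size in characters; since the source length is passed as -1, that size
|
| + // includes the terminating null character.
|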
| + int lowercase_text_size =
|
| + ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
|
| + normalized_text.get(), -1,
|
| + NULL, 0);
|
| + if (!lowercase_text_size) {
|
| + if (error_code)
|
| + *error_code = ::GetLastError();
|
| + return NUM_LANGUAGES;
|
| + }
|
| +
|
| + scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]);
|
| + if (!lowercase_text.get())
|
| + return NUM_LANGUAGES;
|
| +
|
| + // Convert the text to lowercase.
|
| + int lowercasing_result =
|
| + ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
|
| + normalized_text.get(), -1,
|
| + lowercase_text.get(), lowercase_text_size);
|
| + if (!lowercasing_result) {
|
| + if (error_code)
|
| + *error_code = ::GetLastError();
|
| + return NUM_LANGUAGES;
|
| + }
|
| +
|
| + // Determine the size of the buffer required to convert the text to UTF-8.
|
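| + // Note that WideCharToMultiByte returns the required size in bytes here,
|
| + // including the terminating null, again because the source length is -1.
|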
| + int utf8_encoded_buffer_size =
|
| + ::WideCharToMultiByte(CP_UTF8, 0,
|
| + lowercase_text.get(), -1,
|
| + NULL, 0,
|
| + NULL, NULL);
|
| + if (!utf8_encoded_buffer_size) {
|
| + if (error_code)
|
| + *error_code = ::GetLastError();
|
| + return NUM_LANGUAGES;
|
| + }
|
| +
|
| + scoped_array<char> utf8_encoded_buffer(
|
| + new char[utf8_encoded_buffer_size]);
|
| +
|
| + // Convert text to UTF-8.
|
| + int utf8_encoding_result =
|
| + ::WideCharToMultiByte(CP_UTF8, 0,
|
| + lowercase_text.get(), -1,
|
| + utf8_encoded_buffer.get(), utf8_encoded_buffer_size,
|
| + NULL, NULL);
|
| + if (!utf8_encoding_result) {
|
| + if (error_code)
|
| + *error_code = ::GetLastError();
|
| + return NUM_LANGUAGES;
|
| + }
|
| +
|
| + if (error_code)
|
| + *error_code = 0;
|
| +
|
| + // Engage core CLD library language detection.
|
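| + // DetectLanguageSummary fills language3 with the top three detected
|
| + // languages and percent3 with the percentage of the text attributed to each.
|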
| + Language language3[3] = {
|
| + UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
|
| + };
|
| + int percent3[3] = { 0, 0, 0 };
|
| + int text_bytes = 0;
|
| + // We ignore the return value here due to the problem described in bug
|
| + // 1800161. For example, translate.google.com was detected as Indonesian.
|
| + // That happened due to a heuristic in CLD which ignores English as the top
|
| + // language in the presence of another reliably detected language.
|
| + // See the actual code in compact_lang_det_impl.cc, the CalcSummaryLang
|
| + // function. The language3 array is always set according to the detection
|
| + // results and is not affected by this heuristic.
|
| + CompactLangDet::DetectLanguageSummary(utf8_encoded_buffer.get(),
|
| + utf8_encoded_buffer_size,
|
| + is_plain_text, language3, percent3,
|
| + &text_bytes, is_reliable);
|
| +
|
| + // Calculate the number of languages detected in at least 20% of the text.
|
| + const int kMinTextPercentToCountLanguage = 20;
|
| + *num_languages = 0;
|
| + COMPILE_ASSERT(ARRAYSIZE(language3) == ARRAYSIZE(percent3),
|
| + language3_and_percent3_should_be_of_the_same_size);
|
| + for (int i = 0; i < ARRAYSIZE(language3); ++i) {
|
| + if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) &&
|
| + percent3[i] >= kMinTextPercentToCountLanguage) {
|
| + ++*num_languages;
|
| + }
|
| + }
|
| +
|
| + return language3[0];
|
| +}
|
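| +
|
| +// Illustrative usage sketch (not part of the patch's call sites; the input
|
| +// string and local variable names below are made up for the example):
|
| +//   bool is_reliable = false;
|
| +//   int num_languages = 0;
|
| +//   DWORD error_code = 0;
|
| +//   Language language = DetectLanguageOfUnicodeText(
|
| +//       L"Bonjour tout le monde", true /* is_plain_text */,
|
| +//       &is_reliable, &num_languages, &error_code);
|
| +//   if (language == NUM_LANGUAGES) {
|
| +//     // Detection failed; error_code holds the Windows error, if any.
|
| +//   }
|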
|
|
| Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\compact_lang_det\win\cld_unicodetext.cc
|
| ___________________________________________________________________
|
| Added: svn:eol-style
|
| + LF
|
|
|
|
|