Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc |
=================================================================== |
--- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc (revision 0) |
+++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc (revision 0) |
@@ -0,0 +1,130 @@ |
+// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h" |
+ |
+#include <tchar.h> |
+#include <windows.h> |
+ |
+#include <vector> // to compile bar/common/component.h |
+ |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h" |
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h" |
+ |
+ |
+// Detects the language of the UTF-16 encoded, zero-terminated |text|. |
+// |is_plain_text| and |is_reliable| are forwarded to CLD's |
+// DetectLanguageSummary; |num_languages| (required) receives the count of |
+// languages detected in at least 20% of the text; |error_code| (optional) |
+// receives 0 on success or the Windows error code on conversion failure. |
+// Returns: the top detected Language, or NUM_LANGUAGES on failure. |
+// TODO: reuse buffers across Windows API calls to avoid repeated new/free. |
+Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text, |
+                                     bool* is_reliable, int* num_languages, |
+                                     DWORD* error_code) { |
+  if (!text || !num_languages) { |
+    if (error_code) |
+      *error_code = ERROR_INVALID_PARAMETER; |
+    return NUM_LANGUAGES; |
+  } |
+ |
+  // Normalize text first. We do not check the return value here since there |
+  // is no meaningful recovery we can do in case of failure anyway. |
+  // Since the vast majority of texts on the Internet is already normalized |
+  // and languages which require normalization are easy to recognize by CLD |
+  // anyway, we'll benefit more from trying to detect language in non-normalized |
+  // text (and, with some probability, fail to recognize it) than to give up |
+  // right away and return the unknown language here. |
+  NormalizedUnicodeText normalized_text; |
+  normalized_text.Normalize(NormalizationC, text); |
+ |
+  // Determine the size of the buffer required to store a lowercased text. |
+  int lowercase_text_size = |
+      ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING, |
+                    normalized_text.get(), -1, |
+                    NULL, 0); |
+  if (!lowercase_text_size) { |
+    if (error_code) |
+      *error_code = ::GetLastError(); |
+    return NUM_LANGUAGES; |
+  } |
+ |
+  scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]); |
+  if (!lowercase_text.get()) |
+    return NUM_LANGUAGES; |
+ |
+  // Convert text to lowercase. |
+  int lowercasing_result = |
+      ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING, |
+                    normalized_text.get(), -1, |
+                    lowercase_text.get(), lowercase_text_size); |
+  if (!lowercasing_result) { |
+    if (error_code) |
+      *error_code = ::GetLastError(); |
+    return NUM_LANGUAGES; |
+  } |
+ |
+  // Determine the size of the buffer required to convert text to UTF-8. |
+  int utf8_encoded_buffer_size = |
+      ::WideCharToMultiByte(CP_UTF8, 0, |
+                            lowercase_text.get(), -1, |
+                            NULL, 0, |
+                            NULL, NULL); |
+  if (!utf8_encoded_buffer_size) { |
+    if (error_code) |
+      *error_code = ::GetLastError(); |
+    return NUM_LANGUAGES; |
+  } |
+ |
+  scoped_array<char> utf8_encoded_buffer(new char[utf8_encoded_buffer_size]); |
+  if (!utf8_encoded_buffer.get()) |
+    return NUM_LANGUAGES; |
+ |
+  // Convert text to UTF-8. |
+  int utf8_encoding_result = |
+      ::WideCharToMultiByte(CP_UTF8, 0, |
+                            lowercase_text.get(), -1, |
+                            utf8_encoded_buffer.get(), utf8_encoded_buffer_size, |
+                            NULL, NULL); |
+  if (!utf8_encoding_result) { |
+    if (error_code) |
+      *error_code = ::GetLastError(); |
+    return NUM_LANGUAGES; |
+  } |
+ |
+  if (error_code) |
+    *error_code = 0; |
+ |
+  // Engage core CLD library language detection. |
+  Language language3[3] = |
+      { UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE }; |
+  int percent3[3] = { 0, 0, 0 }; |
+  int text_bytes = 0; |
+  // We ignore return value here due to the problem described in bug 1800161. |
+  // For example, translate.google.com was detected as Indonesian. It happened |
+  // due to the heuristic in CLD, which ignores English as a top language |
+  // in the presence of another reliably detected language. |
+  // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function. |
+  // language3 array is always set according to the detection results and |
+  // is not affected by this heuristic. |
+  CompactLangDet::DetectLanguageSummary(utf8_encoded_buffer.get(), |
+                                        utf8_encoded_buffer_size - 1,  // no NUL |
+                                        is_plain_text, language3, percent3, |
+                                        &text_bytes, is_reliable); |
+ |
+  // Calculate the number of languages detected in more than 20% of the text. |
+  const int kMinTextPercentToCountLanguage = 20; |
+  *num_languages = 0; |
+  COMPILE_ASSERT(ARRAYSIZE(language3) == ARRAYSIZE(percent3), |
+                 language3_and_percent3_should_be_of_the_same_size); |
+  for (int i = 0; i < ARRAYSIZE(language3); ++i) { |
+    if (IsValidLanguage(language3[i]) && !IS_LANGUAGE_UNKNOWN(language3[i]) && |
+        percent3[i] >= kMinTextPercentToCountLanguage) { |
+      ++*num_languages; |
+    } |
+  } |
+ |
+  return language3[0]; |
+} |
Property changes on: third_party\cld\bar\toolbar\cld\i18n\encodings\compact_lang_det\win\cld_unicodetext.cc |
___________________________________________________________________ |
Added: svn:eol-style |
+ LF |