third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc - Issue 523108: Port back CLD to Linux and Mac and fix Trad Chinese detection

Unified Diff: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc

Issue 523108: Port back CLD to Linux and Mac and fix Trad Chinese detection (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 10 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h ('k') | third_party/cld/base/string_util.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc

===================================================================

--- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc (revision 36372)

+++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc (working copy)

@@ -4,95 +4,47 @@

#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h"

-#include <tchar.h>

-#include <windows.h>

+#include <string>

#include <vector> // to compile bar/common/component.h

#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"

-#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h"

-#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h"

+#include "base/string_util.h"

+#include "unicode/normlzr.h"

+#include "unicode/unistr.h"

+#include "unicode/ustring.h"

+std::string NormalizeText(const UChar* text) {

+ // To avoid a copy, use the read-only aliasing ctor.

+ icu::UnicodeString source(1, text, -1);

+ icu::UnicodeString normalized;

+ UErrorCode status = U_ZERO_ERROR;

+ icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);

+ if (U_FAILURE(status))

+ return std::string();

+ normalized.toLower();

+ std::string utf8;

+ // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough

+ // for most web pages) and does pre-flighting followed by malloc for larger

+ // strings. If that proves too slow, allocate a buffer of the maximum size

+ // (UTF-16 length * 3) up front instead, which avoids the pre-flighting pass.

+ return normalized.toUTF8String(utf8);

// Detects a language of the UTF-16 encoded zero-terminated text.

// Returns: Language enum.

Language DetectLanguageOfUnicodeText(

const CompactLangDet::DetectionTables* detection_tables,

- const WCHAR* text, bool is_plain_text,

+ const UChar* text, bool is_plain_text,

bool* is_reliable, int* num_languages,

- DWORD* error_code) {

- if (!text || !num_languages) {

- if (error_code)

- *error_code = ERROR_INVALID_PARAMETER;

+ int* error_code) {

+ if (!text || !num_languages)

return NUM_LANGUAGES;

- }

- // Normalize text first. We do not check the return value here since there

- // is no meaningful recovery we can do in case of failure anyway.

- // Since the vast majority of texts on the Internet is already normalized

- // and languages which require normalization are easy to recognize by CLD

- // anyway, we'll benefit more from trying to detect language in non-normalized

- // text (and, with some probability, fail to recognize it) than to give up

- // right away and return the unknown language here.

- NormalizedUnicodeText nomalized_text;

- nomalized_text.Normalize(NormalizationC, text);

- // Determine the size of the buffer required to store a lowercased text.

- int lowercase_text_size =

- ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,

- nomalized_text.get(), -1,

- NULL, 0);

- if (!lowercase_text_size) {

- if (error_code)

- *error_code = ::GetLastError();

+ // Normalize text to NFC, lowercase and convert to UTF-8.

+ std::string utf8_encoded = NormalizeText(text);

+ if (utf8_encoded.empty())

return NUM_LANGUAGES;

- }

- scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]);

- if (!lowercase_text.get())

- return NUM_LANGUAGES;

- // Covert text to lowercase.

- int lowercasing_result =

- ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,

- nomalized_text.get(), -1,

- lowercase_text.get(), lowercase_text_size);

- if (!lowercasing_result) {

- if (error_code)

- *error_code = ::GetLastError();

- return NUM_LANGUAGES;

- }

- // Determine the size of the buffer required to covert text to UTF-8.

- int utf8_encoded_buffer_size =

- ::WideCharToMultiByte(CP_UTF8, 0,

- lowercase_text.get(), -1,

- NULL, 0,

- NULL, NULL);

- if (!utf8_encoded_buffer_size) {

- if (error_code)

- *error_code = ::GetLastError();

- return NUM_LANGUAGES;

- }

- scoped_array<char> utf8_encoded_buffer(

- new char[utf8_encoded_buffer_size]);

- // Convert text to UTF-8.

- int utf8_encoding_result =

- ::WideCharToMultiByte(CP_UTF8, 0,

- lowercase_text.get(), -1,

- utf8_encoded_buffer.get(), utf8_encoded_buffer_size,

- NULL, NULL);

- if (!utf8_encoding_result) {

- if (error_code)

- *error_code = ::GetLastError();

- return NUM_LANGUAGES;

- }

- if (error_code)

- *error_code = 0;

// Engage core CLD library language detection.

Language language3[3] = {

UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE

@@ -107,8 +59,8 @@

// language3 array is always set according to the detection results and

// is not affected by this heuristic.

CompactLangDet::DetectLanguageSummary(detection_tables,

- utf8_encoded_buffer.get(),

- utf8_encoded_buffer_size,

+ utf8_encoded.c_str(),

+ utf8_encoded.length(),

is_plain_text, language3, percent3,

&text_bytes, is_reliable);