Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(203)

Unified Diff: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc

Issue 551070: Revert 36541 (which went in without any commit log by some black magic). ... (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src/
Patch Set: Created 10 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
===================================================================
--- third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc (revision 36549)
+++ third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc (working copy)
@@ -4,47 +4,95 @@
#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h"
-#include <string>
+#include <tchar.h>
+#include <windows.h>
+
#include <vector> // to compile bar/common/component.h
#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"
-#include "base/string_util.h"
-#include "unicode/normlzr.h"
-#include "unicode/unistr.h"
-#include "unicode/ustring.h"
+#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h"
+#include "bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h"
-std::string NormalizeText(const UChar* text) {
- // To avoid a copy, use the read-only aliasing ctor.
- icu::UnicodeString source(1, text, -1);
- icu::UnicodeString normalized;
- UErrorCode status = U_ZERO_ERROR;
- icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
- if (U_FAILURE(status))
- return std::string();
- normalized.toLower();
- std::string utf8;
- // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
- // for most web pages) and does pre-flighting followed by malloc for larger
- // strings. We have to switch to obtaining the buffer with the maximum size
- // (UTF-16 length * 3) without pre-flighting if necessary.
- return normalized.toUTF8String(utf8);
-}
-
// Detects a language of the UTF-16 encoded zero-terminated text.
// Returns: Language enum.
Language DetectLanguageOfUnicodeText(
const CompactLangDet::DetectionTables* detection_tables,
- const UChar* text, bool is_plain_text,
+ const WCHAR* text, bool is_plain_text,
bool* is_reliable, int* num_languages,
- int* error_code) {
- if (!text || !num_languages)
+ DWORD* error_code) {
+ if (!text || !num_languages) {
+ if (error_code)
+ *error_code = ERROR_INVALID_PARAMETER;
return NUM_LANGUAGES;
- // Normalize text to NFC, lowercase and convert to UTF-8.
- std::string utf8_encoded = NormalizeText(text);
- if (utf8_encoded.empty())
+ }
+
+ // Normalize text first. We do not check the return value here since there
+ // is no meaningful recovery we can do in case of failure anyway.
+ // Since the vast majority of texts on the Internet is already normalized
+ // and languages which require normalization are easy to recognize by CLD
+ // anyway, we'll benefit more from trying to detect language in non-normalized
+ // text (and, with some probability, fail to recognize it) than to give up
+ // right away and return the unknown language here.
+ NormalizedUnicodeText nomalized_text;
+ nomalized_text.Normalize(NormalizationC, text);
+
+ // Determine the size of the buffer required to store a lowercased text.
+ int lowercase_text_size =
+ ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
+ nomalized_text.get(), -1,
+ NULL, 0);
+ if (!lowercase_text_size) {
+ if (error_code)
+ *error_code = ::GetLastError();
return NUM_LANGUAGES;
+ }
+ scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]);
+ if (!lowercase_text.get())
+ return NUM_LANGUAGES;
+
+ // Covert text to lowercase.
+ int lowercasing_result =
+ ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
+ nomalized_text.get(), -1,
+ lowercase_text.get(), lowercase_text_size);
+ if (!lowercasing_result) {
+ if (error_code)
+ *error_code = ::GetLastError();
+ return NUM_LANGUAGES;
+ }
+
+ // Determine the size of the buffer required to covert text to UTF-8.
+ int utf8_encoded_buffer_size =
+ ::WideCharToMultiByte(CP_UTF8, 0,
+ lowercase_text.get(), -1,
+ NULL, 0,
+ NULL, NULL);
+ if (!utf8_encoded_buffer_size) {
+ if (error_code)
+ *error_code = ::GetLastError();
+ return NUM_LANGUAGES;
+ }
+
+ scoped_array<char> utf8_encoded_buffer(
+ new char[utf8_encoded_buffer_size]);
+
+ // Convert text to UTF-8.
+ int utf8_encoding_result =
+ ::WideCharToMultiByte(CP_UTF8, 0,
+ lowercase_text.get(), -1,
+ utf8_encoded_buffer.get(), utf8_encoded_buffer_size,
+ NULL, NULL);
+ if (!utf8_encoding_result) {
+ if (error_code)
+ *error_code = ::GetLastError();
+ return NUM_LANGUAGES;
+ }
+
+ if (error_code)
+ *error_code = 0;
+
// Engage core CLD library language detection.
Language language3[3] = {
UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
@@ -59,8 +107,8 @@
// language3 array is always set according to the detection results and
// is not affected by this heuristic.
CompactLangDet::DetectLanguageSummary(detection_tables,
- utf8_encoded.c_str(),
- utf8_encoded.length(),
+ utf8_encoded_buffer.get(),
+ utf8_encoded_buffer_size,
is_plain_text, language3, percent3,
&text_bytes, is_reliable);

Powered by Google App Engine
This is Rietveld 408576698