| Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
|
| diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
|
| index 476d992a616816ac845d56c88d27756c26438585..d9f09ca967bf15d8191e1a8044ace8cd23c26332 100644
|
| --- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
|
| +++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
|
| @@ -31,8 +31,8 @@
|
| #include "platform/text/TextEncodingDetector.h"
|
|
|
| #include "wtf/text/TextEncoding.h"
|
| -#include <unicode/ucnv.h>
|
| -#include <unicode/ucsdet.h>
|
| +#include "encodings/compact_enc_det/compact_enc_det.h"
|
| +#include "encodings/public/encodings.h"
|
|
|
| namespace blink {
|
|
|
| @@ -40,75 +40,21 @@ bool detectTextEncodingUniversal(const char* data, size_t length,
|
| const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)
|
| {
|
| *detectedEncoding = WTF::TextEncoding();
|
| - int matchesCount = 0;
|
| - UErrorCode status = U_ZERO_ERROR;
|
| - UCharsetDetector* detector = ucsdet_open(&status);
|
| - if (U_FAILURE(status))
|
| - return false;
|
| - ucsdet_enableInputFilter(detector, true);
|
| - ucsdet_setText(detector, data, static_cast<int32_t>(length), &status);
|
| - if (U_FAILURE(status))
|
| - return false;
|
| -
|
| - // FIXME: A few things we can do other than improving
|
| - // the ICU detector itself.
|
| - // 1. Use ucsdet_detectAll and pick the most likely one given
|
| - // "the context" (parent-encoding, referrer encoding, etc).
|
| - // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
|
| - // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
|
| - // encoding with a highest confidence among the detector-specific
|
| - // limited set of candidate encodings.
|
| - // Below is a partial implementation of the first part of what's outlined
|
| - // above.
|
| - const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
|
| - if (U_FAILURE(status)) {
|
| - ucsdet_close(detector);
|
| - return false;
|
| - }
|
| -
|
| - const char* encoding = 0;
|
| - if (hintEncodingName) {
|
| - WTF::TextEncoding hintEncoding(hintEncodingName);
|
| - // 10 is the minimum confidence value consistent with the codepoint
|
| - // allocation in a given encoding. The size of a chunk passed to
|
| - // us varies even for the same html file (apparently depending on
|
| - // the network load). When we're given a rather short chunk, we
|
| - // don't have a sufficiently reliable signal other than the fact that
|
| - // the chunk is consistent with a set of encodings. So, instead of
|
| - // setting an arbitrary threshold, we have to scan all the encodings
|
| - // consistent with the data.
|
| - const int32_t kThresold = 10;
|
| - for (int i = 0; i < matchesCount; ++i) {
|
| - int32_t confidence = ucsdet_getConfidence(matches[i], &status);
|
| - if (U_FAILURE(status)) {
|
| - status = U_ZERO_ERROR;
|
| - continue;
|
| - }
|
| - if (confidence < kThresold)
|
| - break;
|
| - const char* matchEncoding = ucsdet_getName(matches[i], &status);
|
| - if (U_FAILURE(status)) {
|
| - status = U_ZERO_ERROR;
|
| - continue;
|
| - }
|
| - if (WTF::TextEncoding(matchEncoding) == hintEncoding) {
|
| - encoding = hintEncodingName;
|
| - break;
|
| - }
|
| - }
|
| - }
|
| - // If no match is found so far, just pick the top match.
|
| - // This can happen, say, when a parent frame in EUC-JP refers to
|
| - // a child frame in Shift_JIS and both frames do NOT specify the encoding
|
| - // making us resort to auto-detection (when it IS turned on).
|
| - if (!encoding && matchesCount > 0)
|
| - encoding = ucsdet_getName(matches[0], &status);
|
| - if (U_SUCCESS(status)) {
|
| - *detectedEncoding = WTF::TextEncoding(encoding);
|
| - ucsdet_close(detector);
|
| + int bytes_consumed;
|
| + bool is_reliable;
|
| + Encoding encoding = CompactEncDet::DetectEncoding(
|
| + data, length, NULL, NULL, NULL,
|
| + UNKNOWN_ENCODING,
|
| + UNKNOWN_LANGUAGE,
|
| + CompactEncDet::WEB_CORPUS,
|
| + false, // ignore_7bit_mail_encodings = false: 7-bit encodings remain candidates
|
| + &bytes_consumed,
|
| + &is_reliable);
|
| + if (encoding != UNKNOWN_ENCODING) {
|
| + *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding));
|
| + LOG(ERROR) << "Auto-detected encoding: " << encoding;
|
| return true;
|
| }
|
| - ucsdet_close(detector);
|
| return false;
|
| }
|
|
|
|
|