| Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
|
| diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
|
| index abda74b2b62b0e43cb7d1f4395e99bd359ce5ec6..616aac0114d74ae3f284b97a740124242c7c7e3e 100644
|
| --- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
|
| +++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
|
| @@ -30,8 +30,9 @@
|
|
|
| #include "platform/text/TextEncodingDetector.h"
|
|
|
| -#include "third_party/ced/src/compact_enc_det/compact_enc_det.h"
|
| #include "wtf/text/TextEncoding.h"
|
| +#include <unicode/ucnv.h>
|
| +#include <unicode/ucsdet.h>
|
|
|
| namespace blink {
|
|
|
| @@ -39,20 +40,75 @@
|
| const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)
|
| {
|
| *detectedEncoding = WTF::TextEncoding();
|
| - int consumedBytes;
|
| - bool isReliable;
|
| - Encoding encoding = CompactEncDet::DetectEncoding(
|
| - data, length, nullptr, nullptr, nullptr,
|
| - EncodingNameAliasToEncoding(hintEncodingName),
|
| - UNKNOWN_LANGUAGE,
|
| - CompactEncDet::WEB_CORPUS,
|
| - false, // Include 7-bit encodings
|
| - &consumedBytes,
|
| - &isReliable);
|
| - if (encoding != UNKNOWN_ENCODING) {
|
| - *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding));
|
| + int matchesCount = 0;
|
| + UErrorCode status = U_ZERO_ERROR;
|
| + UCharsetDetector* detector = ucsdet_open(&status);
|
| + if (U_FAILURE(status))
|
| + return false;
|
| + ucsdet_enableInputFilter(detector, true);
|
| + ucsdet_setText(detector, data, static_cast<int32_t>(length), &status);
|
| + if (U_FAILURE(status))
|
| + return false;
|
| +
|
| + // FIXME: A few things we can do other than improving
|
| + // the ICU detector itself.
|
| + // 1. Use ucsdet_detectAll and pick the most likely one given
|
| + // "the context" (parent-encoding, referrer encoding, etc).
|
| + // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
|
| + // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
|
| + // encoding with the highest confidence among the detector-specific
|
| + // limited set of candidate encodings.
|
| + // Below is a partial implementation of the first part of what's outlined
|
| + // above.
|
| + const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
|
| + if (U_FAILURE(status)) {
|
| + ucsdet_close(detector);
|
| + return false;
|
| + }
|
| +
|
| + const char* encoding = 0;
|
| + if (hintEncodingName) {
|
| + WTF::TextEncoding hintEncoding(hintEncodingName);
|
| + // 10 is the minimum confidence value consistent with the codepoint
|
| + // allocation in a given encoding. The size of a chunk passed to
|
| + // us varies even for the same HTML file (apparently depending on
|
| + // the network load). When we're given a rather short chunk, we
|
| + // don't have a sufficiently reliable signal other than the fact that
|
| + // the chunk is consistent with a set of encodings. So, instead of
|
| + // setting an arbitrary threshold, we have to scan all the encodings
|
| + // consistent with the data.
|
| + const int32_t kThresold = 10;
|
| + for (int i = 0; i < matchesCount; ++i) {
|
| + int32_t confidence = ucsdet_getConfidence(matches[i], &status);
|
| + if (U_FAILURE(status)) {
|
| + status = U_ZERO_ERROR;
|
| + continue;
|
| + }
|
| + if (confidence < kThresold)
|
| + break;
|
| + const char* matchEncoding = ucsdet_getName(matches[i], &status);
|
| + if (U_FAILURE(status)) {
|
| + status = U_ZERO_ERROR;
|
| + continue;
|
| + }
|
| + if (WTF::TextEncoding(matchEncoding) == hintEncoding) {
|
| + encoding = hintEncodingName;
|
| + break;
|
| + }
|
| + }
|
| + }
|
| + // If no match is found so far, just pick the top match.
|
| + // This can happen, say, when a parent frame in EUC-JP refers to
|
| + // a child frame in Shift_JIS and both frames do NOT specify the encoding,
|
| + // making us resort to auto-detection (when it IS turned on).
|
| + if (!encoding && matchesCount > 0)
|
| + encoding = ucsdet_getName(matches[0], &status);
|
| + if (U_SUCCESS(status)) {
|
| + *detectedEncoding = WTF::TextEncoding(encoding);
|
| + ucsdet_close(detector);
|
| return true;
|
| }
|
| + ucsdet_close(detector);
|
| return false;
|
| }
|
|
|
|
|