third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp - Issue 2110833005: Revert of Reland "Replace ICU with CED for auto encoding detection"

Unified Diff: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

Issue 2110833005: Revert of Reland "Replace ICU with CED for auto encoding detection" (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

index abda74b2b62b0e43cb7d1f4395e99bd359ce5ec6..616aac0114d74ae3f284b97a740124242c7c7e3e 100644

--- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

+++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

@@ -30,8 +30,9 @@

#include "platform/text/TextEncodingDetector.h"

-#include "third_party/ced/src/compact_enc_det/compact_enc_det.h"

#include "wtf/text/TextEncoding.h"

+#include <unicode/ucnv.h>

+#include <unicode/ucsdet.h>

namespace blink {

@@ -39,20 +40,75 @@

const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)

{

*detectedEncoding = WTF::TextEncoding();

- int consumedBytes;

- bool isReliable;

- Encoding encoding = CompactEncDet::DetectEncoding(

- data, length, nullptr, nullptr, nullptr,

- EncodingNameAliasToEncoding(hintEncodingName),

- UNKNOWN_LANGUAGE,

- CompactEncDet::WEB_CORPUS,

- false, // Include 7-bit encodings

- &consumedBytes,

- &isReliable);

- if (encoding != UNKNOWN_ENCODING) {

- *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding));

+ int matchesCount = 0;

+ UErrorCode status = U_ZERO_ERROR;

+ UCharsetDetector* detector = ucsdet_open(&status);

+ if (U_FAILURE(status))

+ return false;

+ ucsdet_enableInputFilter(detector, true);

+ ucsdet_setText(detector, data, static_cast<int32_t>(length), &status);

+ if (U_FAILURE(status))

+ return false;

+ // FIXME: A few things we can do other than improving

+ // the ICU detector itself.

+ // 1. Use ucsdet_detectAll and pick the most likely one given

+ // "the context" (parent-encoding, referrer encoding, etc).

+ // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.

+ // Chinese, Japanese, Russian, Korean and Hebrew) by picking the

+ // encoding with the highest confidence among the detector-specific

+ // limited set of candidate encodings.

+ // Below is a partial implementation of the first part of what's outlined

+ // above.

+ const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);

+ if (U_FAILURE(status)) {

+ ucsdet_close(detector);

+ return false;

+ }

+ const char* encoding = 0;

+ if (hintEncodingName) {

+ WTF::TextEncoding hintEncoding(hintEncodingName);

+ // 10 is the minimum confidence value consistent with the codepoint

+ // allocation in a given encoding. The size of a chunk passed to

+ // us varies even for the same html file (apparently depending on

+ // the network load). When we're given a rather short chunk, we

+ // don't have a sufficiently reliable signal other than the fact that

+ // the chunk is consistent with a set of encodings. So, instead of

+ // setting an arbitrary threshold, we have to scan all the encodings

+ // consistent with the data.

+ const int32_t kThresold = 10;

+ for (int i = 0; i < matchesCount; ++i) {

+ int32_t confidence = ucsdet_getConfidence(matches[i], &status);

+ if (U_FAILURE(status)) {

+ status = U_ZERO_ERROR;

+ continue;

+ }

+ if (confidence < kThresold)

+ break;

+ const char* matchEncoding = ucsdet_getName(matches[i], &status);

+ if (U_FAILURE(status)) {

+ status = U_ZERO_ERROR;

+ continue;

+ }

+ if (WTF::TextEncoding(matchEncoding) == hintEncoding) {

+ encoding = hintEncodingName;

+ break;

+ }

+ // If no match is found so far, just pick the top match.

+ // This can happen, say, when a parent frame in EUC-JP refers to

+ // a child frame in Shift_JIS and both frames do NOT specify the encoding,

+ // making us resort to auto-detection (when it IS turned on).

+ if (!encoding && matchesCount > 0)

+ encoding = ucsdet_getName(matches[0], &status);

+ if (U_SUCCESS(status)) {

+ *detectedEncoding = WTF::TextEncoding(encoding);

+ ucsdet_close(detector);

return true;

}

+ ucsdet_close(detector);

return false;

}

« no previous file with comments | « third_party/WebKit/Source/platform/blink_platform.gyp ('k') | no next file » | no next file with comments »