third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp - Issue 2081653007: Replace ICU with CED for auto encoding detection

Unified Diff: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

Issue 2081653007: Replace ICU with CED for auto encoding detection (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: fix trybot builds Created 4 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

index 616aac0114d74ae3f284b97a740124242c7c7e3e..1f5dd83d53cc86eeeb8d9fe270af736daa960920 100644

--- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

+++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

@@ -31,8 +31,7 @@

#include "platform/text/TextEncodingDetector.h"

#include "wtf/text/TextEncoding.h"

-#include <unicode/ucnv.h>

-#include <unicode/ucsdet.h>

+#include <compact_enc_det/compact_enc_det.h>

tkent 2016/06/22 23:36:40 Please don't use |#include <>| for non-system headers

Jinsuk Kim 2016/06/23 00:45:57 Done. Just curious - I was following what was done

tkent 2016/06/23 03:30:39 It's legacy code inherited from WebKit. We shou

namespace blink {

@@ -40,75 +39,20 @@ bool detectTextEncoding(const char* data, size_t length,

const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)

{

*detectedEncoding = WTF::TextEncoding();

- int matchesCount = 0;

- UErrorCode status = U_ZERO_ERROR;

- UCharsetDetector* detector = ucsdet_open(&status);

- if (U_FAILURE(status))

- return false;

- ucsdet_enableInputFilter(detector, true);

- ucsdet_setText(detector, data, static_cast<int32_t>(length), &status);

- if (U_FAILURE(status))

- return false;

- // FIXME: A few things we can do other than improving

- // the ICU detector itself.

- // 1. Use ucsdet_detectAll and pick the most likely one given

- // "the context" (parent-encoding, referrer encoding, etc).

- // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.

- // Chinese, Japanese, Russian, Korean and Hebrew) by picking the

- // encoding with a highest confidence among the detector-specific

- // limited set of candidate encodings.

- // Below is a partial implementation of the first part of what's outlined

- // above.

- const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);

- if (U_FAILURE(status)) {

- ucsdet_close(detector);

- return false;

- }

- const char* encoding = 0;

- if (hintEncodingName) {

- WTF::TextEncoding hintEncoding(hintEncodingName);

- // 10 is the minimum confidence value consistent with the codepoint

- // allocation in a given encoding. The size of a chunk passed to

- // us varies even for the same html file (apparently depending on

- // the network load). When we're given a rather short chunk, we

- // don't have a sufficiently reliable signal other than the fact that

- // the chunk is consistent with a set of encodings. So, instead of

- // setting an arbitrary threshold, we have to scan all the encodings

- // consistent with the data.

- const int32_t kThresold = 10;

- for (int i = 0; i < matchesCount; ++i) {

- int32_t confidence = ucsdet_getConfidence(matches[i], &status);

- if (U_FAILURE(status)) {

- status = U_ZERO_ERROR;

- continue;

- }

- if (confidence < kThresold)

- break;

- const char* matchEncoding = ucsdet_getName(matches[i], &status);

- if (U_FAILURE(status)) {

- status = U_ZERO_ERROR;

- continue;

- }

- if (WTF::TextEncoding(matchEncoding) == hintEncoding) {

- encoding = hintEncodingName;

- break;

- }

- // If no match is found so far, just pick the top match.

- // This can happen, say, when a parent frame in EUC-JP refers to

- // a child frame in Shift_JIS and both frames do NOT specify the encoding

- // making us resort to auto-detection (when it IS turned on).

- if (!encoding && matchesCount > 0)

- encoding = ucsdet_getName(matches[0], &status);

- if (U_SUCCESS(status)) {

- *detectedEncoding = WTF::TextEncoding(encoding);

- ucsdet_close(detector);

+ int consumedBytes;

+ bool isReliable;

+ Encoding encoding = CompactEncDet::DetectEncoding(

+ data, length, nullptr, nullptr, nullptr,

+ UNKNOWN_ENCODING,

+ UNKNOWN_LANGUAGE,

+ CompactEncDet::WEB_CORPUS,

+ false, // Include 7-bit encodings

+ &consumedBytes,

+ &isReliable);

+ if (encoding != UNKNOWN_ENCODING) {

+ *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding));

return true;

}

- ucsdet_close(detector);

return false;

}

« third_party/WebKit/Source/platform/BUILD.gn ('K') | « third_party/WebKit/Source/platform/blink_platform.gyp ('k') | no next file » | no next file with comments »