Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1202)

Unified Diff: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

Issue 2110833005: Revert of Reland "Replace ICU with CED for auto encoding detection" (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/WebKit/Source/platform/blink_platform.gyp ('k') | no next file » | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
index abda74b2b62b0e43cb7d1f4395e99bd359ce5ec6..616aac0114d74ae3f284b97a740124242c7c7e3e 100644
--- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
+++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
@@ -30,8 +30,9 @@
#include "platform/text/TextEncodingDetector.h"
-#include "third_party/ced/src/compact_enc_det/compact_enc_det.h"
#include "wtf/text/TextEncoding.h"
+#include <unicode/ucnv.h>
+#include <unicode/ucsdet.h>
namespace blink {
@@ -39,20 +40,75 @@
const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)
{
*detectedEncoding = WTF::TextEncoding();
- int consumedBytes;
- bool isReliable;
- Encoding encoding = CompactEncDet::DetectEncoding(
- data, length, nullptr, nullptr, nullptr,
- EncodingNameAliasToEncoding(hintEncodingName),
- UNKNOWN_LANGUAGE,
- CompactEncDet::WEB_CORPUS,
- false, // Include 7-bit encodings
- &consumedBytes,
- &isReliable);
- if (encoding != UNKNOWN_ENCODING) {
- *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding));
+ int matchesCount = 0;
+ UErrorCode status = U_ZERO_ERROR;
+ UCharsetDetector* detector = ucsdet_open(&status);
+ if (U_FAILURE(status))
+ return false;
+ ucsdet_enableInputFilter(detector, true);
+ ucsdet_setText(detector, data, static_cast<int32_t>(length), &status);
+ if (U_FAILURE(status))
+ return false;
+
+ // FIXME: A few things we can do other than improving
+ // the ICU detector itself.
+ // 1. Use ucsdet_detectAll and pick the most likely one given
+ // "the context" (parent-encoding, referrer encoding, etc).
+ // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
+ // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
+ // encoding with the highest confidence among the detector-specific
+ // limited set of candidate encodings.
+ // Below is a partial implementation of the first part of what's outlined
+ // above.
+ const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
+ if (U_FAILURE(status)) {
+ ucsdet_close(detector);
+ return false;
+ }
+
+ const char* encoding = 0;
+ if (hintEncodingName) {
+ WTF::TextEncoding hintEncoding(hintEncodingName);
+ // 10 is the minimum confidence value consistent with the codepoint
+ // allocation in a given encoding. The size of a chunk passed to
+ // us varies even for the same html file (apparently depending on
+ // the network load). When we're given a rather short chunk, we
+ // don't have a sufficiently reliable signal other than the fact that
+ // the chunk is consistent with a set of encodings. So, instead of
+ // setting an arbitrary threshold, we have to scan all the encodings
+ // consistent with the data.
+ const int32_t kThresold = 10;
+ for (int i = 0; i < matchesCount; ++i) {
+ int32_t confidence = ucsdet_getConfidence(matches[i], &status);
+ if (U_FAILURE(status)) {
+ status = U_ZERO_ERROR;
+ continue;
+ }
+ if (confidence < kThresold)
+ break;
+ const char* matchEncoding = ucsdet_getName(matches[i], &status);
+ if (U_FAILURE(status)) {
+ status = U_ZERO_ERROR;
+ continue;
+ }
+ if (WTF::TextEncoding(matchEncoding) == hintEncoding) {
+ encoding = hintEncodingName;
+ break;
+ }
+ }
+ }
+ // If no match is found so far, just pick the top match.
+ // This can happen, say, when a parent frame in EUC-JP refers to
+ // a child frame in Shift_JIS and both frames do NOT specify the encoding,
+ // making us resort to auto-detection (when it IS turned on).
+ if (!encoding && matchesCount > 0)
+ encoding = ucsdet_getName(matches[0], &status);
+ if (U_SUCCESS(status)) {
+ *detectedEncoding = WTF::TextEncoding(encoding);
+ ucsdet_close(detector);
return true;
}
+ ucsdet_close(detector);
return false;
}
« no previous file with comments | « third_party/WebKit/Source/platform/blink_platform.gyp ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698