Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
index abda74b2b62b0e43cb7d1f4395e99bd359ce5ec6..616aac0114d74ae3f284b97a740124242c7c7e3e 100644 |
--- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
+++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
@@ -30,8 +30,9 @@ |
#include "platform/text/TextEncodingDetector.h" |
-#include "third_party/ced/src/compact_enc_det/compact_enc_det.h" |
#include "wtf/text/TextEncoding.h" |
+#include <unicode/ucnv.h> |
+#include <unicode/ucsdet.h> |
namespace blink { |
@@ -39,20 +40,75 @@ |
const char* hintEncodingName, WTF::TextEncoding* detectedEncoding) |
{ |
*detectedEncoding = WTF::TextEncoding(); |
- int consumedBytes; |
- bool isReliable; |
- Encoding encoding = CompactEncDet::DetectEncoding( |
- data, length, nullptr, nullptr, nullptr, |
- EncodingNameAliasToEncoding(hintEncodingName), |
- UNKNOWN_LANGUAGE, |
- CompactEncDet::WEB_CORPUS, |
- false, // Include 7-bit encodings |
- &consumedBytes, |
- &isReliable); |
- if (encoding != UNKNOWN_ENCODING) { |
- *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding)); |
+ int matchesCount = 0; |
+ UErrorCode status = U_ZERO_ERROR; |
+ UCharsetDetector* detector = ucsdet_open(&status); |
+ if (U_FAILURE(status)) |
+ return false; |
+ ucsdet_enableInputFilter(detector, true); |
+ ucsdet_setText(detector, data, static_cast<int32_t>(length), &status); |
+ if (U_FAILURE(status)) |
+ return false; |
+ |
+ // FIXME: A few things we can do other than improving |
+ // the ICU detector itself. |
+ // 1. Use ucsdet_detectAll and pick the most likely one given |
+ // "the context" (parent-encoding, referrer encoding, etc). |
+ // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. |
+ // Chinese, Japanese, Russian, Korean and Hebrew) by picking the |
+    // encoding with the highest confidence among the detector-specific |
+ // limited set of candidate encodings. |
+ // Below is a partial implementation of the first part of what's outlined |
+ // above. |
+ const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); |
+ if (U_FAILURE(status)) { |
+ ucsdet_close(detector); |
+ return false; |
+ } |
+ |
+ const char* encoding = 0; |
+ if (hintEncodingName) { |
+ WTF::TextEncoding hintEncoding(hintEncodingName); |
+ // 10 is the minimum confidence value consistent with the codepoint |
+ // allocation in a given encoding. The size of a chunk passed to |
+        // us varies even for the same HTML file (apparently depending on |
+ // the network load). When we're given a rather short chunk, we |
+ // don't have a sufficiently reliable signal other than the fact that |
+ // the chunk is consistent with a set of encodings. So, instead of |
+ // setting an arbitrary threshold, we have to scan all the encodings |
+ // consistent with the data. |
+ const int32_t kThresold = 10; |
+ for (int i = 0; i < matchesCount; ++i) { |
+ int32_t confidence = ucsdet_getConfidence(matches[i], &status); |
+ if (U_FAILURE(status)) { |
+ status = U_ZERO_ERROR; |
+ continue; |
+ } |
+ if (confidence < kThresold) |
+ break; |
+ const char* matchEncoding = ucsdet_getName(matches[i], &status); |
+ if (U_FAILURE(status)) { |
+ status = U_ZERO_ERROR; |
+ continue; |
+ } |
+ if (WTF::TextEncoding(matchEncoding) == hintEncoding) { |
+ encoding = hintEncodingName; |
+ break; |
+ } |
+ } |
+ } |
+ // If no match is found so far, just pick the top match. |
+ // This can happen, say, when a parent frame in EUC-JP refers to |
+    // a child frame in Shift_JIS and both frames do NOT specify the encoding, |
+ // making us resort to auto-detection (when it IS turned on). |
+ if (!encoding && matchesCount > 0) |
+ encoding = ucsdet_getName(matches[0], &status); |
+ if (U_SUCCESS(status)) { |
+ *detectedEncoding = WTF::TextEncoding(encoding); |
+ ucsdet_close(detector); |
return true; |
} |
+ ucsdet_close(detector); |
return false; |
} |