Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
index 616aac0114d74ae3f284b97a740124242c7c7e3e..abda74b2b62b0e43cb7d1f4395e99bd359ce5ec6 100644 |
--- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
+++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
@@ -30,9 +30,8 @@ |
#include "platform/text/TextEncodingDetector.h" |
+#include "third_party/ced/src/compact_enc_det/compact_enc_det.h" |
#include "wtf/text/TextEncoding.h" |
-#include <unicode/ucnv.h> |
-#include <unicode/ucsdet.h> |
namespace blink { |
@@ -40,75 +39,20 @@ bool detectTextEncoding(const char* data, size_t length, |
const char* hintEncodingName, WTF::TextEncoding* detectedEncoding) |
{ |
*detectedEncoding = WTF::TextEncoding(); |
- int matchesCount = 0; |
- UErrorCode status = U_ZERO_ERROR; |
- UCharsetDetector* detector = ucsdet_open(&status); |
- if (U_FAILURE(status)) |
- return false; |
- ucsdet_enableInputFilter(detector, true); |
- ucsdet_setText(detector, data, static_cast<int32_t>(length), &status); |
- if (U_FAILURE(status)) |
- return false; |
- |
- // FIXME: A few things we can do other than improving |
- // the ICU detector itself. |
- // 1. Use ucsdet_detectAll and pick the most likely one given |
- // "the context" (parent-encoding, referrer encoding, etc). |
- // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. |
- // Chinese, Japanese, Russian, Korean and Hebrew) by picking the |
- // encoding with a highest confidence among the detector-specific |
- // limited set of candidate encodings. |
- // Below is a partial implementation of the first part of what's outlined |
- // above. |
- const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); |
- if (U_FAILURE(status)) { |
- ucsdet_close(detector); |
- return false; |
- } |
- |
- const char* encoding = 0; |
- if (hintEncodingName) { |
- WTF::TextEncoding hintEncoding(hintEncodingName); |
- // 10 is the minimum confidence value consistent with the codepoint |
- // allocation in a given encoding. The size of a chunk passed to |
- // us varies even for the same html file (apparently depending on |
- // the network load). When we're given a rather short chunk, we |
- // don't have a sufficiently reliable signal other than the fact that |
- // the chunk is consistent with a set of encodings. So, instead of |
- // setting an arbitrary threshold, we have to scan all the encodings |
- // consistent with the data. |
- const int32_t kThresold = 10; |
- for (int i = 0; i < matchesCount; ++i) { |
- int32_t confidence = ucsdet_getConfidence(matches[i], &status); |
- if (U_FAILURE(status)) { |
- status = U_ZERO_ERROR; |
- continue; |
- } |
- if (confidence < kThresold) |
- break; |
- const char* matchEncoding = ucsdet_getName(matches[i], &status); |
- if (U_FAILURE(status)) { |
- status = U_ZERO_ERROR; |
- continue; |
- } |
- if (WTF::TextEncoding(matchEncoding) == hintEncoding) { |
- encoding = hintEncodingName; |
- break; |
- } |
- } |
- } |
- // If no match is found so far, just pick the top match. |
- // This can happen, say, when a parent frame in EUC-JP refers to |
- // a child frame in Shift_JIS and both frames do NOT specify the encoding |
- // making us resort to auto-detection (when it IS turned on). |
- if (!encoding && matchesCount > 0) |
- encoding = ucsdet_getName(matches[0], &status); |
- if (U_SUCCESS(status)) { |
- *detectedEncoding = WTF::TextEncoding(encoding); |
- ucsdet_close(detector); |
+ int consumedBytes; |
+ bool isReliable; |
+ Encoding encoding = CompactEncDet::DetectEncoding( |
+ data, length, nullptr, nullptr, nullptr, |
+ EncodingNameAliasToEncoding(hintEncodingName), |
+ UNKNOWN_LANGUAGE, |
+ CompactEncDet::WEB_CORPUS, |
+ false, // ignore_7bit_mail_encodings: false keeps 7-bit encodings as detection candidates |
jungshik at Google
2016/07/19 21:18:30
The only 7-bit encoding Blink supports is ISO-2022
|
+ &consumedBytes, |
+ &isReliable); |
+ if (encoding != UNKNOWN_ENCODING) { |
+ *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding)); |
jungshik at Google
2016/07/19 21:18:30
MimeEncodingName uses kEncodingTable ( https://cs.
|
return true; |
} |
- ucsdet_close(detector); |
return false; |
} |