Chromium Code Reviews| Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
| diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
| index 616aac0114d74ae3f284b97a740124242c7c7e3e..1f5dd83d53cc86eeeb8d9fe270af736daa960920 100644 |
| --- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
| +++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp |
| @@ -31,8 +31,7 @@ |
| #include "platform/text/TextEncodingDetector.h" |
| #include "wtf/text/TextEncoding.h" |
| -#include <unicode/ucnv.h> |
| -#include <unicode/ucsdet.h> |
| +#include <compact_enc_det/compact_enc_det.h> |
|
tkent
2016/06/22 23:36:40
Please don't use |#include <>| for non-system head
Jinsuk Kim
2016/06/23 00:45:57
Done. Just curious - I was following what was done
tkent
2016/06/23 03:30:39
It's legacy code inherited from WebKit. We shou
|
| namespace blink { |
| @@ -40,75 +39,20 @@ bool detectTextEncoding(const char* data, size_t length, |
| const char* hintEncodingName, WTF::TextEncoding* detectedEncoding) |
| { |
| *detectedEncoding = WTF::TextEncoding(); |
| - int matchesCount = 0; |
| - UErrorCode status = U_ZERO_ERROR; |
| - UCharsetDetector* detector = ucsdet_open(&status); |
| - if (U_FAILURE(status)) |
| - return false; |
| - ucsdet_enableInputFilter(detector, true); |
| - ucsdet_setText(detector, data, static_cast<int32_t>(length), &status); |
| - if (U_FAILURE(status)) |
| - return false; |
| - |
| - // FIXME: A few things we can do other than improving |
| - // the ICU detector itself. |
| - // 1. Use ucsdet_detectAll and pick the most likely one given |
| - // "the context" (parent-encoding, referrer encoding, etc). |
| - // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. |
| - // Chinese, Japanese, Russian, Korean and Hebrew) by picking the |
| - // encoding with a highest confidence among the detector-specific |
| - // limited set of candidate encodings. |
| - // Below is a partial implementation of the first part of what's outlined |
| - // above. |
| - const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); |
| - if (U_FAILURE(status)) { |
| - ucsdet_close(detector); |
| - return false; |
| - } |
| - |
| - const char* encoding = 0; |
| - if (hintEncodingName) { |
| - WTF::TextEncoding hintEncoding(hintEncodingName); |
| - // 10 is the minimum confidence value consistent with the codepoint |
| - // allocation in a given encoding. The size of a chunk passed to |
| - // us varies even for the same html file (apparently depending on |
| - // the network load). When we're given a rather short chunk, we |
| - // don't have a sufficiently reliable signal other than the fact that |
| - // the chunk is consistent with a set of encodings. So, instead of |
| - // setting an arbitrary threshold, we have to scan all the encodings |
| - // consistent with the data. |
| - const int32_t kThresold = 10; |
| - for (int i = 0; i < matchesCount; ++i) { |
| - int32_t confidence = ucsdet_getConfidence(matches[i], &status); |
| - if (U_FAILURE(status)) { |
| - status = U_ZERO_ERROR; |
| - continue; |
| - } |
| - if (confidence < kThresold) |
| - break; |
| - const char* matchEncoding = ucsdet_getName(matches[i], &status); |
| - if (U_FAILURE(status)) { |
| - status = U_ZERO_ERROR; |
| - continue; |
| - } |
| - if (WTF::TextEncoding(matchEncoding) == hintEncoding) { |
| - encoding = hintEncodingName; |
| - break; |
| - } |
| - } |
| - } |
| - // If no match is found so far, just pick the top match. |
| - // This can happen, say, when a parent frame in EUC-JP refers to |
| - // a child frame in Shift_JIS and both frames do NOT specify the encoding |
| - // making us resort to auto-detection (when it IS turned on). |
| - if (!encoding && matchesCount > 0) |
| - encoding = ucsdet_getName(matches[0], &status); |
| - if (U_SUCCESS(status)) { |
| - *detectedEncoding = WTF::TextEncoding(encoding); |
| - ucsdet_close(detector); |
| + int consumedBytes; |
| + bool isReliable; |
| + Encoding encoding = CompactEncDet::DetectEncoding( |
| + data, length, nullptr, nullptr, nullptr, |
| + UNKNOWN_ENCODING, |
| + UNKNOWN_LANGUAGE, |
| + CompactEncDet::WEB_CORPUS, |
| + false, // ignore_7bit_mail_encodings=false: keep 7-bit encodings as candidates |
| + &consumedBytes, |
| + &isReliable); |
| + if (encoding != UNKNOWN_ENCODING) { |
| + *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding)); |
| return true; |
| } |
| - ucsdet_close(detector); |
| return false; |
| } |