Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(916)

Unified Diff: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

Issue 1956183002: CL for perf tryjob on linux (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/WebKit/Source/platform/BUILD.gn ('k') | third_party/cld/BUILD.gn » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
diff --git a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
index 476d992a616816ac845d56c88d27756c26438585..d9f09ca967bf15d8191e1a8044ace8cd23c26332 100644
--- a/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
+++ b/third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp
@@ -31,8 +31,8 @@
#include "platform/text/TextEncodingDetector.h"
#include "wtf/text/TextEncoding.h"
-#include <unicode/ucnv.h>
-#include <unicode/ucsdet.h>
+#include "encodings/compact_enc_det/compact_enc_det.h"
+#include "encodings/public/encodings.h"
namespace blink {
@@ -40,75 +40,21 @@ bool detectTextEncodingUniversal(const char* data, size_t length,
const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)
{
*detectedEncoding = WTF::TextEncoding();
- int matchesCount = 0;
- UErrorCode status = U_ZERO_ERROR;
- UCharsetDetector* detector = ucsdet_open(&status);
- if (U_FAILURE(status))
- return false;
- ucsdet_enableInputFilter(detector, true);
- ucsdet_setText(detector, data, static_cast<int32_t>(length), &status);
- if (U_FAILURE(status))
- return false;
-
- // FIXME: A few things we can do other than improving
- // the ICU detector itself.
- // 1. Use ucsdet_detectAll and pick the most likely one given
- // "the context" (parent-encoding, referrer encoding, etc).
- // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g.
- // Chinese, Japanese, Russian, Korean and Hebrew) by picking the
- // encoding with the highest confidence among the detector-specific
- // limited set of candidate encodings.
- // Below is a partial implementation of the first part of what's outlined
- // above.
- const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status);
- if (U_FAILURE(status)) {
- ucsdet_close(detector);
- return false;
- }
-
- const char* encoding = 0;
- if (hintEncodingName) {
- WTF::TextEncoding hintEncoding(hintEncodingName);
- // 10 is the minimum confidence value consistent with the codepoint
- // allocation in a given encoding. The size of a chunk passed to
- // us varies even for the same html file (apparently depending on
- // the network load). When we're given a rather short chunk, we
- // don't have a sufficiently reliable signal other than the fact that
- // the chunk is consistent with a set of encodings. So, instead of
- // setting an arbitrary threshold, we have to scan all the encodings
- // consistent with the data.
- const int32_t kThresold = 10;
- for (int i = 0; i < matchesCount; ++i) {
- int32_t confidence = ucsdet_getConfidence(matches[i], &status);
- if (U_FAILURE(status)) {
- status = U_ZERO_ERROR;
- continue;
- }
- if (confidence < kThresold)
- break;
- const char* matchEncoding = ucsdet_getName(matches[i], &status);
- if (U_FAILURE(status)) {
- status = U_ZERO_ERROR;
- continue;
- }
- if (WTF::TextEncoding(matchEncoding) == hintEncoding) {
- encoding = hintEncodingName;
- break;
- }
- }
- }
- // If no match is found so far, just pick the top match.
- // This can happen, say, when a parent frame in EUC-JP refers to
- // a child frame in Shift_JIS and both frames do NOT specify the encoding
- // making us resort to auto-detection (when it IS turned on).
- if (!encoding && matchesCount > 0)
- encoding = ucsdet_getName(matches[0], &status);
- if (U_SUCCESS(status)) {
- *detectedEncoding = WTF::TextEncoding(encoding);
- ucsdet_close(detector);
+ int bytes_consumed;
+ bool is_reliable;
+ Encoding encoding = CompactEncDet::DetectEncoding(
+ data, length, NULL, NULL, NULL,
+ UNKNOWN_ENCODING,
+ UNKNOWN_LANGUAGE,
+ CompactEncDet::WEB_CORPUS,
+ false, // Include 7-bit encodings
+ &bytes_consumed,
+ &is_reliable);
+ if (encoding != UNKNOWN_ENCODING) {
+ *detectedEncoding = WTF::TextEncoding(MimeEncodingName(encoding));
+ LOG(ERROR) << "Auto-detected encoding: " << encoding;
return true;
}
- ucsdet_close(detector);
return false;
}
« no previous file with comments | « third_party/WebKit/Source/platform/BUILD.gn ('k') | third_party/cld/BUILD.gn » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698