Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(423)

Unified Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: left out test files that should be landed manually Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp
diff --git a/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp b/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp
index 49897d7c3ac409bd860dbaf44bbc4786a39ad3b4..7705bf88d8fce6e87c390645a17716a195330662 100644
--- a/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp
+++ b/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp
@@ -28,6 +28,7 @@
#include "wtf/StringExtras.h"
#include "wtf/text/TextCodec.h"
#include "wtf/text/TextEncodingRegistry.h"
+#include "wtf/text/UTF8.h"
using namespace WTF;
@@ -349,23 +350,6 @@ void TextResourceDecoder::checkForMetaCharset(const char* data, size_t length)
return;
}
-// We use the encoding detector in two cases:
-// 1. Encoding detector is turned ON and no other encoding source is
-// available (that is, it's DefaultEncoding).
-// 2. Encoding detector is turned ON and the encoding is set to
-// the encoding of the parent frame, which is also auto-detected.
-// Note that condition #2 is NOT satisfied unless parent-child frame
-// relationship is compliant to the same-origin policy. If they're from
-// different domains, |m_source| would not be set to EncodingFromParentFrame
-// in the first place.
-bool TextResourceDecoder::shouldAutoDetect() const
-{
- // Just checking m_hintEncoding suffices here because it's only set
- // in setHintEncoding when the source is AutoDetectedEncoding.
- return m_encodingDetectionOption == UseAllAutoDetection
- && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
-}
-
String TextResourceDecoder::decode(const char* data, size_t len)
{
size_t lengthOfBOM = 0;
@@ -402,11 +386,7 @@ String TextResourceDecoder::decode(const char* data, size_t len)
if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
checkForMetaCharset(dataForDecode, lengthForDecode);
- if (shouldAutoDetect()) {
- WTF::TextEncoding detectedEncoding;
- if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
- setEncoding(detectedEncoding, EncodingFromContentSniffing);
- }
+ detectTextEncoding(data, len);
ASSERT(m_encoding.isValid());
@@ -419,16 +399,46 @@ String TextResourceDecoder::decode(const char* data, size_t len)
return result;
}
+// We run encoding detection in the following cases:
+// 1. No other encoding source is available (that is, it's DefaultEncoding).
+// 2. The encoding is set to the encoding of the parent frame, which is
+// also auto-detected.
+// In both cases we first check for UTF-8; the universal detector is used
+// only if it is turned ON (UseAllAutoDetection).
+// Note that condition #2 is NOT satisfied unless the parent-child frame
+// relationship complies with the same-origin policy. If they're from
+// different domains, |m_source| would not be set to EncodingFromParentFrame
+// in the first place.
+void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)
+{
+ if (!shouldDetectEncoding())
+ return;
+
+ if (WTF::Unicode::isUTF8andNotASCII(data, len)) {
+ setEncoding(UTF8Encoding(), EncodingFromContentSniffing);
+ return;
+ }
+ if (m_encodingDetectionOption == UseAllAutoDetection) {
+ WTF::TextEncoding detectedEncoding;
+ if (detectTextEncodingUniversal(data, len, m_hintEncoding, &detectedEncoding))
+ setEncoding(detectedEncoding, EncodingFromContentSniffing);
+ }
+}
+
+bool TextResourceDecoder::shouldDetectEncoding() const
+{
+ // Just checking m_hintEncoding suffices here because it's only set
+ // in setHintEncoding when the source is AutoDetectedEncoding.
+ return m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding);
+}
+
String TextResourceDecoder::flush()
{
// If we cannot identify the encoding even after a document is completely
// loaded, we need to detect the encoding if the other conditions for
// autodetection are satisfied.
- if (m_buffer.size() && shouldAutoDetect()
+ if (m_buffer.size()
&& ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
- WTF::TextEncoding detectedEncoding;
- if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
- setEncoding(detectedEncoding, EncodingFromContentSniffing);
+ detectTextEncoding(m_buffer.data(), m_buffer.size());
}
if (!m_codec)

Powered by Google App Engine
This is Rietveld 408576698