third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp - Issue 1888083002: Revert of UTF-8 detector for pages missing encoding info

Unified Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1888083002: Revert of UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h ('k') | third_party/WebKit/Source/core/xmlhttprequest/XMLHttpRequest.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

diff --git a/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp b/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

index 7705bf88d8fce6e87c390645a17716a195330662..49897d7c3ac409bd860dbaf44bbc4786a39ad3b4 100644

--- a/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

+++ b/third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

@@ -28,7 +28,6 @@

#include "wtf/StringExtras.h"

#include "wtf/text/TextCodec.h"

#include "wtf/text/TextEncodingRegistry.h"

-#include "wtf/text/UTF8.h"

using namespace WTF;

@@ -350,56 +349,7 @@

return;

}

-String TextResourceDecoder::decode(const char* data, size_t len)

- size_t lengthOfBOM = 0;

- if (!m_checkedForBOM)

- lengthOfBOM = checkForBOM(data, len);

- bool movedDataToBuffer = false;

- if (m_contentType == CSSContent && !m_checkedForCSSCharset) {

- if (!checkForCSSCharset(data, len, movedDataToBuffer))

- return emptyString();

- }

- // We check XML declaration in HTML content only if there is enough data available

- if (((m_contentType == HTMLContent && len >= minimumLengthOfXMLDeclaration) || m_contentType == XMLContent) && !m_checkedForXMLCharset) {

- if (!checkForXMLCharset(data, len, movedDataToBuffer))

- return emptyString();

- }

- const char* dataForDecode = data + lengthOfBOM;

- size_t lengthForDecode = len - lengthOfBOM;

- if (!m_buffer.isEmpty()) {

- if (!movedDataToBuffer) {

- size_t oldSize = m_buffer.size();

- m_buffer.grow(oldSize + len);

- memcpy(m_buffer.data() + oldSize, data, len);

- }

- dataForDecode = m_buffer.data() + lengthOfBOM;

- lengthForDecode = m_buffer.size() - lengthOfBOM;

- }

- if (m_contentType == HTMLContent && !m_checkedForMetaCharset)

- checkForMetaCharset(dataForDecode, lengthForDecode);

- detectTextEncoding(data, len);

- ASSERT(m_encoding.isValid());

- if (!m_codec)

- m_codec = newTextCodec(m_encoding);

- String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

- m_buffer.clear();

- return result;

-// We use the encoding detector in following cases:

+// We use the encoding detector in two cases:

// 1. Encoding detector is turned ON and no other encoding source is

// available (that is, it's DefaultEncoding).

// 2. Encoding detector is turned ON and the encoding is set to

@@ -408,27 +358,65 @@

// relationship is compliant with the same-origin policy. If they're from

// different domains, |m_source| would not be set to EncodingFromParentFrame

// in the first place.

-void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)

- if (!shouldDetectEncoding())

- return;

- if (WTF::Unicode::isUTF8andNotASCII(data, len)) {

- setEncoding(UTF8Encoding(), EncodingFromContentSniffing);

- return;

- }

- if (m_encodingDetectionOption == UseAllAutoDetection) {

- WTF::TextEncoding detectedEncoding;

- if (detectTextEncodingUniversal(data, len, m_hintEncoding, &detectedEncoding))

- setEncoding(detectedEncoding, EncodingFromContentSniffing);

- }

-bool TextResourceDecoder::shouldDetectEncoding() const

+bool TextResourceDecoder::shouldAutoDetect() const

{

// Just checking m_hintEncoding suffices here because it's only set

// in setHintEncoding when the source is AutoDetectedEncoding.

- return m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding);

+ return m_encodingDetectionOption == UseAllAutoDetection

+ && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));

+String TextResourceDecoder::decode(const char* data, size_t len)

+ size_t lengthOfBOM = 0;

+ if (!m_checkedForBOM)

+ lengthOfBOM = checkForBOM(data, len);

+ bool movedDataToBuffer = false;

+ if (m_contentType == CSSContent && !m_checkedForCSSCharset) {

+ if (!checkForCSSCharset(data, len, movedDataToBuffer))

+ return emptyString();

+ }

+ // We check XML declaration in HTML content only if there is enough data available

+ if (((m_contentType == HTMLContent && len >= minimumLengthOfXMLDeclaration) || m_contentType == XMLContent) && !m_checkedForXMLCharset) {

+ if (!checkForXMLCharset(data, len, movedDataToBuffer))

+ return emptyString();

+ }

+ const char* dataForDecode = data + lengthOfBOM;

+ size_t lengthForDecode = len - lengthOfBOM;

+ if (!m_buffer.isEmpty()) {

+ if (!movedDataToBuffer) {

+ size_t oldSize = m_buffer.size();

+ m_buffer.grow(oldSize + len);

+ memcpy(m_buffer.data() + oldSize, data, len);

+ }

+ dataForDecode = m_buffer.data() + lengthOfBOM;

+ lengthForDecode = m_buffer.size() - lengthOfBOM;

+ }

+ if (m_contentType == HTMLContent && !m_checkedForMetaCharset)

+ checkForMetaCharset(dataForDecode, lengthForDecode);

+ if (shouldAutoDetect()) {

+ WTF::TextEncoding detectedEncoding;

+ if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))

+ setEncoding(detectedEncoding, EncodingFromContentSniffing);

+ }

+ ASSERT(m_encoding.isValid());

+ if (!m_codec)

+ m_codec = newTextCodec(m_encoding);

+ String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

+ m_buffer.clear();

+ return result;

}

String TextResourceDecoder::flush()

@@ -436,9 +424,11 @@

// If we cannot identify the encoding even after a document is completely

// loaded, we need to detect the encoding if other conditions for

// autodetection are satisfied.

- if (m_buffer.size()

+ if (m_buffer.size() && shouldAutoDetect()

&& ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {

- detectTextEncoding(m_buffer.data(), m_buffer.size());

+ WTF::TextEncoding detectedEncoding;

+ if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))

+ setEncoding(detectedEncoding, EncodingFromContentSniffing);

}

if (!m_codec)