third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp - Issue 1470893002: [Fetch] Always use utf-8 for decoding in text()

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1470893002: [Fetch] Always use utf-8 for decoding in text() (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Remove numerical enum value comparison. Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h ('K') | « third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h ('k') | third_party/WebKit/Source/modules/fetch/FetchDataLoader.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)	2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)

3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.	3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.

4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)	4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)

5	5

6 This library is free software; you can redistribute it and/or	6 This library is free software; you can redistribute it and/or

7 modify it under the terms of the GNU Library General Public	7 modify it under the terms of the GNU Library General Public

8 License as published by the Free Software Foundation; either	8 License as published by the Free Software Foundation; either

9 version 2 of the License, or (at your option) any later version.	9 version 2 of the License, or (at your option) any later version.

10	10

(...skipping 94 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
105 {	105 {

106 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII	106 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII

107 // for text/xml. This matches Firefox.	107 // for text/xml. This matches Firefox.

108 if (contentType == XMLContent)	108 if (contentType == XMLContent)

109 return UTF8Encoding();	109 return UTF8Encoding();

110 if (!specifiedDefaultEncoding.isValid())	110 if (!specifiedDefaultEncoding.isValid())

111 return Latin1Encoding();	111 return Latin1Encoding();

112 return specifiedDefaultEncoding;	112 return specifiedDefaultEncoding;

113 }	113 }

114	114

115 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)	115 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, EncodingDetectionOption encodingDetectionOption)

116 : m_contentType(determineContentType(mimeType))	116 : m_contentType(determineContentType(mimeType))

117 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))	117 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))

118 , m_source(DefaultEncoding)	118 , m_source(DefaultEncoding)

119 , m_hintEncoding(0)	119 , m_hintEncoding(0)

120 , m_checkedForBOM(false)	120 , m_checkedForBOM(false)

121 , m_checkedForCSSCharset(false)	121 , m_checkedForCSSCharset(false)

122 , m_checkedForXMLCharset(false)	122 , m_checkedForXMLCharset(false)

123 , m_checkedForMetaCharset(false)	123 , m_checkedForMetaCharset(false)

124 , m_useLenientXMLDecoding(false)	124 , m_useLenientXMLDecoding(false)

125 , m_sawError(false)	125 , m_sawError(false)

126 , m_usesEncodingDetector(usesEncodingDetector)	126 , m_encodingDetectionOption(encodingDetectionOption)

127 {	127 {

	128 ASSERT(!(m_encodingDetectionOption == AlwaysUseUTF8ForText && (m_contentType != PlainTextContent || m_encoding != UTF8Encoding())));
	kouhei (in TOK) 2015/12/14 02:00:22 Optional nit: Would you split this ASSERT stmt? if Optional nit: Would you split this ASSERT stmt? if (m_encodingDetectionOption == AlwaysUseUTF8ForText) ASSERT(m_contentType != PlainTextContent \|\| m_encoding != UTF8Encoding()); hiroshige 2015/12/14 05:44:02 Done. Show quoted text On 2015/12/14 02:00:22, kouhei wrote: > Optional nit: Would you split this ASSERT stmt? > if (m_encodingDetectionOption == AlwaysUseUTF8ForText) > ASSERT(m_contentType != PlainTextContent \|\| m_encoding != UTF8Encoding()); Done.
128 }	129 }

129	130

130 TextResourceDecoder::~TextResourceDecoder()	131 TextResourceDecoder::~TextResourceDecoder()

131 {	132 {

132 }	133 }

133	134

134 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)	135 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)

135 {	136 {

136 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).	137 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).

137 if (!encoding.isValid())	138 if (!encoding.isValid())

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
203 size_t buf1Len = bufferLength;	204 size_t buf1Len = bufferLength;

204 size_t buf2Len = len;	205 size_t buf2Len = len;

205 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());	206 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());

206 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);	207 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);

207 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	208 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

208 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	209 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

209 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	210 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

210 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;	211 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

211	212

212 // Check for the BOM.	213 // Check for the BOM.

213 if (c1 == 0xFF && c2 == 0xFE) {	214 if (m_encodingDetectionOption != AlwaysUseUTF8ForText && c1 == 0xFF && c2 == 0xFE) {
	kouhei (in TOK) 2015/12/14 02:00:22 Can we reorder the if stmts here? if (c1 == 0xEF Can we reorder the if stmts here? if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { } else if (m_encodingDetectionOption != AlwaysUseUTF8ForText) { if (c1 == 0xFF && c2 == 0xFE) { } else if (c1 == 0xFE && c2 == 0xFF) { } else if (...) { } } hiroshige 2015/12/14 05:44:02 Done. Show quoted text On 2015/12/14 02:00:22, kouhei wrote: > Can we reorder the if stmts here? > > if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { > > } else if (m_encodingDetectionOption != AlwaysUseUTF8ForText) { > if (c1 == 0xFF && c2 == 0xFE) { > > } else if (c1 == 0xFE && c2 == 0xFF) { > > } else if (...) { > > } > } Done.
214 if (c3 || c4) {	215 if (c3 || c4) {

215 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);	216 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

216 lengthOfBOM = 2;	217 lengthOfBOM = 2;

217 } else {	218 } else {

218 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);	219 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

219 lengthOfBOM = 4;	220 lengthOfBOM = 4;

220 }	221 }

221 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {	222 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {

222 setEncoding(UTF8Encoding(), AutoDetectedEncoding);	223 setEncoding(UTF8Encoding(), AutoDetectedEncoding);

223 lengthOfBOM = 3;	224 lengthOfBOM = 3;

224 } else if (c1 == 0xFE && c2 == 0xFF) {	225 } else if (m_encodingDetectionOption != AlwaysUseUTF8ForText && c1 == 0xFE && c2 == 0xFF) {

225 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);	226 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

226 lengthOfBOM = 2;	227 lengthOfBOM = 2;

227 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {	228 } else if (m_encodingDetectionOption != AlwaysUseUTF8ForText && !c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {

228 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);	229 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

229 lengthOfBOM = 4;	230 lengthOfBOM = 4;

230 }	231 }

231	232

232 if (lengthOfBOM || bufferLength + len >= 4)	233 if (lengthOfBOM || bufferLength + len >= 4)

233 m_checkedForBOM = true;	234 m_checkedForBOM = true;

234	235

235 return lengthOfBOM;	236 return lengthOfBOM;

236 }	237 }

237	238

(...skipping 115 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
353 // 2. Encoding detector is turned ON and the encoding is set to	354 // 2. Encoding detector is turned ON and the encoding is set to

354 // the encoding of the parent frame, which is also auto-detected.	355 // the encoding of the parent frame, which is also auto-detected.

355 // Note that condition #2 is NOT satisfied unless parent-child frame	356 // Note that condition #2 is NOT satisfied unless parent-child frame

356 // relationship is compliant to the same-origin policy. If they're from	357 // relationship is compliant to the same-origin policy. If they're from

357 // different domains, |m_source| would not be set to EncodingFromParentFrame	358 // different domains, |m_source| would not be set to EncodingFromParentFrame

358 // in the first place.	359 // in the first place.

359 bool TextResourceDecoder::shouldAutoDetect() const	360 bool TextResourceDecoder::shouldAutoDetect() const

360 {	361 {

361 // Just checking m_hintEncoding suffices here because it's only set	362 // Just checking m_hintEncoding suffices here because it's only set

362 // in setHintEncoding when the source is AutoDetectedEncoding.	363 // in setHintEncoding when the source is AutoDetectedEncoding.

363 return m_usesEncodingDetector	364 return m_encodingDetectionOption == UseAllAutoDetection

364 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));	365 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));

365 }	366 }

366	367

367 String TextResourceDecoder::decode(const char* data, size_t len)	368 String TextResourceDecoder::decode(const char* data, size_t len)

368 {	369 {

369 size_t lengthOfBOM = 0;	370 size_t lengthOfBOM = 0;

370 if (!m_checkedForBOM)	371 if (!m_checkedForBOM)

371 lengthOfBOM = checkForBOM(data, len);	372 lengthOfBOM = checkForBOM(data, len);

372	373

373 bool movedDataToBuffer = false;	374 bool movedDataToBuffer = false;

(...skipping 59 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
433 m_codec = newTextCodec(m_encoding);	434 m_codec = newTextCodec(m_encoding);

434	435

435 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);	436 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

436 m_buffer.clear();	437 m_buffer.clear();

437 m_codec.clear();	438 m_codec.clear();

438 m_checkedForBOM = false; // Skip BOM again when re-decoding.	439 m_checkedForBOM = false; // Skip BOM again when re-decoding.

439 return result;	440 return result;

440 }	441 }

441	442

442 }	443 }

OLD	NEW