third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp - Issue 1470893002: [Fetch] Always use utf-8 for decoding in text()

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1470893002: [Fetch] Always use utf-8 for decoding in text() (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Reflect comments. Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h ('k') | third_party/WebKit/Source/modules/fetch/FetchDataLoader.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)	2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)

3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.	3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.

4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)	4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)

5	5

6 This library is free software; you can redistribute it and/or	6 This library is free software; you can redistribute it and/or

7 modify it under the terms of the GNU Library General Public	7 modify it under the terms of the GNU Library General Public

8 License as published by the Free Software Foundation; either	8 License as published by the Free Software Foundation; either

9 version 2 of the License, or (at your option) any later version.	9 version 2 of the License, or (at your option) any later version.

10	10

(...skipping 93 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
104 {	104 {

105 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII	105 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII

106 // for text/xml. This matches Firefox.	106 // for text/xml. This matches Firefox.

107 if (contentType == XMLContent)	107 if (contentType == XMLContent)

108 return UTF8Encoding();	108 return UTF8Encoding();

109 if (!specifiedDefaultEncoding.isValid())	109 if (!specifiedDefaultEncoding.isValid())

110 return Latin1Encoding();	110 return Latin1Encoding();

111 return specifiedDefaultEncoding;	111 return specifiedDefaultEncoding;

112 }	112 }

113	113

114 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)	114 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, EncodingDetectionOption encodingDetectionOption)

115 : m_contentType(determineContentType(mimeType))	115 : m_contentType(determineContentType(mimeType))

116 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))	116 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))

117 , m_source(DefaultEncoding)	117 , m_source(DefaultEncoding)

118 , m_hintEncoding(0)	118 , m_hintEncoding(0)

119 , m_checkedForBOM(false)	119 , m_checkedForBOM(false)

120 , m_checkedForCSSCharset(false)	120 , m_checkedForCSSCharset(false)

121 , m_checkedForXMLCharset(false)	121 , m_checkedForXMLCharset(false)

122 , m_checkedForMetaCharset(false)	122 , m_checkedForMetaCharset(false)

123 , m_useLenientXMLDecoding(false)	123 , m_useLenientXMLDecoding(false)

124 , m_sawError(false)	124 , m_sawError(false)

125 , m_usesEncodingDetector(usesEncodingDetector)	125 , m_encodingDetectionOption(encodingDetectionOption)

126 {	126 {

	127 if (m_encodingDetectionOption == AlwaysUseUTF8ForText)

	128 ASSERT(m_contentType == PlainTextContent && m_encoding == UTF8Encoding());

127 }	129 }

128	130

129 TextResourceDecoder::~TextResourceDecoder()	131 TextResourceDecoder::~TextResourceDecoder()

130 {	132 {

131 }	133 }

132	134

133 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)	135 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)

134 {	136 {

135 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).	137 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).

136 if (!encoding.isValid())	138 if (!encoding.isValid())

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
202 size_t buf1Len = bufferLength;	204 size_t buf1Len = bufferLength;

203 size_t buf2Len = len;	205 size_t buf2Len = len;

204 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());	206 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());

205 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);	207 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);

206 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	208 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

207 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	209 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

208 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	210 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

209 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;	211 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

210	212

211 // Check for the BOM.	213 // Check for the BOM.

212 if (c1 == 0xFF && c2 == 0xFE) {	214 if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {

213 if (c3 || c4) {	215 setEncoding(UTF8Encoding(), AutoDetectedEncoding);

214 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);	216 lengthOfBOM = 3;

	217 } else if (m_encodingDetectionOption != AlwaysUseUTF8ForText) {

	218 if (c1 == 0xFF && c2 == 0xFE) {

	219 if (c3 || c4) {

	220 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

	221 lengthOfBOM = 2;

	222 } else {

	223 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

	224 lengthOfBOM = 4;

	225 }

	226 } else if (c1 == 0xFE && c2 == 0xFF) {

	227 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

215 lengthOfBOM = 2;	228 lengthOfBOM = 2;

216 } else {	229 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {

217 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);	230 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

218 lengthOfBOM = 4;	231 lengthOfBOM = 4;

219 }	232 }

220 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {

221 setEncoding(UTF8Encoding(), AutoDetectedEncoding);

222 lengthOfBOM = 3;

223 } else if (c1 == 0xFE && c2 == 0xFF) {

224 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

225 lengthOfBOM = 2;

226 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {

227 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

228 lengthOfBOM = 4;

229 }	233 }

230	234

231 if (lengthOfBOM || bufferLength + len >= 4)	235 if (lengthOfBOM || bufferLength + len >= 4)

232 m_checkedForBOM = true;	236 m_checkedForBOM = true;

233	237

234 return lengthOfBOM;	238 return lengthOfBOM;

235 }	239 }

236	240

237 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)	241 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)

238 {	242 {

(...skipping 113 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
352 // 2. Encoding detector is turned ON and the encoding is set to	356 // 2. Encoding detector is turned ON and the encoding is set to

353 // the encoding of the parent frame, which is also auto-detected.	357 // the encoding of the parent frame, which is also auto-detected.

354 // Note that condition #2 is NOT satisfied unless parent-child frame	358 // Note that condition #2 is NOT satisfied unless parent-child frame

355 // relationship is compliant to the same-origin policy. If they're from	359 // relationship is compliant to the same-origin policy. If they're from

356 // different domains, |m_source| would not be set to EncodingFromParentFrame	360 // different domains, |m_source| would not be set to EncodingFromParentFrame

357 // in the first place.	361 // in the first place.

358 bool TextResourceDecoder::shouldAutoDetect() const	362 bool TextResourceDecoder::shouldAutoDetect() const

359 {	363 {

360 // Just checking m_hintEncoding suffices here because it's only set	364 // Just checking m_hintEncoding suffices here because it's only set

361 // in setHintEncoding when the source is AutoDetectedEncoding.	365 // in setHintEncoding when the source is AutoDetectedEncoding.

362 return m_usesEncodingDetector	366 return m_encodingDetectionOption == UseAllAutoDetection

363 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));	367 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));

364 }	368 }

365	369

366 String TextResourceDecoder::decode(const char* data, size_t len)	370 String TextResourceDecoder::decode(const char* data, size_t len)

367 {	371 {

368 size_t lengthOfBOM = 0;	372 size_t lengthOfBOM = 0;

369 if (!m_checkedForBOM)	373 if (!m_checkedForBOM)

370 lengthOfBOM = checkForBOM(data, len);	374 lengthOfBOM = checkForBOM(data, len);

371	375

372 bool movedDataToBuffer = false;	376 bool movedDataToBuffer = false;

(...skipping 65 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
438 m_codec = newTextCodec(m_encoding);	442 m_codec = newTextCodec(m_encoding);

439	443

440 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);	444 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

441 m_buffer.clear();	445 m_buffer.clear();

442 m_codec.clear();	446 m_codec.clear();

443 m_checkedForBOM = false; // Skip BOM again when re-decoding.	447 m_checkedForBOM = false; // Skip BOM again when re-decoding.

444 return result;	448 return result;

445 }	449 }

446	450

447 }	451 }

OLD	NEW