third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp - Issue 1470893002: [Fetch] Always use utf-8 for decoding in text()

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1470893002: [Fetch] Always use utf-8 for decoding in text() (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Add fixes. Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h ('k') | third_party/WebKit/Source/modules/fetch/FetchDataLoader.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)	2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)

3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.	3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.

4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)	4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)

5	5

6 This library is free software; you can redistribute it and/or	6 This library is free software; you can redistribute it and/or

7 modify it under the terms of the GNU Library General Public	7 modify it under the terms of the GNU Library General Public

8 License as published by the Free Software Foundation; either	8 License as published by the Free Software Foundation; either

9 version 2 of the License, or (at your option) any later version.	9 version 2 of the License, or (at your option) any later version.

10	10

(...skipping 94 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
105 {	105 {

106 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII	106 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII

107 // for text/xml. This matches Firefox.	107 // for text/xml. This matches Firefox.

108 if (contentType == XMLContent)	108 if (contentType == XMLContent)

109 return UTF8Encoding();	109 return UTF8Encoding();

110 if (!specifiedDefaultEncoding.isValid())	110 if (!specifiedDefaultEncoding.isValid())

111 return Latin1Encoding();	111 return Latin1Encoding();

112 return specifiedDefaultEncoding;	112 return specifiedDefaultEncoding;

113 }	113 }

114	114

115 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector)	115 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector, BOMCheckOptions bomCheckOptions)

116 : m_contentType(determineContentType(mimeType))	116 : m_contentType(determineContentType(mimeType))

117 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))	117 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))

118 , m_source(DefaultEncoding)	118 , m_source(DefaultEncoding)

119 , m_hintEncoding(0)	119 , m_hintEncoding(0)

120 , m_checkedForBOM(false)	120 , m_checkedForBOM(false)

121 , m_checkedForCSSCharset(false)	121 , m_checkedForCSSCharset(false)

122 , m_checkedForXMLCharset(false)	122 , m_checkedForXMLCharset(false)

123 , m_checkedForMetaCharset(false)	123 , m_checkedForMetaCharset(false)

124 , m_useLenientXMLDecoding(false)	124 , m_useLenientXMLDecoding(false)

125 , m_sawError(false)	125 , m_sawError(false)

126 , m_usesEncodingDetector(usesEncodingDetector)	126 , m_usesEncodingDetector(usesEncodingDetector)

	127 , m_bomCheckOptions(bomCheckOptions)

127 {	128 {

128 }	129 }

129	130

130 TextResourceDecoder::~TextResourceDecoder()	131 TextResourceDecoder::~TextResourceDecoder()

131 {	132 {

132 }	133 }

133	134

134 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)	135 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)

135 {	136 {

136 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).	137 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).

(...skipping 49 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
186 if (end >= len)	187 if (end >= len)

187 return -1;	188 return -1;

188	189

189 encodingLength = end - pos;	190 encodingLength = end - pos;

190 return pos;	191 return pos;

191 }	192 }

192	193

193 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)	194 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)

194 {	195 {

195 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.	196 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.

196 // We let it override even a user-chosen encoding.	197 // We let it override even a user-chosen encoding.
	yhirano 2015/11/24 02:06:40
	This is the problem, right? Rather than introducing a new parameter, I think it is good to expand the \|usesEncodingDetector\| constructor parameter so that a user can choose one of:
	- UseAutoDetection (<= usesEncodingDetector == true)
	- UseBOMBasedDetectionOnly (<= usesEncodingDetector == false, default value)
	- DoNotUseAutoDetection (<= Response.text())
	What do you think?
	hiroshige 2015/11/30 09:50:01
	How about Patch Set 8: \|usesEncodingDetector\| can be:
	- UseAllAutoDetection (<= usesEncodingDetector == true)
	- UseContentAndBOMBasedDetection (<= usesEncodingDetector == false, default value)
	- AlwaysUseUTF8ForText (<= Response.text())
	and when it is AlwaysUseUTF8ForText, \|defaultEncoding\| should be utf-8, and \|mimeType\| should be "text/plain". This is not orthogonal, but implementing DoNotUseAutoDetection for all encodings and non-text contents complicates the code:
	- Even when we don't detect encoding, we should handle utf-8 BOMs in AlwaysUseUTF8ForText, i.e. we should skip utf-8 BOMs (but not other BOMs such as utf-16be BOMs) when we decode the content as utf-8. Supporting this BOM skipping for utf-16/32 makes code more complicated.
	- In non-text contents (i.e. CSS/XML/HTML), TextResourceDecoder handles encodings explicitly specified in those files (e.g. <meta> tag) and its detection logic is built into other logics in the decoder. Disabling such logic makes code and state management of TextResourceDecoder more complicated.
	- And anyway we don't have such use cases.
197 ASSERT(!m_checkedForBOM);	198 ASSERT(!m_checkedForBOM);

198	199

199 size_t lengthOfBOM = 0;	200 size_t lengthOfBOM = 0;

200	201

201 size_t bufferLength = m_buffer.size();	202 size_t bufferLength = m_buffer.size();

202	203

203 size_t buf1Len = bufferLength;	204 size_t buf1Len = bufferLength;

204 size_t buf2Len = len;	205 size_t buf2Len = len;

205 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());	206 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());

206 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);	207 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);

207 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	208 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

208 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	209 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

209 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;	210 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;

210 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;	211 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;

211	212

212 // Check for the BOM.	213 // Check for the BOM.

213 if (c1 == 0xFF && c2 == 0xFE) {	214 if (m_bomCheckOptions == CheckForAllBOM && c1 == 0xFF && c2 == 0xFE) {

214 if (c3 \|\| c4) {	215 if (c3 \|\| c4) {

215 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);	216 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);

216 lengthOfBOM = 2;	217 lengthOfBOM = 2;

217 } else {	218 } else {

218 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);	219 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);

219 lengthOfBOM = 4;	220 lengthOfBOM = 4;

220 }	221 }

221 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {	222 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {

222 setEncoding(UTF8Encoding(), AutoDetectedEncoding);	223 setEncoding(UTF8Encoding(), AutoDetectedEncoding);

223 lengthOfBOM = 3;	224 lengthOfBOM = 3;

224 } else if (c1 == 0xFE && c2 == 0xFF) {	225 } else if (m_bomCheckOptions == CheckForAllBOM && c1 == 0xFE && c2 == 0xFF) {

225 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);	226 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);

226 lengthOfBOM = 2;	227 lengthOfBOM = 2;

227 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {	228 } else if (m_bomCheckOptions == CheckForAllBOM && !c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {

228 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);	229 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);

229 lengthOfBOM = 4;	230 lengthOfBOM = 4;

230 }	231 }

231	232

232 if (lengthOfBOM \|\| bufferLength + len >= 4)	233 if (lengthOfBOM \|\| bufferLength + len >= 4)

233 m_checkedForBOM = true;	234 m_checkedForBOM = true;

234	235

235 return lengthOfBOM;	236 return lengthOfBOM;

236 }	237 }

237	238

(...skipping 195 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
433 m_codec = newTextCodec(m_encoding);	434 m_codec = newTextCodec(m_encoding);

434	435

435 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);	436 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

436 m_buffer.clear();	437 m_buffer.clear();

437 m_codec.clear();	438 m_codec.clear();

438 m_checkedForBOM = false; // Skip BOM again when re-decoding.	439 m_checkedForBOM = false; // Skip BOM again when re-decoding.

439 return result;	440 return result;

440 }	441 }

441	442

442 }	443 }

OLD	NEW