Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(94)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1470893002: [Fetch] Always use utf-8 for decoding in text() (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Add fixes. Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
105 { 105 {
106 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII 106 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
107 // for text/xml. This matches Firefox. 107 // for text/xml. This matches Firefox.
108 if (contentType == XMLContent) 108 if (contentType == XMLContent)
109 return UTF8Encoding(); 109 return UTF8Encoding();
110 if (!specifiedDefaultEncoding.isValid()) 110 if (!specifiedDefaultEncoding.isValid())
111 return Latin1Encoding(); 111 return Latin1Encoding();
112 return specifiedDefaultEncoding; 112 return specifiedDefaultEncoding;
113 } 113 }
114 114
115 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector) 115 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector, BOMCheckOptions bomCheckOptions)
116 : m_contentType(determineContentType(mimeType)) 116 : m_contentType(determineContentType(mimeType))
117 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) 117 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
118 , m_source(DefaultEncoding) 118 , m_source(DefaultEncoding)
119 , m_hintEncoding(0) 119 , m_hintEncoding(0)
120 , m_checkedForBOM(false) 120 , m_checkedForBOM(false)
121 , m_checkedForCSSCharset(false) 121 , m_checkedForCSSCharset(false)
122 , m_checkedForXMLCharset(false) 122 , m_checkedForXMLCharset(false)
123 , m_checkedForMetaCharset(false) 123 , m_checkedForMetaCharset(false)
124 , m_useLenientXMLDecoding(false) 124 , m_useLenientXMLDecoding(false)
125 , m_sawError(false) 125 , m_sawError(false)
126 , m_usesEncodingDetector(usesEncodingDetector) 126 , m_usesEncodingDetector(usesEncodingDetector)
127 , m_bomCheckOptions(bomCheckOptions)
127 { 128 {
128 } 129 }
129 130
130 TextResourceDecoder::~TextResourceDecoder() 131 TextResourceDecoder::~TextResourceDecoder()
131 { 132 {
132 } 133 }
133 134
134 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source) 135 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)
135 { 136 {
136 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings). 137 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
186 if (end >= len) 187 if (end >= len)
187 return -1; 188 return -1;
188 189
189 encodingLength = end - pos; 190 encodingLength = end - pos;
190 return pos; 191 return pos;
191 } 192 }
192 193
193 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len) 194 size_t TextResourceDecoder::checkForBOM(const char* data, size_t len)
194 { 195 {
195 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding. 196 // Check for UTF-16/32 or UTF-8 BOM mark at the beginning, which is a sure sign of a Unicode encoding.
196 // We let it override even a user-chosen encoding. 197 // We let it override even a user-chosen encoding.
yhirano 2015/11/24 02:06:40 This is the problem, right? Rather than introducin
hiroshige 2015/11/30 09:50:01 How about Patch Set 8: |usesEncodingDetector| ca
197 ASSERT(!m_checkedForBOM); 198 ASSERT(!m_checkedForBOM);
198 199
199 size_t lengthOfBOM = 0; 200 size_t lengthOfBOM = 0;
200 201
201 size_t bufferLength = m_buffer.size(); 202 size_t bufferLength = m_buffer.size();
202 203
203 size_t buf1Len = bufferLength; 204 size_t buf1Len = bufferLength;
204 size_t buf2Len = len; 205 size_t buf2Len = len;
205 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data()); 206 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
206 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); 207 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
207 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 208 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
208 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 209 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
209 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 210 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
210 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; 211 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
211 212
212 // Check for the BOM. 213 // Check for the BOM.
213 if (c1 == 0xFF && c2 == 0xFE) { 214 if (m_bomCheckOptions == CheckForAllBOM && c1 == 0xFF && c2 == 0xFE) {
214 if (c3 || c4) { 215 if (c3 || c4) {
215 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 216 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
216 lengthOfBOM = 2; 217 lengthOfBOM = 2;
217 } else { 218 } else {
218 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 219 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
219 lengthOfBOM = 4; 220 lengthOfBOM = 4;
220 } 221 }
221 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { 222 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
222 setEncoding(UTF8Encoding(), AutoDetectedEncoding); 223 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
223 lengthOfBOM = 3; 224 lengthOfBOM = 3;
224 } else if (c1 == 0xFE && c2 == 0xFF) { 225 } else if (m_bomCheckOptions == CheckForAllBOM && c1 == 0xFE && c2 == 0xFF) {
225 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 226 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
226 lengthOfBOM = 2; 227 lengthOfBOM = 2;
227 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) { 228 } else if (m_bomCheckOptions == CheckForAllBOM && !c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {
228 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 229 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
229 lengthOfBOM = 4; 230 lengthOfBOM = 4;
230 } 231 }
231 232
232 if (lengthOfBOM || bufferLength + len >= 4) 233 if (lengthOfBOM || bufferLength + len >= 4)
233 m_checkedForBOM = true; 234 m_checkedForBOM = true;
234 235
235 return lengthOfBOM; 236 return lengthOfBOM;
236 } 237 }
237 238
(...skipping 195 matching lines...) Expand 10 before | Expand all | Expand 10 after
433 m_codec = newTextCodec(m_encoding); 434 m_codec = newTextCodec(m_encoding);
434 435
435 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 436 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
436 m_buffer.clear(); 437 m_buffer.clear();
437 m_codec.clear(); 438 m_codec.clear();
438 m_checkedForBOM = false; // Skip BOM again when re-decoding. 439 m_checkedForBOM = false; // Skip BOM again when re-decoding.
439 return result; 440 return result;
440 } 441 }
441 442
442 } 443 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698