Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(70)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1470893002: [Fetch] Always use utf-8 for decoding in text() (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Reflect comments. Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
104 { 104 {
105 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII 105 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
106 // for text/xml. This matches Firefox. 106 // for text/xml. This matches Firefox.
107 if (contentType == XMLContent) 107 if (contentType == XMLContent)
108 return UTF8Encoding(); 108 return UTF8Encoding();
109 if (!specifiedDefaultEncoding.isValid()) 109 if (!specifiedDefaultEncoding.isValid())
110 return Latin1Encoding(); 110 return Latin1Encoding();
111 return specifiedDefaultEncoding; 111 return specifiedDefaultEncoding;
112 } 112 }
113 113
114 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector) 114 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, EncodingDetectionOption encodingDetectionOption)
115 : m_contentType(determineContentType(mimeType)) 115 : m_contentType(determineContentType(mimeType))
116 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) 116 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
117 , m_source(DefaultEncoding) 117 , m_source(DefaultEncoding)
118 , m_hintEncoding(0) 118 , m_hintEncoding(0)
119 , m_checkedForBOM(false) 119 , m_checkedForBOM(false)
120 , m_checkedForCSSCharset(false) 120 , m_checkedForCSSCharset(false)
121 , m_checkedForXMLCharset(false) 121 , m_checkedForXMLCharset(false)
122 , m_checkedForMetaCharset(false) 122 , m_checkedForMetaCharset(false)
123 , m_useLenientXMLDecoding(false) 123 , m_useLenientXMLDecoding(false)
124 , m_sawError(false) 124 , m_sawError(false)
125 , m_usesEncodingDetector(usesEncodingDetector) 125 , m_encodingDetectionOption(encodingDetectionOption)
126 { 126 {
127 if (m_encodingDetectionOption == AlwaysUseUTF8ForText)
128 ASSERT(m_contentType == PlainTextContent && m_encoding == UTF8Encoding());
127 } 129 }
128 130
129 TextResourceDecoder::~TextResourceDecoder() 131 TextResourceDecoder::~TextResourceDecoder()
130 { 132 {
131 } 133 }
132 134
133 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source) 135 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)
134 { 136 {
135 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings). 137 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
136 if (!encoding.isValid()) 138 if (!encoding.isValid())
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
202 size_t buf1Len = bufferLength; 204 size_t buf1Len = bufferLength;
203 size_t buf2Len = len; 205 size_t buf2Len = len;
204 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data()); 206 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
205 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); 207 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
206 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 208 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
207 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 209 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
208 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 210 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
209 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; 211 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
210 212
211 // Check for the BOM. 213 // Check for the BOM.
212 if (c1 == 0xFF && c2 == 0xFE) { 214 if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
213 if (c3 || c4) { 215 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
214 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 216 lengthOfBOM = 3;
217 } else if (m_encodingDetectionOption != AlwaysUseUTF8ForText) {
218 if (c1 == 0xFF && c2 == 0xFE) {
219 if (c3 || c4) {
220 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
221 lengthOfBOM = 2;
222 } else {
223 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
224 lengthOfBOM = 4;
225 }
226 } else if (c1 == 0xFE && c2 == 0xFF) {
227 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
215 lengthOfBOM = 2; 228 lengthOfBOM = 2;
216 } else { 229 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {
217 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 230 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
218 lengthOfBOM = 4; 231 lengthOfBOM = 4;
219 } 232 }
220 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
221 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
222 lengthOfBOM = 3;
223 } else if (c1 == 0xFE && c2 == 0xFF) {
224 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
225 lengthOfBOM = 2;
226 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {
227 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
228 lengthOfBOM = 4;
229 } 233 }
230 234
231 if (lengthOfBOM || bufferLength + len >= 4) 235 if (lengthOfBOM || bufferLength + len >= 4)
232 m_checkedForBOM = true; 236 m_checkedForBOM = true;
233 237
234 return lengthOfBOM; 238 return lengthOfBOM;
235 } 239 }
236 240
237 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer) 241 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool& movedDataToBuffer)
238 { 242 {
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after
352 // 2. Encoding detector is turned ON and the encoding is set to 356 // 2. Encoding detector is turned ON and the encoding is set to
353 // the encoding of the parent frame, which is also auto-detected. 357 // the encoding of the parent frame, which is also auto-detected.
354 // Note that condition #2 is NOT satisfied unless parent-child frame 358 // Note that condition #2 is NOT satisfied unless parent-child frame
355 // relationship is compliant to the same-origin policy. If they're from 359 // relationship is compliant to the same-origin policy. If they're from
356 // different domains, |m_source| would not be set to EncodingFromParentFrame 360 // different domains, |m_source| would not be set to EncodingFromParentFrame
357 // in the first place. 361 // in the first place.
358 bool TextResourceDecoder::shouldAutoDetect() const 362 bool TextResourceDecoder::shouldAutoDetect() const
359 { 363 {
360 // Just checking m_hintEncoding suffices here because it's only set 364 // Just checking m_hintEncoding suffices here because it's only set
361 // in setHintEncoding when the source is AutoDetectedEncoding. 365 // in setHintEncoding when the source is AutoDetectedEncoding.
362 return m_usesEncodingDetector 366 return m_encodingDetectionOption == UseAllAutoDetection
363 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)); 367 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
364 } 368 }
365 369
366 String TextResourceDecoder::decode(const char* data, size_t len) 370 String TextResourceDecoder::decode(const char* data, size_t len)
367 { 371 {
368 size_t lengthOfBOM = 0; 372 size_t lengthOfBOM = 0;
369 if (!m_checkedForBOM) 373 if (!m_checkedForBOM)
370 lengthOfBOM = checkForBOM(data, len); 374 lengthOfBOM = checkForBOM(data, len);
371 375
372 bool movedDataToBuffer = false; 376 bool movedDataToBuffer = false;
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
438 m_codec = newTextCodec(m_encoding); 442 m_codec = newTextCodec(m_encoding);
439 443
440 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 444 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
441 m_buffer.clear(); 445 m_buffer.clear();
442 m_codec.clear(); 446 m_codec.clear();
443 m_checkedForBOM = false; // Skip BOM again when re-decoding. 447 m_checkedForBOM = false; // Skip BOM again when re-decoding.
444 return result; 448 return result;
445 } 449 }
446 450
447 } 451 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698