Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(558)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1470893002: [Fetch] Always use utf-8 for decoding in text() (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 94 matching lines...) Expand 10 before | Expand all | Expand 10 after
105 { 105 {
106 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII 106 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8 instead of US-ASCII
107 // for text/xml. This matches Firefox. 107 // for text/xml. This matches Firefox.
108 if (contentType == XMLContent) 108 if (contentType == XMLContent)
109 return UTF8Encoding(); 109 return UTF8Encoding();
110 if (!specifiedDefaultEncoding.isValid()) 110 if (!specifiedDefaultEncoding.isValid())
111 return Latin1Encoding(); 111 return Latin1Encoding();
112 return specifiedDefaultEncoding; 112 return specifiedDefaultEncoding;
113 } 113 }
114 114
115 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, bool usesEncodingDetector) 115 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::TextEncoding& specifiedDefaultEncoding, EncodingDetectionOption encodingDetectionOption)
116 : m_contentType(determineContentType(mimeType)) 116 : m_contentType(determineContentType(mimeType))
117 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) 117 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding))
118 , m_source(DefaultEncoding) 118 , m_source(DefaultEncoding)
119 , m_hintEncoding(0) 119 , m_hintEncoding(0)
120 , m_checkedForBOM(false) 120 , m_checkedForBOM(false)
121 , m_checkedForCSSCharset(false) 121 , m_checkedForCSSCharset(false)
122 , m_checkedForXMLCharset(false) 122 , m_checkedForXMLCharset(false)
123 , m_checkedForMetaCharset(false) 123 , m_checkedForMetaCharset(false)
124 , m_useLenientXMLDecoding(false) 124 , m_useLenientXMLDecoding(false)
125 , m_sawError(false) 125 , m_sawError(false)
126 , m_usesEncodingDetector(usesEncodingDetector) 126 , m_encodingDetectionOption(encodingDetectionOption)
127 { 127 {
128 ASSERT(!(m_encodingDetectionOption == AlwaysUseUTF8ForText && (m_contentType != PlainTextContent || m_encoding != UTF8Encoding())));
128 } 129 }
129 130
130 TextResourceDecoder::~TextResourceDecoder() 131 TextResourceDecoder::~TextResourceDecoder()
131 { 132 {
132 } 133 }
133 134
134 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source) 135 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, EncodingSource source)
135 { 136 {
136 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings). 137 // In case the encoding didn't exist, we keep the old one (helps some sites specifying invalid encodings).
137 if (!encoding.isValid()) 138 if (!encoding.isValid())
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after
203 size_t buf1Len = bufferLength; 204 size_t buf1Len = bufferLength;
204 size_t buf2Len = len; 205 size_t buf2Len = len;
205 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data()); 206 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.data());
206 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); 207 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data);
207 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 208 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
208 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 209 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
209 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0; 210 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *buf2++) : 0;
210 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; 211 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0;
211 212
212 // Check for the BOM. 213 // Check for the BOM.
213 if (c1 == 0xFF && c2 == 0xFE) { 214 if (m_encodingDetectionOption <= UseContentAndBOMBasedDetection && c1 == 0xFF && c2 == 0xFE) {
yhirano 2015/12/02 08:28:29 [optional] I'm not a fan of numerical enum value c
214 if (c3 || c4) { 215 if (c3 || c4) {
215 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); 216 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding);
216 lengthOfBOM = 2; 217 lengthOfBOM = 2;
217 } else { 218 } else {
218 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); 219 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding);
219 lengthOfBOM = 4; 220 lengthOfBOM = 4;
220 } 221 }
221 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { 222 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) {
222 setEncoding(UTF8Encoding(), AutoDetectedEncoding); 223 setEncoding(UTF8Encoding(), AutoDetectedEncoding);
223 lengthOfBOM = 3; 224 lengthOfBOM = 3;
224 } else if (c1 == 0xFE && c2 == 0xFF) { 225 } else if (m_encodingDetectionOption <= UseContentAndBOMBasedDetection && c1 == 0xFE && c2 == 0xFF) {
225 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); 226 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding);
226 lengthOfBOM = 2; 227 lengthOfBOM = 2;
227 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) { 228 } else if (m_encodingDetectionOption <= UseContentAndBOMBasedDetection && !c1 && !c2 && c3 == 0xFE && c4 == 0xFF) {
228 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); 229 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding);
229 lengthOfBOM = 4; 230 lengthOfBOM = 4;
230 } 231 }
231 232
232 if (lengthOfBOM || bufferLength + len >= 4) 233 if (lengthOfBOM || bufferLength + len >= 4)
233 m_checkedForBOM = true; 234 m_checkedForBOM = true;
234 235
235 return lengthOfBOM; 236 return lengthOfBOM;
236 } 237 }
237 238
(...skipping 115 matching lines...) Expand 10 before | Expand all | Expand 10 after
353 // 2. Encoding detector is turned ON and the encoding is set to 354 // 2. Encoding detector is turned ON and the encoding is set to
354 // the encoding of the parent frame, which is also auto-detected. 355 // the encoding of the parent frame, which is also auto-detected.
355 // Note that condition #2 is NOT satisfied unless parent-child frame 356 // Note that condition #2 is NOT satisfied unless parent-child frame
356 // relationship is compliant to the same-origin policy. If they're from 357 // relationship is compliant to the same-origin policy. If they're from
357 // different domains, |m_source| would not be set to EncodingFromParentFrame 358 // different domains, |m_source| would not be set to EncodingFromParentFrame
358 // in the first place. 359 // in the first place.
359 bool TextResourceDecoder::shouldAutoDetect() const 360 bool TextResourceDecoder::shouldAutoDetect() const
360 { 361 {
361 // Just checking m_hintEncoding suffices here because it's only set 362 // Just checking m_hintEncoding suffices here because it's only set
362 // in setHintEncoding when the source is AutoDetectedEncoding. 363 // in setHintEncoding when the source is AutoDetectedEncoding.
363 return m_usesEncodingDetector 364 return m_encodingDetectionOption == UseAllAutoDetection
364 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)); 365 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
365 } 366 }
366 367
367 String TextResourceDecoder::decode(const char* data, size_t len) 368 String TextResourceDecoder::decode(const char* data, size_t len)
368 { 369 {
369 size_t lengthOfBOM = 0; 370 size_t lengthOfBOM = 0;
370 if (!m_checkedForBOM) 371 if (!m_checkedForBOM)
371 lengthOfBOM = checkForBOM(data, len); 372 lengthOfBOM = checkForBOM(data, len);
372 373
373 bool movedDataToBuffer = false; 374 bool movedDataToBuffer = false;
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
433 m_codec = newTextCodec(m_encoding); 434 m_codec = newTextCodec(m_encoding);
434 435
435 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 436 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
436 m_buffer.clear(); 437 m_buffer.clear();
437 m_codec.clear(); 438 m_codec.clear();
438 m_checkedForBOM = false; // Skip BOM again when re-decoding. 439 m_checkedForBOM = false; // Skip BOM again when re-decoding.
439 return result; 440 return result;
440 } 441 }
441 442
442 } 443 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698