OLD | NEW |
1 /* | 1 /* |
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) | 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) |
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All
rights reserved. | 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All
rights reserved. |
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) | 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) |
5 | 5 |
6 This library is free software; you can redistribute it and/or | 6 This library is free software; you can redistribute it and/or |
7 modify it under the terms of the GNU Library General Public | 7 modify it under the terms of the GNU Library General Public |
8 License as published by the Free Software Foundation; either | 8 License as published by the Free Software Foundation; either |
9 version 2 of the License, or (at your option) any later version. | 9 version 2 of the License, or (at your option) any later version. |
10 | 10 |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
104 { | 104 { |
105 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8
instead of US-ASCII | 105 // Despite 8.5 "Text/xml with Omitted Charset" of RFC 3023, we assume UTF-8
instead of US-ASCII |
106 // for text/xml. This matches Firefox. | 106 // for text/xml. This matches Firefox. |
107 if (contentType == XMLContent) | 107 if (contentType == XMLContent) |
108 return UTF8Encoding(); | 108 return UTF8Encoding(); |
109 if (!specifiedDefaultEncoding.isValid()) | 109 if (!specifiedDefaultEncoding.isValid()) |
110 return Latin1Encoding(); | 110 return Latin1Encoding(); |
111 return specifiedDefaultEncoding; | 111 return specifiedDefaultEncoding; |
112 } | 112 } |
113 | 113 |
114 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::Text
Encoding& specifiedDefaultEncoding, bool usesEncodingDetector) | 114 TextResourceDecoder::TextResourceDecoder(const String& mimeType, const WTF::Text
Encoding& specifiedDefaultEncoding, EncodingDetectionOption encodingDetectionOpt
ion) |
115 : m_contentType(determineContentType(mimeType)) | 115 : m_contentType(determineContentType(mimeType)) |
116 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) | 116 , m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)) |
117 , m_source(DefaultEncoding) | 117 , m_source(DefaultEncoding) |
118 , m_hintEncoding(0) | 118 , m_hintEncoding(0) |
119 , m_checkedForBOM(false) | 119 , m_checkedForBOM(false) |
120 , m_checkedForCSSCharset(false) | 120 , m_checkedForCSSCharset(false) |
121 , m_checkedForXMLCharset(false) | 121 , m_checkedForXMLCharset(false) |
122 , m_checkedForMetaCharset(false) | 122 , m_checkedForMetaCharset(false) |
123 , m_useLenientXMLDecoding(false) | 123 , m_useLenientXMLDecoding(false) |
124 , m_sawError(false) | 124 , m_sawError(false) |
125 , m_usesEncodingDetector(usesEncodingDetector) | 125 , m_encodingDetectionOption(encodingDetectionOption) |
126 { | 126 { |
| 127 if (m_encodingDetectionOption == AlwaysUseUTF8ForText) |
| 128 ASSERT(m_contentType == PlainTextContent && m_encoding == UTF8Encoding()
); |
127 } | 129 } |
128 | 130 |
129 TextResourceDecoder::~TextResourceDecoder() | 131 TextResourceDecoder::~TextResourceDecoder() |
130 { | 132 { |
131 } | 133 } |
132 | 134 |
133 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, Encodin
gSource source) | 135 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, Encodin
gSource source) |
134 { | 136 { |
135 // In case the encoding didn't exist, we keep the old one (helps some sites
specifying invalid encodings). | 137 // In case the encoding didn't exist, we keep the old one (helps some sites
specifying invalid encodings). |
136 if (!encoding.isValid()) | 138 if (!encoding.isValid()) |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
202 size_t buf1Len = bufferLength; | 204 size_t buf1Len = bufferLength; |
203 size_t buf2Len = len; | 205 size_t buf2Len = len; |
204 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.
data()); | 206 const unsigned char* buf1 = reinterpret_cast<const unsigned char*>(m_buffer.
data()); |
205 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); | 207 const unsigned char* buf2 = reinterpret_cast<const unsigned char*>(data); |
206 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b
uf2++) : 0; | 208 unsigned char c1 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b
uf2++) : 0; |
207 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b
uf2++) : 0; | 209 unsigned char c2 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b
uf2++) : 0; |
208 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b
uf2++) : 0; | 210 unsigned char c3 = buf1Len ? (--buf1Len, *buf1++) : buf2Len ? (--buf2Len, *b
uf2++) : 0; |
209 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; | 211 unsigned char c4 = buf2Len ? (--buf2Len, *buf2++) : 0; |
210 | 212 |
211 // Check for the BOM. | 213 // Check for the BOM. |
212 if (c1 == 0xFF && c2 == 0xFE) { | 214 if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { |
213 if (c3 || c4) { | 215 setEncoding(UTF8Encoding(), AutoDetectedEncoding); |
214 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); | 216 lengthOfBOM = 3; |
| 217 } else if (m_encodingDetectionOption != AlwaysUseUTF8ForText) { |
| 218 if (c1 == 0xFF && c2 == 0xFE) { |
| 219 if (c3 || c4) { |
| 220 setEncoding(UTF16LittleEndianEncoding(), AutoDetectedEncoding); |
| 221 lengthOfBOM = 2; |
| 222 } else { |
| 223 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); |
| 224 lengthOfBOM = 4; |
| 225 } |
| 226 } else if (c1 == 0xFE && c2 == 0xFF) { |
| 227 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); |
215 lengthOfBOM = 2; | 228 lengthOfBOM = 2; |
216 } else { | 229 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) { |
217 setEncoding(UTF32LittleEndianEncoding(), AutoDetectedEncoding); | 230 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); |
218 lengthOfBOM = 4; | 231 lengthOfBOM = 4; |
219 } | 232 } |
220 } else if (c1 == 0xEF && c2 == 0xBB && c3 == 0xBF) { | |
221 setEncoding(UTF8Encoding(), AutoDetectedEncoding); | |
222 lengthOfBOM = 3; | |
223 } else if (c1 == 0xFE && c2 == 0xFF) { | |
224 setEncoding(UTF16BigEndianEncoding(), AutoDetectedEncoding); | |
225 lengthOfBOM = 2; | |
226 } else if (!c1 && !c2 && c3 == 0xFE && c4 == 0xFF) { | |
227 setEncoding(UTF32BigEndianEncoding(), AutoDetectedEncoding); | |
228 lengthOfBOM = 4; | |
229 } | 233 } |
230 | 234 |
231 if (lengthOfBOM || bufferLength + len >= 4) | 235 if (lengthOfBOM || bufferLength + len >= 4) |
232 m_checkedForBOM = true; | 236 m_checkedForBOM = true; |
233 | 237 |
234 return lengthOfBOM; | 238 return lengthOfBOM; |
235 } | 239 } |
236 | 240 |
237 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool&
movedDataToBuffer) | 241 bool TextResourceDecoder::checkForCSSCharset(const char* data, size_t len, bool&
movedDataToBuffer) |
238 { | 242 { |
(...skipping 113 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
352 // 2. Encoding detector is turned ON and the encoding is set to | 356 // 2. Encoding detector is turned ON and the encoding is set to |
353 // the encoding of the parent frame, which is also auto-detected. | 357 // the encoding of the parent frame, which is also auto-detected. |
354 // Note that condition #2 is NOT satisfied unless parent-child frame | 358 // Note that condition #2 is NOT satisfied unless parent-child frame |
355 // relationship is compliant to the same-origin policy. If they're from | 359 // relationship is compliant to the same-origin policy. If they're from |
356 // different domains, |m_source| would not be set to EncodingFromParentFrame | 360 // different domains, |m_source| would not be set to EncodingFromParentFrame |
357 // in the first place. | 361 // in the first place. |
358 bool TextResourceDecoder::shouldAutoDetect() const | 362 bool TextResourceDecoder::shouldAutoDetect() const |
359 { | 363 { |
360 // Just checking m_hintEncoding suffices here because it's only set | 364 // Just checking m_hintEncoding suffices here because it's only set |
361 // in setHintEncoding when the source is AutoDetectedEncoding. | 365 // in setHintEncoding when the source is AutoDetectedEncoding. |
362 return m_usesEncodingDetector | 366 return m_encodingDetectionOption == UseAllAutoDetection |
363 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame
&& m_hintEncoding)); | 367 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame
&& m_hintEncoding)); |
364 } | 368 } |
365 | 369 |
366 String TextResourceDecoder::decode(const char* data, size_t len) | 370 String TextResourceDecoder::decode(const char* data, size_t len) |
367 { | 371 { |
368 size_t lengthOfBOM = 0; | 372 size_t lengthOfBOM = 0; |
369 if (!m_checkedForBOM) | 373 if (!m_checkedForBOM) |
370 lengthOfBOM = checkForBOM(data, len); | 374 lengthOfBOM = checkForBOM(data, len); |
371 | 375 |
372 bool movedDataToBuffer = false; | 376 bool movedDataToBuffer = false; |
(...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
438 m_codec = newTextCodec(m_encoding); | 442 m_codec = newTextCodec(m_encoding); |
439 | 443 |
440 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF,
m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); | 444 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF,
m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); |
441 m_buffer.clear(); | 445 m_buffer.clear(); |
442 m_codec.clear(); | 446 m_codec.clear(); |
443 m_checkedForBOM = false; // Skip BOM again when re-decoding. | 447 m_checkedForBOM = false; // Skip BOM again when re-decoding. |
444 return result; | 448 return result; |
445 } | 449 } |
446 | 450 |
447 } | 451 } |
OLD | NEW |