| OLD | NEW |
| 1 /* | 1 /* |
| 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) | 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) |
| 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All
rights reserved. | 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All
rights reserved. |
| 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) | 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) |
| 5 | 5 |
| 6 This library is free software; you can redistribute it and/or | 6 This library is free software; you can redistribute it and/or |
| 7 modify it under the terms of the GNU Library General Public | 7 modify it under the terms of the GNU Library General Public |
| 8 License as published by the Free Software Foundation; either | 8 License as published by the Free Software Foundation; either |
| 9 version 2 of the License, or (at your option) any later version. | 9 version 2 of the License, or (at your option) any later version. |
| 10 | 10 |
| (...skipping 10 matching lines...) Expand all Loading... |
| 21 | 21 |
| 22 #include "core/html/parser/TextResourceDecoder.h" | 22 #include "core/html/parser/TextResourceDecoder.h" |
| 23 | 23 |
| 24 #include "core/HTMLNames.h" | 24 #include "core/HTMLNames.h" |
| 25 #include "core/dom/DOMImplementation.h" | 25 #include "core/dom/DOMImplementation.h" |
| 26 #include "core/html/parser/HTMLMetaCharsetParser.h" | 26 #include "core/html/parser/HTMLMetaCharsetParser.h" |
| 27 #include "platform/text/TextEncodingDetector.h" | 27 #include "platform/text/TextEncodingDetector.h" |
| 28 #include "wtf/StringExtras.h" | 28 #include "wtf/StringExtras.h" |
| 29 #include "wtf/text/TextCodec.h" | 29 #include "wtf/text/TextCodec.h" |
| 30 #include "wtf/text/TextEncodingRegistry.h" | 30 #include "wtf/text/TextEncodingRegistry.h" |
| 31 #include "wtf/text/UTF8.h" | |
| 32 | 31 |
| 33 using namespace WTF; | 32 using namespace WTF; |
| 34 | 33 |
| 35 namespace blink { | 34 namespace blink { |
| 36 | 35 |
| 37 using namespace HTMLNames; | 36 using namespace HTMLNames; |
| 38 | 37 |
| 39 const int minimumLengthOfXMLDeclaration = 8; | 38 const int minimumLengthOfXMLDeclaration = 8; |
| 40 | 39 |
| 41 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3,
char b4) | 40 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3,
char b4) |
| (...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 343 | 342 |
| 344 if (!m_charsetParser->checkForMetaCharset(data, length)) | 343 if (!m_charsetParser->checkForMetaCharset(data, length)) |
| 345 return; | 344 return; |
| 346 | 345 |
| 347 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag); | 346 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag); |
| 348 m_charsetParser.clear(); | 347 m_charsetParser.clear(); |
| 349 m_checkedForMetaCharset = true; | 348 m_checkedForMetaCharset = true; |
| 350 return; | 349 return; |
| 351 } | 350 } |
| 352 | 351 |
| 353 String TextResourceDecoder::decode(const char* data, size_t len) | 352 // We use the encoding detector in two cases: |
| 354 { | |
| 355 size_t lengthOfBOM = 0; | |
| 356 if (!m_checkedForBOM) | |
| 357 lengthOfBOM = checkForBOM(data, len); | |
| 358 | |
| 359 bool movedDataToBuffer = false; | |
| 360 | |
| 361 if (m_contentType == CSSContent && !m_checkedForCSSCharset) { | |
| 362 if (!checkForCSSCharset(data, len, movedDataToBuffer)) | |
| 363 return emptyString(); | |
| 364 } | |
| 365 | |
| 366 // We check XML declaration in HTML content only if there is enough data ava
ilable | |
| 367 if (((m_contentType == HTMLContent && len >= minimumLengthOfXMLDeclaration)
|| m_contentType == XMLContent) && !m_checkedForXMLCharset) { | |
| 368 if (!checkForXMLCharset(data, len, movedDataToBuffer)) | |
| 369 return emptyString(); | |
| 370 } | |
| 371 | |
| 372 const char* dataForDecode = data + lengthOfBOM; | |
| 373 size_t lengthForDecode = len - lengthOfBOM; | |
| 374 | |
| 375 if (!m_buffer.isEmpty()) { | |
| 376 if (!movedDataToBuffer) { | |
| 377 size_t oldSize = m_buffer.size(); | |
| 378 m_buffer.grow(oldSize + len); | |
| 379 memcpy(m_buffer.data() + oldSize, data, len); | |
| 380 } | |
| 381 | |
| 382 dataForDecode = m_buffer.data() + lengthOfBOM; | |
| 383 lengthForDecode = m_buffer.size() - lengthOfBOM; | |
| 384 } | |
| 385 | |
| 386 if (m_contentType == HTMLContent && !m_checkedForMetaCharset) | |
| 387 checkForMetaCharset(dataForDecode, lengthForDecode); | |
| 388 | |
| 389 detectTextEncoding(data, len); | |
| 390 | |
| 391 ASSERT(m_encoding.isValid()); | |
| 392 | |
| 393 if (!m_codec) | |
| 394 m_codec = newTextCodec(m_encoding); | |
| 395 | |
| 396 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush,
m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); | |
| 397 | |
| 398 m_buffer.clear(); | |
| 399 return result; | |
| 400 } | |
| 401 | |
| 402 // We use the encoding detector in following cases: | |
| 403 // 1. Encoding detector is turned ON and no other encoding source is | 353 // 1. Encoding detector is turned ON and no other encoding source is |
| 404 // available (that is, it's DefaultEncoding). | 354 // available (that is, it's DefaultEncoding). |
| 405 // 2. Encoding detector is turned ON and the encoding is set to | 355 // 2. Encoding detector is turned ON and the encoding is set to |
| 406 // the encoding of the parent frame, which is also auto-detected. | 356 // the encoding of the parent frame, which is also auto-detected. |
| 407 // Note that condition #2 is NOT satisfied unless the parent-child frame | 357 // Note that condition #2 is NOT satisfied unless the parent-child frame |
| 408 // relationship complies with the same-origin policy. If they're from | 358 // relationship complies with the same-origin policy. If they're from |
| 409 // different domains, |m_source| would not be set to EncodingFromParentFrame | 359 // different domains, |m_source| would not be set to EncodingFromParentFrame |
| 410 // in the first place. | 360 // in the first place. |
| 411 void TextResourceDecoder::detectTextEncoding(const char* data, size_t len) | 361 bool TextResourceDecoder::shouldAutoDetect() const |
| 412 { | |
| 413 if (!shouldDetectEncoding()) | |
| 414 return; | |
| 415 | |
| 416 if (WTF::Unicode::isUTF8andNotASCII(data, len)) { | |
| 417 setEncoding(UTF8Encoding(), EncodingFromContentSniffing); | |
| 418 return; | |
| 419 } | |
| 420 if (m_encodingDetectionOption == UseAllAutoDetection) { | |
| 421 WTF::TextEncoding detectedEncoding; | |
| 422 if (detectTextEncodingUniversal(data, len, m_hintEncoding, &detectedEnco
ding)) | |
| 423 setEncoding(detectedEncoding, EncodingFromContentSniffing); | |
| 424 } | |
| 425 } | |
| 426 | |
| 427 bool TextResourceDecoder::shouldDetectEncoding() const | |
| 428 { | 362 { |
| 429 // Just checking m_hintEncoding suffices here because it's only set | 363 // Just checking m_hintEncoding suffices here because it's only set |
| 430 // in setHintEncoding when the source is AutoDetectedEncoding. | 364 // in setHintEncoding when the source is AutoDetectedEncoding. |
| 431 return m_source == DefaultEncoding || (m_source == EncodingFromParentFrame &
& m_hintEncoding); | 365 return m_encodingDetectionOption == UseAllAutoDetection |
| 366 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame
&& m_hintEncoding)); |
| 367 } |
| 368 |
| 369 String TextResourceDecoder::decode(const char* data, size_t len) |
| 370 { |
| 371 size_t lengthOfBOM = 0; |
| 372 if (!m_checkedForBOM) |
| 373 lengthOfBOM = checkForBOM(data, len); |
| 374 |
| 375 bool movedDataToBuffer = false; |
| 376 |
| 377 if (m_contentType == CSSContent && !m_checkedForCSSCharset) { |
| 378 if (!checkForCSSCharset(data, len, movedDataToBuffer)) |
| 379 return emptyString(); |
| 380 } |
| 381 |
| 382 // We check XML declaration in HTML content only if there is enough data ava
ilable |
| 383 if (((m_contentType == HTMLContent && len >= minimumLengthOfXMLDeclaration)
|| m_contentType == XMLContent) && !m_checkedForXMLCharset) { |
| 384 if (!checkForXMLCharset(data, len, movedDataToBuffer)) |
| 385 return emptyString(); |
| 386 } |
| 387 |
| 388 const char* dataForDecode = data + lengthOfBOM; |
| 389 size_t lengthForDecode = len - lengthOfBOM; |
| 390 |
| 391 if (!m_buffer.isEmpty()) { |
| 392 if (!movedDataToBuffer) { |
| 393 size_t oldSize = m_buffer.size(); |
| 394 m_buffer.grow(oldSize + len); |
| 395 memcpy(m_buffer.data() + oldSize, data, len); |
| 396 } |
| 397 |
| 398 dataForDecode = m_buffer.data() + lengthOfBOM; |
| 399 lengthForDecode = m_buffer.size() - lengthOfBOM; |
| 400 } |
| 401 |
| 402 if (m_contentType == HTMLContent && !m_checkedForMetaCharset) |
| 403 checkForMetaCharset(dataForDecode, lengthForDecode); |
| 404 |
| 405 if (shouldAutoDetect()) { |
| 406 WTF::TextEncoding detectedEncoding; |
| 407 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding)) |
| 408 setEncoding(detectedEncoding, EncodingFromContentSniffing); |
| 409 } |
| 410 |
| 411 ASSERT(m_encoding.isValid()); |
| 412 |
| 413 if (!m_codec) |
| 414 m_codec = newTextCodec(m_encoding); |
| 415 |
| 416 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush,
m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); |
| 417 |
| 418 m_buffer.clear(); |
| 419 return result; |
| 432 } | 420 } |
| 433 | 421 |
| 434 String TextResourceDecoder::flush() | 422 String TextResourceDecoder::flush() |
| 435 { | 423 { |
| 436 // If we cannot identify the encoding even after a document is completely | 424 // If we cannot identify the encoding even after a document is completely |
| 437 // loaded, we need to detect the encoding if other conditions for | 425 // loaded, we need to detect the encoding if other conditions for |
| 438 // autodetection are satisfied. | 426 // autodetection are satisfied. |
| 439 if (m_buffer.size() | 427 if (m_buffer.size() && shouldAutoDetect() |
| 440 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_conte
ntType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSConte
nt)))) { | 428 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_conte
ntType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSConte
nt)))) { |
| 441 detectTextEncoding(m_buffer.data(), m_buffer.size()); | 429 WTF::TextEncoding detectedEncoding; |
| 430 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding,
&detectedEncoding)) |
| 431 setEncoding(detectedEncoding, EncodingFromContentSniffing); |
| 442 } | 432 } |
| 443 | 433 |
| 444 if (!m_codec) | 434 if (!m_codec) |
| 445 m_codec = newTextCodec(m_encoding); | 435 m_codec = newTextCodec(m_encoding); |
| 446 | 436 |
| 447 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF,
m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); | 437 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF,
m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); |
| 448 m_buffer.clear(); | 438 m_buffer.clear(); |
| 449 m_codec.clear(); | 439 m_codec.clear(); |
| 450 m_checkedForBOM = false; // Skip BOM again when re-decoding. | 440 m_checkedForBOM = false; // Skip BOM again when re-decoding. |
| 451 return result; | 441 return result; |
| 452 } | 442 } |
| 453 | 443 |
| 454 } // namespace blink | 444 } // namespace blink |
| OLD | NEW |