Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(21)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1888083002: Revert of UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 10 matching lines...) Expand all
21 21
22 #include "core/html/parser/TextResourceDecoder.h" 22 #include "core/html/parser/TextResourceDecoder.h"
23 23
24 #include "core/HTMLNames.h" 24 #include "core/HTMLNames.h"
25 #include "core/dom/DOMImplementation.h" 25 #include "core/dom/DOMImplementation.h"
26 #include "core/html/parser/HTMLMetaCharsetParser.h" 26 #include "core/html/parser/HTMLMetaCharsetParser.h"
27 #include "platform/text/TextEncodingDetector.h" 27 #include "platform/text/TextEncodingDetector.h"
28 #include "wtf/StringExtras.h" 28 #include "wtf/StringExtras.h"
29 #include "wtf/text/TextCodec.h" 29 #include "wtf/text/TextCodec.h"
30 #include "wtf/text/TextEncodingRegistry.h" 30 #include "wtf/text/TextEncodingRegistry.h"
31 #include "wtf/text/UTF8.h"
32 31
33 using namespace WTF; 32 using namespace WTF;
34 33
35 namespace blink { 34 namespace blink {
36 35
37 using namespace HTMLNames; 36 using namespace HTMLNames;
38 37
39 const int minimumLengthOfXMLDeclaration = 8; 38 const int minimumLengthOfXMLDeclaration = 8;
40 39
41 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4) 40 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
(...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after
343 342
344 if (!m_charsetParser->checkForMetaCharset(data, length)) 343 if (!m_charsetParser->checkForMetaCharset(data, length))
345 return; 344 return;
346 345
347 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag); 346 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
348 m_charsetParser.clear(); 347 m_charsetParser.clear();
349 m_checkedForMetaCharset = true; 348 m_checkedForMetaCharset = true;
350 return; 349 return;
351 } 350 }
352 351
353 String TextResourceDecoder::decode(const char* data, size_t len) 352 // We use the encoding detector in two cases:
354 {
355 size_t lengthOfBOM = 0;
356 if (!m_checkedForBOM)
357 lengthOfBOM = checkForBOM(data, len);
358
359 bool movedDataToBuffer = false;
360
361 if (m_contentType == CSSContent && !m_checkedForCSSCharset) {
362 if (!checkForCSSCharset(data, len, movedDataToBuffer))
363 return emptyString();
364 }
365
366 // We check XML declaration in HTML content only if there is enough data available
367 if (((m_contentType == HTMLContent && len >= minimumLengthOfXMLDeclaration) || m_contentType == XMLContent) && !m_checkedForXMLCharset) {
368 if (!checkForXMLCharset(data, len, movedDataToBuffer))
369 return emptyString();
370 }
371
372 const char* dataForDecode = data + lengthOfBOM;
373 size_t lengthForDecode = len - lengthOfBOM;
374
375 if (!m_buffer.isEmpty()) {
376 if (!movedDataToBuffer) {
377 size_t oldSize = m_buffer.size();
378 m_buffer.grow(oldSize + len);
379 memcpy(m_buffer.data() + oldSize, data, len);
380 }
381
382 dataForDecode = m_buffer.data() + lengthOfBOM;
383 lengthForDecode = m_buffer.size() - lengthOfBOM;
384 }
385
386 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
387 checkForMetaCharset(dataForDecode, lengthForDecode);
388
389 detectTextEncoding(data, len);
390
391 ASSERT(m_encoding.isValid());
392
393 if (!m_codec)
394 m_codec = newTextCodec(m_encoding);
395
396 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
397
398 m_buffer.clear();
399 return result;
400 }
401
402 // We use the encoding detector in following cases:
403 // 1. Encoding detector is turned ON and no other encoding source is 353 // 1. Encoding detector is turned ON and no other encoding source is
404 // available (that is, it's DefaultEncoding). 354 // available (that is, it's DefaultEncoding).
405 // 2. Encoding detector is turned ON and the encoding is set to 355 // 2. Encoding detector is turned ON and the encoding is set to
406 // the encoding of the parent frame, which is also auto-detected. 356 // the encoding of the parent frame, which is also auto-detected.
407 // Note that condition #2 is NOT satisfied unless parent-child frame 357 // Note that condition #2 is NOT satisfied unless parent-child frame
408 // relationship is compliant to the same-origin policy. If they're from 358 // relationship is compliant to the same-origin policy. If they're from
409 // different domains, |m_source| would not be set to EncodingFromParentFrame 359 // different domains, |m_source| would not be set to EncodingFromParentFrame
410 // in the first place. 360 // in the first place.
411 void TextResourceDecoder::detectTextEncoding(const char* data, size_t len) 361 bool TextResourceDecoder::shouldAutoDetect() const
412 {
413 if (!shouldDetectEncoding())
414 return;
415
416 if (WTF::Unicode::isUTF8andNotASCII(data, len)) {
417 setEncoding(UTF8Encoding(), EncodingFromContentSniffing);
418 return;
419 }
420 if (m_encodingDetectionOption == UseAllAutoDetection) {
421 WTF::TextEncoding detectedEncoding;
422 if (detectTextEncodingUniversal(data, len, m_hintEncoding, &detectedEncoding))
423 setEncoding(detectedEncoding, EncodingFromContentSniffing);
424 }
425 }
426
427 bool TextResourceDecoder::shouldDetectEncoding() const
428 { 362 {
429 // Just checking m_hintEncoding suffices here because it's only set 363 // Just checking m_hintEncoding suffices here because it's only set
430 // in setHintEncoding when the source is AutoDetectedEncoding. 364 // in setHintEncoding when the source is AutoDetectedEncoding.
431 return m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding); 365 return m_encodingDetectionOption == UseAllAutoDetection
366 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
367 }
368
369 String TextResourceDecoder::decode(const char* data, size_t len)
370 {
371 size_t lengthOfBOM = 0;
372 if (!m_checkedForBOM)
373 lengthOfBOM = checkForBOM(data, len);
374
375 bool movedDataToBuffer = false;
376
377 if (m_contentType == CSSContent && !m_checkedForCSSCharset) {
378 if (!checkForCSSCharset(data, len, movedDataToBuffer))
379 return emptyString();
380 }
381
382 // We check XML declaration in HTML content only if there is enough data available
383 if (((m_contentType == HTMLContent && len >= minimumLengthOfXMLDeclaration) || m_contentType == XMLContent) && !m_checkedForXMLCharset) {
384 if (!checkForXMLCharset(data, len, movedDataToBuffer))
385 return emptyString();
386 }
387
388 const char* dataForDecode = data + lengthOfBOM;
389 size_t lengthForDecode = len - lengthOfBOM;
390
391 if (!m_buffer.isEmpty()) {
392 if (!movedDataToBuffer) {
393 size_t oldSize = m_buffer.size();
394 m_buffer.grow(oldSize + len);
395 memcpy(m_buffer.data() + oldSize, data, len);
396 }
397
398 dataForDecode = m_buffer.data() + lengthOfBOM;
399 lengthForDecode = m_buffer.size() - lengthOfBOM;
400 }
401
402 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
403 checkForMetaCharset(dataForDecode, lengthForDecode);
404
405 if (shouldAutoDetect()) {
406 WTF::TextEncoding detectedEncoding;
407 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
408 setEncoding(detectedEncoding, EncodingFromContentSniffing);
409 }
410
411 ASSERT(m_encoding.isValid());
412
413 if (!m_codec)
414 m_codec = newTextCodec(m_encoding);
415
416 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
417
418 m_buffer.clear();
419 return result;
432 } 420 }
433 421
434 String TextResourceDecoder::flush() 422 String TextResourceDecoder::flush()
435 { 423 {
436 // If we can not identify the encoding even after a document is completely 424 // If we can not identify the encoding even after a document is completely
437 // loaded, we need to detect the encoding if other conditions for 425 // loaded, we need to detect the encoding if other conditions for
438 // autodetection is satisfied. 426 // autodetection is satisfied.
439 if (m_buffer.size() 427 if (m_buffer.size() && shouldAutoDetect()
440 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) { 428 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
441 detectTextEncoding(m_buffer.data(), m_buffer.size()); 429 WTF::TextEncoding detectedEncoding;
430 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
431 setEncoding(detectedEncoding, EncodingFromContentSniffing);
442 } 432 }
443 433
444 if (!m_codec) 434 if (!m_codec)
445 m_codec = newTextCodec(m_encoding); 435 m_codec = newTextCodec(m_encoding);
446 436
447 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 437 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
448 m_buffer.clear(); 438 m_buffer.clear();
449 m_codec.clear(); 439 m_codec.clear();
450 m_checkedForBOM = false; // Skip BOM again when re-decoding. 440 m_checkedForBOM = false; // Skip BOM again when re-decoding.
451 return result; 441 return result;
452 } 442 }
453 443
454 } // namespace blink 444 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698