Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(141)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: left out test files that should be landed manually Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 10 matching lines...) Expand all
21 21
22 #include "core/html/parser/TextResourceDecoder.h" 22 #include "core/html/parser/TextResourceDecoder.h"
23 23
24 #include "core/HTMLNames.h" 24 #include "core/HTMLNames.h"
25 #include "core/dom/DOMImplementation.h" 25 #include "core/dom/DOMImplementation.h"
26 #include "core/html/parser/HTMLMetaCharsetParser.h" 26 #include "core/html/parser/HTMLMetaCharsetParser.h"
27 #include "platform/text/TextEncodingDetector.h" 27 #include "platform/text/TextEncodingDetector.h"
28 #include "wtf/StringExtras.h" 28 #include "wtf/StringExtras.h"
29 #include "wtf/text/TextCodec.h" 29 #include "wtf/text/TextCodec.h"
30 #include "wtf/text/TextEncodingRegistry.h" 30 #include "wtf/text/TextEncodingRegistry.h"
31 #include "wtf/text/UTF8.h"
31 32
32 using namespace WTF; 33 using namespace WTF;
33 34
34 namespace blink { 35 namespace blink {
35 36
36 using namespace HTMLNames; 37 using namespace HTMLNames;
37 38
38 const int minimumLengthOfXMLDeclaration = 8; 39 const int minimumLengthOfXMLDeclaration = 8;
39 40
40 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4) 41 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
(...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after
342 343
343 if (!m_charsetParser->checkForMetaCharset(data, length)) 344 if (!m_charsetParser->checkForMetaCharset(data, length))
344 return; 345 return;
345 346
346 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag); 347 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
347 m_charsetParser.clear(); 348 m_charsetParser.clear();
348 m_checkedForMetaCharset = true; 349 m_checkedForMetaCharset = true;
349 return; 350 return;
350 } 351 }
351 352
352 // We use the encoding detector in two cases:
353 // 1. Encoding detector is turned ON and no other encoding source is
354 // available (that is, it's DefaultEncoding).
355 // 2. Encoding detector is turned ON and the encoding is set to
356 // the encoding of the parent frame, which is also auto-detected.
357 // Note that condition #2 is NOT satisfied unless parent-child frame
358 // relationship is compliant to the same-origin policy. If they're from
359 // different domains, |m_source| would not be set to EncodingFromParentFrame
360 // in the first place.
361 bool TextResourceDecoder::shouldAutoDetect() const
362 {
363 // Just checking m_hintEncoding suffices here because it's only set
364 // in setHintEncoding when the source is AutoDetectedEncoding.
365 return m_encodingDetectionOption == UseAllAutoDetection
366 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
367 }
368
369 String TextResourceDecoder::decode(const char* data, size_t len) 353 String TextResourceDecoder::decode(const char* data, size_t len)
370 { 354 {
371 size_t lengthOfBOM = 0; 355 size_t lengthOfBOM = 0;
372 if (!m_checkedForBOM) 356 if (!m_checkedForBOM)
373 lengthOfBOM = checkForBOM(data, len); 357 lengthOfBOM = checkForBOM(data, len);
374 358
375 bool movedDataToBuffer = false; 359 bool movedDataToBuffer = false;
376 360
377 if (m_contentType == CSSContent && !m_checkedForCSSCharset) { 361 if (m_contentType == CSSContent && !m_checkedForCSSCharset) {
378 if (!checkForCSSCharset(data, len, movedDataToBuffer)) 362 if (!checkForCSSCharset(data, len, movedDataToBuffer))
(...skipping 16 matching lines...) Expand all
395 memcpy(m_buffer.data() + oldSize, data, len); 379 memcpy(m_buffer.data() + oldSize, data, len);
396 } 380 }
397 381
398 dataForDecode = m_buffer.data() + lengthOfBOM; 382 dataForDecode = m_buffer.data() + lengthOfBOM;
399 lengthForDecode = m_buffer.size() - lengthOfBOM; 383 lengthForDecode = m_buffer.size() - lengthOfBOM;
400 } 384 }
401 385
402 if (m_contentType == HTMLContent && !m_checkedForMetaCharset) 386 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
403 checkForMetaCharset(dataForDecode, lengthForDecode); 387 checkForMetaCharset(dataForDecode, lengthForDecode);
404 388
405 if (shouldAutoDetect()) { 389 detectTextEncoding(data, len);
406 WTF::TextEncoding detectedEncoding;
407 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
408 setEncoding(detectedEncoding, EncodingFromContentSniffing);
409 }
410 390
411 ASSERT(m_encoding.isValid()); 391 ASSERT(m_encoding.isValid());
412 392
413 if (!m_codec) 393 if (!m_codec)
414 m_codec = newTextCodec(m_encoding); 394 m_codec = newTextCodec(m_encoding);
415 395
416 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 396 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
417 397
418 m_buffer.clear(); 398 m_buffer.clear();
419 return result; 399 return result;
420 } 400 }
421 401
402 // We use the encoding detector in the following cases:
403 // 1. Encoding detector is turned ON and no other encoding source is
404 // available (that is, it's DefaultEncoding).
405 // 2. Encoding detector is turned ON and the encoding is set to
406 // the encoding of the parent frame, which is also auto-detected.
407 // Note that condition #2 is NOT satisfied unless parent-child frame
408 // relationship is compliant to the same-origin policy. If they're from
409 // different domains, |m_source| would not be set to EncodingFromParentFrame
410 // in the first place.
411 void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)
412 {
413 if (!shouldDetectEncoding())
414 return;
415
416 if (WTF::Unicode::isUTF8andNotASCII(data, len)) {
417 setEncoding(UTF8Encoding(), EncodingFromContentSniffing);
418 return;
419 }
420 if (m_encodingDetectionOption == UseAllAutoDetection) {
421 WTF::TextEncoding detectedEncoding;
422 if (detectTextEncodingUniversal(data, len, m_hintEncoding, &detectedEncoding))
423 setEncoding(detectedEncoding, EncodingFromContentSniffing);
424 }
425 }
426
427 bool TextResourceDecoder::shouldDetectEncoding() const
428 {
429 // Just checking m_hintEncoding suffices here because it's only set
430 // in setHintEncoding when the source is AutoDetectedEncoding.
431 return m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding);
432 }
433
422 String TextResourceDecoder::flush() 434 String TextResourceDecoder::flush()
423 { 435 {
424 // If we can not identify the encoding even after a document is completely 436 // If we can not identify the encoding even after a document is completely
425 // loaded, we need to detect the encoding if other conditions for 437 // loaded, we need to detect the encoding if other conditions for
426 // autodetection are satisfied. 438 // autodetection are satisfied.
427 if (m_buffer.size() && shouldAutoDetect() 439 if (m_buffer.size()
428 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) { 440 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
429 WTF::TextEncoding detectedEncoding; 441 detectTextEncoding(m_buffer.data(), m_buffer.size());
430 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
431 setEncoding(detectedEncoding, EncodingFromContentSniffing);
432 } 442 }
433 443
434 if (!m_codec) 444 if (!m_codec)
435 m_codec = newTextCodec(m_encoding); 445 m_codec = newTextCodec(m_encoding);
436 446
437 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 447 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
438 m_buffer.clear(); 448 m_buffer.clear();
439 m_codec.clear(); 449 m_codec.clear();
440 m_checkedForBOM = false; // Skip BOM again when re-decoding. 450 m_checkedForBOM = false; // Skip BOM again when re-decoding.
441 return result; 451 return result;
442 } 452 }
443 453
444 } // namespace blink 454 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698