Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(7)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1979103003: Revert "Reland "UTF-8 detector for pages missing encoding info"" (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: rebased Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 10 matching lines...) Expand all
21 21
22 #include "core/html/parser/TextResourceDecoder.h" 22 #include "core/html/parser/TextResourceDecoder.h"
23 23
24 #include "core/HTMLNames.h" 24 #include "core/HTMLNames.h"
25 #include "core/dom/DOMImplementation.h" 25 #include "core/dom/DOMImplementation.h"
26 #include "core/html/parser/HTMLMetaCharsetParser.h" 26 #include "core/html/parser/HTMLMetaCharsetParser.h"
27 #include "platform/text/TextEncodingDetector.h" 27 #include "platform/text/TextEncodingDetector.h"
28 #include "wtf/StringExtras.h" 28 #include "wtf/StringExtras.h"
29 #include "wtf/text/TextCodec.h" 29 #include "wtf/text/TextCodec.h"
30 #include "wtf/text/TextEncodingRegistry.h" 30 #include "wtf/text/TextEncodingRegistry.h"
31 #include "wtf/text/UTF8.h"
32 31
33 using namespace WTF; 32 using namespace WTF;
34 33
35 namespace blink { 34 namespace blink {
36 35
37 using namespace HTMLNames; 36 using namespace HTMLNames;
38 37
39 const int minimumLengthOfXMLDeclaration = 8; 38 const int minimumLengthOfXMLDeclaration = 8;
40 39
41 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4) 40 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
(...skipping 301 matching lines...) Expand 10 before | Expand all | Expand 10 after
343 342
344 if (!m_charsetParser->checkForMetaCharset(data, length)) 343 if (!m_charsetParser->checkForMetaCharset(data, length))
345 return; 344 return;
346 345
347 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag); 346 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);
348 m_charsetParser.clear(); 347 m_charsetParser.clear();
349 m_checkedForMetaCharset = true; 348 m_checkedForMetaCharset = true;
350 return; 349 return;
351 } 350 }
352 351
352 // We use the encoding detector in two cases:
353 // 1. Encoding detector is turned ON and no other encoding source is
354 // available (that is, it's DefaultEncoding).
355 // 2. Encoding detector is turned ON and the encoding is set to
356 // the encoding of the parent frame, which is also auto-detected.
357 // Note that condition #2 is NOT satisfied unless parent-child frame
358 // relationship is compliant to the same-origin policy. If they're from
359 // different domains, |m_source| would not be set to EncodingFromParentFrame
360 // in the first place.
361 bool TextResourceDecoder::shouldAutoDetect() const
362 {
363 // Just checking m_hintEncoding suffices here because it's only set
364 // in setHintEncoding when the source is AutoDetectedEncoding.
365 return m_encodingDetectionOption == UseAllAutoDetection
366 && (m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding));
367 }
368
353 String TextResourceDecoder::decode(const char* data, size_t len) 369 String TextResourceDecoder::decode(const char* data, size_t len)
354 { 370 {
355 size_t lengthOfBOM = 0; 371 size_t lengthOfBOM = 0;
356 if (!m_checkedForBOM) 372 if (!m_checkedForBOM)
357 lengthOfBOM = checkForBOM(data, len); 373 lengthOfBOM = checkForBOM(data, len);
358 374
359 bool movedDataToBuffer = false; 375 bool movedDataToBuffer = false;
360 376
361 if (m_contentType == CSSContent && !m_checkedForCSSCharset) { 377 if (m_contentType == CSSContent && !m_checkedForCSSCharset) {
362 if (!checkForCSSCharset(data, len, movedDataToBuffer)) 378 if (!checkForCSSCharset(data, len, movedDataToBuffer))
(...skipping 16 matching lines...) Expand all
379 memcpy(m_buffer.data() + oldSize, data, len); 395 memcpy(m_buffer.data() + oldSize, data, len);
380 } 396 }
381 397
382 dataForDecode = m_buffer.data() + lengthOfBOM; 398 dataForDecode = m_buffer.data() + lengthOfBOM;
383 lengthForDecode = m_buffer.size() - lengthOfBOM; 399 lengthForDecode = m_buffer.size() - lengthOfBOM;
384 } 400 }
385 401
386 if (m_contentType == HTMLContent && !m_checkedForMetaCharset) 402 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
387 checkForMetaCharset(dataForDecode, lengthForDecode); 403 checkForMetaCharset(dataForDecode, lengthForDecode);
388 404
389 detectTextEncoding(data, len); 405 if (shouldAutoDetect()) {
406 WTF::TextEncoding detectedEncoding;
407 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
408 setEncoding(detectedEncoding, EncodingFromContentSniffing);
409 }
390 410
391 ASSERT(m_encoding.isValid()); 411 ASSERT(m_encoding.isValid());
392 412
393 if (!m_codec) 413 if (!m_codec)
394 m_codec = newTextCodec(m_encoding); 414 m_codec = newTextCodec(m_encoding);
395 415
396 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 416 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
397 417
398 m_buffer.clear(); 418 m_buffer.clear();
399 return result; 419 return result;
400 } 420 }
401 421
402 // We use the encoding detector in following cases:
403 // 1. Encoding detector is turned ON and no other encoding source is
404 // available (that is, it's DefaultEncoding).
405 // 2. Encoding detector is turned ON and the encoding is set to
406 // the encoding of the parent frame, which is also auto-detected.
407 // Note that condition #2 is NOT satisfied unless parent-child frame
408 // relationship is compliant to the same-origin policy. If they're from
409 // different domains, |m_source| would not be set to EncodingFromParentFrame
410 // in the first place.
411 // We also check if the text is encoded in UTF-8 in case the encoding has not
412 // been determined by auto encoding detector (optional). Then |m_source| needs
413 // to be set to anything but DefaultEncoding to avoid further detection
414 // attempts.
415 void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)
416 {
417 if (!shouldDetectEncoding())
418 return;
419
420 if (m_encodingDetectionOption == UseAllAutoDetection) {
421 WTF::TextEncoding detectedEncoding;
422 if (detectTextEncodingUniversal(data, len, m_hintEncoding, &detectedEncoding)) {
423 setEncoding(detectedEncoding, EncodingFromContentSniffing);
424 return;
425 }
426 }
427
428 if (WTF::Unicode::isUTF8andNotASCII(data, len))
429 setEncoding(UTF8Encoding(), EncodingFromContentSniffing);
430 else
431 m_source = EncodingFromContentSniffing;
432 }
433
434 bool TextResourceDecoder::shouldDetectEncoding() const
435 {
436 // Just checking m_hintEncoding suffices here because it's only set
437 // in setHintEncoding when the source is AutoDetectedEncoding.
438 return m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding);
439 }
440
441 String TextResourceDecoder::flush() 422 String TextResourceDecoder::flush()
442 { 423 {
443 // If we can not identify the encoding even after a document is completely 424 // If we can not identify the encoding even after a document is completely
444 // loaded, we need to detect the encoding if other conditions for 425 // loaded, we need to detect the encoding if other conditions for
445 // autodetection is satisfied. 426 // autodetection is satisfied.
446 if (m_buffer.size() 427 if (m_buffer.size() && shouldAutoDetect()
447 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) { 428 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
448 detectTextEncoding(m_buffer.data(), m_buffer.size()); 429 WTF::TextEncoding detectedEncoding;
430 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
431 setEncoding(detectedEncoding, EncodingFromContentSniffing);
449 } 432 }
450 433
451 if (!m_codec) 434 if (!m_codec)
452 m_codec = newTextCodec(m_encoding); 435 m_codec = newTextCodec(m_encoding);
453 436
454 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 437 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
455 m_buffer.clear(); 438 m_buffer.clear();
456 m_codec.clear(); 439 m_codec.clear();
457 m_checkedForBOM = false; // Skip BOM again when re-decoding. 440 m_checkedForBOM = false; // Skip BOM again when re-decoding.
458 return result; 441 return result;
459 } 442 }
460 443
461 } // namespace blink 444 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698