Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(4)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 10 matching lines...) Expand all
21 21
22 #include "core/html/parser/TextResourceDecoder.h" 22 #include "core/html/parser/TextResourceDecoder.h"
23 23
24 #include "core/HTMLNames.h" 24 #include "core/HTMLNames.h"
25 #include "core/dom/DOMImplementation.h" 25 #include "core/dom/DOMImplementation.h"
26 #include "core/html/parser/HTMLMetaCharsetParser.h" 26 #include "core/html/parser/HTMLMetaCharsetParser.h"
27 #include "platform/text/TextEncodingDetector.h" 27 #include "platform/text/TextEncodingDetector.h"
28 #include "wtf/StringExtras.h" 28 #include "wtf/StringExtras.h"
29 #include "wtf/text/TextCodec.h" 29 #include "wtf/text/TextCodec.h"
30 #include "wtf/text/TextEncodingRegistry.h" 30 #include "wtf/text/TextEncodingRegistry.h"
31 #include "wtf/text/UTF8.h"
31 32
32 using namespace WTF; 33 using namespace WTF;
33 34
34 namespace blink { 35 namespace blink {
35 36
36 using namespace HTMLNames; 37 using namespace HTMLNames;
37 38
38 const int minimumLengthOfXMLDeclaration = 8; 39 const int minimumLengthOfXMLDeclaration = 8;
39 40
40 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4) 41 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)
(...skipping 354 matching lines...) Expand 10 before | Expand all | Expand 10 after
395 memcpy(m_buffer.data() + oldSize, data, len); 396 memcpy(m_buffer.data() + oldSize, data, len);
396 } 397 }
397 398
398 dataForDecode = m_buffer.data() + lengthOfBOM; 399 dataForDecode = m_buffer.data() + lengthOfBOM;
399 lengthForDecode = m_buffer.size() - lengthOfBOM; 400 lengthForDecode = m_buffer.size() - lengthOfBOM;
400 } 401 }
401 402
402 if (m_contentType == HTMLContent && !m_checkedForMetaCharset) 403 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
403 checkForMetaCharset(dataForDecode, lengthForDecode); 404 checkForMetaCharset(dataForDecode, lengthForDecode);
404 405
405 if (shouldAutoDetect()) { 406 detectTextEncoding(data, len);
406 WTF::TextEncoding detectedEncoding;
407 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
408 setEncoding(detectedEncoding, EncodingFromContentSniffing);
409 }
410 407
411 ASSERT(m_encoding.isValid()); 408 ASSERT(m_encoding.isValid());
412 409
413 if (!m_codec) 410 if (!m_codec)
414 m_codec = newTextCodec(m_encoding); 411 m_codec = newTextCodec(m_encoding);
415 412
416 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 413 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
417 414
418 m_buffer.clear(); 415 m_buffer.clear();
419 return result; 416 return result;
420 } 417 }
421 418
419 void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)
420 {
421 if (shouldAutoDetect()) {
422 WTF::TextEncoding detectedEncoding;
423 if (detectTextEncodingUniversal(data, len, m_hintEncoding, &detectedEncoding))
424 setEncoding(detectedEncoding, EncodingFromContentSniffing);
425 return;
426 }
427 if ((m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding))) {
jungshik at Google 2016/03/24 06:15:08 nit: The above condition is shared by shouldAuto
Jinsuk Kim 2016/03/25 02:15:42 Done.
428 if (WTF::Unicode::isUTF8Encoded(data, len))
429 setEncoding(UTF8Encoding(), EncodingFromContentSniffing);
jungshik at Google 2016/03/24 06:15:08 Given that isUTF8Encoded excludes 'ASCII' (by chec
Jinsuk Kim 2016/03/25 02:15:42 Makes sense. Done.
430 }
431 }
432
422 String TextResourceDecoder::flush() 433 String TextResourceDecoder::flush()
423 { 434 {
424 // If we can not identify the encoding even after a document is completely 435 // If we can not identify the encoding even after a document is completely
425 // loaded, we need to detect the encoding if other conditions for 436 // loaded, we need to detect the encoding if other conditions for
426 // autodetection is satisfied. 437 // autodetection is satisfied.
427 if (m_buffer.size() && shouldAutoDetect() 438 if (m_buffer.size()
428 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) { 439 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
429 WTF::TextEncoding detectedEncoding; 440 detectTextEncoding(m_buffer.data(), m_buffer.size());
430 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
431 setEncoding(detectedEncoding, EncodingFromContentSniffing);
432 } 441 }
433 442
434 if (!m_codec) 443 if (!m_codec)
435 m_codec = newTextCodec(m_encoding); 444 m_codec = newTextCodec(m_encoding);
436 445
437 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 446 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
438 m_buffer.clear(); 447 m_buffer.clear();
439 m_codec.clear(); 448 m_codec.clear();
440 m_checkedForBOM = false; // Skip BOM again when re-decoding. 449 m_checkedForBOM = false; // Skip BOM again when re-decoding.
441 return result; 450 return result;
442 } 451 }
443 452
444 } // namespace blink 453 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698