Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(68)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
(...skipping 388 matching lines...) Expand 10 before | Expand all | Expand 10 after
399 lengthForDecode = m_buffer.size() - lengthOfBOM; 399 lengthForDecode = m_buffer.size() - lengthOfBOM;
400 } 400 }
401 401
402 if (m_contentType == HTMLContent && !m_checkedForMetaCharset) 402 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
403 checkForMetaCharset(dataForDecode, lengthForDecode); 403 checkForMetaCharset(dataForDecode, lengthForDecode);
404 404
405 if (shouldAutoDetect()) { 405 if (shouldAutoDetect()) {
406 WTF::TextEncoding detectedEncoding; 406 WTF::TextEncoding detectedEncoding;
407 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding)) 407 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
408 setEncoding(detectedEncoding, EncodingFromContentSniffing); 408 setEncoding(detectedEncoding, EncodingFromContentSniffing);
409 } else if ((m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)) && isUTF8Encoded(data, len)) {
aelias_OOO_until_Jul13 2016/02/24 04:37:53 This if statement is duplicative with the other an
Jinsuk Kim 2016/02/24 06:54:54 Done.
410 setEncoding(UTF8Encoding(), EncodingFromContentSniffing);
409 } 411 }
410 412
411 ASSERT(m_encoding.isValid()); 413 ASSERT(m_encoding.isValid());
412 414
413 if (!m_codec) 415 if (!m_codec)
414 m_codec = newTextCodec(m_encoding); 416 m_codec = newTextCodec(m_encoding);
415 417
416 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 418 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
417 419
418 m_buffer.clear(); 420 m_buffer.clear();
419 return result; 421 return result;
420 } 422 }
421 423
422 String TextResourceDecoder::flush() 424 String TextResourceDecoder::flush()
423 { 425 {
424 // If we can not identify the encoding even after a document is completely 426 // If we can not identify the encoding even after a document is completely
425 // loaded, we need to detect the encoding if other conditions for 427 // loaded, we need to detect the encoding if other conditions for
426 // autodetection is satisfied. 428 // autodetection is satisfied.
427 if (m_buffer.size() && shouldAutoDetect() 429 if (m_buffer.size()
428 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) { 430 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
429 WTF::TextEncoding detectedEncoding; 431 if (shouldAutoDetect()) {
430 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding)) 432 WTF::TextEncoding detectedEncoding;
431 setEncoding(detectedEncoding, EncodingFromContentSniffing); 433 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncod ing, &detectedEncoding))
434 setEncoding(detectedEncoding, EncodingFromContentSniffing);
435 } else if ((m_source == DefaultEncoding || (m_source == EncodingFromParentFrame && m_hintEncoding)) && isUTF8Encoded(m_buffer.data(), m_buffer.size())) {
436 setEncoding(UTF8Encoding(), EncodingFromContentSniffing);
437 }
432 } 438 }
433 439
434 if (!m_codec) 440 if (!m_codec)
435 m_codec = newTextCodec(m_encoding); 441 m_codec = newTextCodec(m_encoding);
436 442
437 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 443 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
438 m_buffer.clear(); 444 m_buffer.clear();
439 m_codec.clear(); 445 m_codec.clear();
440 m_checkedForBOM = false; // Skip BOM again when re-decoding. 446 m_checkedForBOM = false; // Skip BOM again when re-decoding.
441 return result; 447 return result;
442 } 448 }
443 449
444 } // namespace blink 450 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698