Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1184)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1456843002: Finch experiment: auto-detect text encoding (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: addressed comments Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved. 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.
4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
5 5
6 This library is free software; you can redistribute it and/or 6 This library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Library General Public 7 modify it under the terms of the GNU Library General Public
8 License as published by the Free Software Foundation; either 8 License as published by the Free Software Foundation; either
9 version 2 of the License, or (at your option) any later version. 9 version 2 of the License, or (at your option) any later version.
10 10
11 This library is distributed in the hope that it will be useful, 11 This library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of 12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Library General Public License for more details. 14 Library General Public License for more details.
15 15
16 You should have received a copy of the GNU Library General Public License 16 You should have received a copy of the GNU Library General Public License
17 along with this library; see the file COPYING.LIB. If not, write to 17 along with this library; see the file COPYING.LIB. If not, write to
18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 18 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
19 Boston, MA 02110-1301, USA. 19 Boston, MA 02110-1301, USA.
20 */ 20 */
21 21
22
23 #include "config.h" 22 #include "config.h"
24 #include "core/html/parser/TextResourceDecoder.h" 23 #include "core/html/parser/TextResourceDecoder.h"
25 24
26 #include "core/HTMLNames.h" 25 #include "core/HTMLNames.h"
27 #include "core/dom/DOMImplementation.h" 26 #include "core/dom/DOMImplementation.h"
28 #include "core/html/parser/HTMLMetaCharsetParser.h" 27 #include "core/html/parser/HTMLMetaCharsetParser.h"
29 #include "platform/text/TextEncodingDetector.h" 28 #include "platform/text/TextEncodingDetector.h"
30 #include "wtf/StringExtras.h" 29 #include "wtf/StringExtras.h"
31 #include "wtf/text/TextCodec.h" 30 #include "wtf/text/TextCodec.h"
32 #include "wtf/text/TextEncodingRegistry.h" 31 #include "wtf/text/TextEncodingRegistry.h"
(...skipping 361 matching lines...) Expand 10 before | Expand all | Expand 10 after
394 } 393 }
395 394
396 dataForDecode = m_buffer.data() + lengthOfBOM; 395 dataForDecode = m_buffer.data() + lengthOfBOM;
397 lengthForDecode = m_buffer.size() - lengthOfBOM; 396 lengthForDecode = m_buffer.size() - lengthOfBOM;
398 } 397 }
399 398
400 if (m_contentType == HTMLContent && !m_checkedForMetaCharset) 399 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
401 checkForMetaCharset(dataForDecode, lengthForDecode); 400 checkForMetaCharset(dataForDecode, lengthForDecode);
402 401
403 if (shouldAutoDetect()) { 402 if (shouldAutoDetect()) {
404 WTF::TextEncoding detectedEncoding; 403 detectTextEncoding(data, len);
405 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))
406 setEncoding(detectedEncoding, EncodingFromContentSniffing);
407 } 404 }
408 405
409 ASSERT(m_encoding.isValid()); 406 ASSERT(m_encoding.isValid());
410 407
411 if (!m_codec) 408 if (!m_codec)
412 m_codec = newTextCodec(m_encoding); 409 m_codec = newTextCodec(m_encoding);
413 410
414 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 411 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
415 412
416 m_buffer.clear(); 413 m_buffer.clear();
417 return result; 414 return result;
418 } 415 }
419 416
417 void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)
418 {
419 WTF::TextEncoding detectedEncoding;
420 bool detected = blink::detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding);
421 if (detected && detectedEncoding != encoding())
422 setEncoding(detectedEncoding, EncodingFromContentSniffing);
423 else
424 setEncoding(detectedEncoding, DefaultEncodingAttemptedSniffing);
425 }
426
420 String TextResourceDecoder::flush() 427 String TextResourceDecoder::flush()
421 { 428 {
422 // If we can not identify the encoding even after a document is completely 429 // If we can not identify the encoding even after a document is completely
423 // loaded, we need to detect the encoding if other conditions for 430 // loaded, we need to detect the encoding if other conditions for
424 // autodetection is satisfied. 431 // autodetection is satisfied.
425 if (m_buffer.size() && shouldAutoDetect() 432 if (m_buffer.size() && shouldAutoDetect()
426 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) { 433 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent || m_contentType == XMLContent)) || (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
427 WTF::TextEncoding detectedEncoding; 434 detectTextEncoding(m_buffer.data(), m_buffer.size());
428 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))
429 setEncoding(detectedEncoding, EncodingFromContentSniffing);
430 } 435 }
431 436
432 if (!m_codec) 437 if (!m_codec)
433 m_codec = newTextCodec(m_encoding); 438 m_codec = newTextCodec(m_encoding);
434 439
435 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 440 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
436 m_buffer.clear(); 441 m_buffer.clear();
437 m_codec.clear(); 442 m_codec.clear();
438 m_checkedForBOM = false; // Skip BOM again when re-decoding. 443 m_checkedForBOM = false; // Skip BOM again when re-decoding.
439 return result; 444 return result;
440 } 445 }
441 446
442 } 447 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698