Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(7)

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 2655203002: Merge "Pass more hints to encoding detector." to M57 branch (Closed)
Patch Set: Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de) 2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)
3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All 3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All
4 rights reserved. 4 rights reserved.
5 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com) 5 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)
6 6
7 This library is free software; you can redistribute it and/or 7 This library is free software; you can redistribute it and/or
8 modify it under the terms of the GNU Library General Public 8 modify it under the terms of the GNU Library General Public
9 License as published by the Free Software Foundation; either 9 License as published by the Free Software Foundation; either
10 version 2 of the License, or (at your option) any later version. 10 version 2 of the License, or (at your option) any later version.
11 11
12 This library is distributed in the hope that it will be useful, 12 This library is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Library General Public License for more details. 15 Library General Public License for more details.
16 16
17 You should have received a copy of the GNU Library General Public License 17 You should have received a copy of the GNU Library General Public License
18 along with this library; see the file COPYING.LIB. If not, write to 18 along with this library; see the file COPYING.LIB. If not, write to
19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
20 Boston, MA 02110-1301, USA. 20 Boston, MA 02110-1301, USA.
21 */ 21 */
22 22
23 #include "core/html/parser/TextResourceDecoder.h" 23 #include "core/html/parser/TextResourceDecoder.h"
24 24
25 #include "core/HTMLNames.h" 25 #include "core/HTMLNames.h"
26 #include "core/dom/DOMImplementation.h" 26 #include "core/dom/DOMImplementation.h"
27 #include "core/html/parser/HTMLMetaCharsetParser.h" 27 #include "core/html/parser/HTMLMetaCharsetParser.h"
28 #include "platform/Language.h"
28 #include "platform/text/TextEncodingDetector.h" 29 #include "platform/text/TextEncodingDetector.h"
29 #include "wtf/StringExtras.h" 30 #include "wtf/StringExtras.h"
30 #include "wtf/text/TextCodec.h" 31 #include "wtf/text/TextCodec.h"
31 #include "wtf/text/TextEncodingRegistry.h" 32 #include "wtf/text/TextEncodingRegistry.h"
32 33
33 using namespace WTF; 34 using namespace WTF;
34 35
35 namespace blink { 36 namespace blink {
36 37
37 using namespace HTMLNames; 38 using namespace HTMLNames;
(...skipping 97 matching lines...) Expand 10 before | Expand all | Expand 10 after
135 if (contentType == XMLContent) 136 if (contentType == XMLContent)
136 return UTF8Encoding(); 137 return UTF8Encoding();
137 if (!specifiedDefaultEncoding.isValid()) 138 if (!specifiedDefaultEncoding.isValid())
138 return Latin1Encoding(); 139 return Latin1Encoding();
139 return specifiedDefaultEncoding; 140 return specifiedDefaultEncoding;
140 } 141 }
141 142
142 TextResourceDecoder::TextResourceDecoder( 143 TextResourceDecoder::TextResourceDecoder(
143 const String& mimeType, 144 const String& mimeType,
144 const WTF::TextEncoding& specifiedDefaultEncoding, 145 const WTF::TextEncoding& specifiedDefaultEncoding,
145 EncodingDetectionOption encodingDetectionOption) 146 EncodingDetectionOption encodingDetectionOption,
147 const String& url)
146 : m_contentType(determineContentType(mimeType)), 148 : m_contentType(determineContentType(mimeType)),
147 m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)), 149 m_encoding(defaultEncoding(m_contentType, specifiedDefaultEncoding)),
148 m_source(DefaultEncoding), 150 m_source(DefaultEncoding),
149 m_hintEncoding(0), 151 m_hintEncoding(0),
152 m_hintUrl(url.utf8()),
150 m_checkedForBOM(false), 153 m_checkedForBOM(false),
151 m_checkedForCSSCharset(false), 154 m_checkedForCSSCharset(false),
152 m_checkedForXMLCharset(false), 155 m_checkedForXMLCharset(false),
153 m_checkedForMetaCharset(false), 156 m_checkedForMetaCharset(false),
154 m_useLenientXMLDecoding(false), 157 m_useLenientXMLDecoding(false),
155 m_sawError(false), 158 m_sawError(false),
156 m_encodingDetectionOption(encodingDetectionOption) { 159 m_encodingDetectionOption(encodingDetectionOption) {
157 if (m_encodingDetectionOption == AlwaysUseUTF8ForText) 160 m_hintLanguage[0] = 0;
161 if (m_encodingDetectionOption == AlwaysUseUTF8ForText) {
158 ASSERT(m_contentType == PlainTextContent && m_encoding == UTF8Encoding()); 162 ASSERT(m_contentType == PlainTextContent && m_encoding == UTF8Encoding());
163 } else if (m_encodingDetectionOption == UseAllAutoDetection) {
164 // Skip the language hint when the URL is empty; this helps unit tests,
165 // where providing defaultLanguage() is sometimes difficult.
166 if (!url.isEmpty()) {
167 // This object is created on the main thread but used on another thread,
168 // so copy the characters we need rather than sharing an AtomicString.
169 AtomicString locale = defaultLanguage();
170 if (locale.length() >= 2) {
171 // defaultLanguage() is always an ASCII string.
172 m_hintLanguage[0] = static_cast<char>(locale[0]);
173 m_hintLanguage[1] = static_cast<char>(locale[1]);
174 m_hintLanguage[2] = 0;
175 }
176 }
177 }
159 } 178 }
160 179
161 TextResourceDecoder::~TextResourceDecoder() {} 180 TextResourceDecoder::~TextResourceDecoder() {}
162 181
163 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding, 182 void TextResourceDecoder::setEncoding(const WTF::TextEncoding& encoding,
164 EncodingSource source) { 183 EncodingSource source) {
165 // In case the encoding didn't exist, we keep the old one (helps some sites 184 // In case the encoding didn't exist, we keep the old one (helps some sites
166 // specifying invalid encodings). 185 // specifying invalid encodings).
167 if (!encoding.isValid()) 186 if (!encoding.isValid())
168 return; 187 return;
(...skipping 279 matching lines...) Expand 10 before | Expand all | Expand 10 after
448 467
449 dataForDecode = m_buffer.data() + lengthOfBOM; 468 dataForDecode = m_buffer.data() + lengthOfBOM;
450 lengthForDecode = m_buffer.size() - lengthOfBOM; 469 lengthForDecode = m_buffer.size() - lengthOfBOM;
451 } 470 }
452 471
453 if (m_contentType == HTMLContent && !m_checkedForMetaCharset) 472 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)
454 checkForMetaCharset(dataForDecode, lengthForDecode); 473 checkForMetaCharset(dataForDecode, lengthForDecode);
455 474
456 if (shouldAutoDetect()) { 475 if (shouldAutoDetect()) {
457 WTF::TextEncoding detectedEncoding; 476 WTF::TextEncoding detectedEncoding;
458 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding)) 477 if (detectTextEncoding(data, len, m_hintEncoding, m_hintUrl.data(),
478 m_hintLanguage, &detectedEncoding))
459 setEncoding(detectedEncoding, EncodingFromContentSniffing); 479 setEncoding(detectedEncoding, EncodingFromContentSniffing);
460 } 480 }
461 481
462 ASSERT(m_encoding.isValid()); 482 ASSERT(m_encoding.isValid());
463 483
464 if (!m_codec) 484 if (!m_codec)
465 m_codec = newTextCodec(m_encoding); 485 m_codec = newTextCodec(m_encoding);
466 486
467 String result = m_codec->decode( 487 String result = m_codec->decode(
468 dataForDecode, lengthForDecode, DoNotFlush, 488 dataForDecode, lengthForDecode, DoNotFlush,
469 m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 489 m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
470 490
471 m_buffer.clear(); 491 m_buffer.clear();
472 return result; 492 return result;
473 } 493 }
474 494
475 String TextResourceDecoder::flush() { 495 String TextResourceDecoder::flush() {
476 // If we can not identify the encoding even after a document is completely 496 // If we can not identify the encoding even after a document is completely
477 // loaded, we need to detect the encoding if other conditions for 497 // loaded, we need to detect the encoding if other conditions for
478 // autodetection are satisfied. 498 // autodetection are satisfied.
479 if (m_buffer.size() && shouldAutoDetect() && 499 if (m_buffer.size() && shouldAutoDetect() &&
480 ((!m_checkedForXMLCharset && 500 ((!m_checkedForXMLCharset &&
481 (m_contentType == HTMLContent || m_contentType == XMLContent)) || 501 (m_contentType == HTMLContent || m_contentType == XMLContent)) ||
482 (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) { 502 (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {
483 WTF::TextEncoding detectedEncoding; 503 WTF::TextEncoding detectedEncoding;
484 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, 504 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding,
485 &detectedEncoding)) 505 m_hintUrl.data(), m_hintLanguage, &detectedEncoding))
486 setEncoding(detectedEncoding, EncodingFromContentSniffing); 506 setEncoding(detectedEncoding, EncodingFromContentSniffing);
487 } 507 }
488 508
489 if (!m_codec) 509 if (!m_codec)
490 m_codec = newTextCodec(m_encoding); 510 m_codec = newTextCodec(m_encoding);
491 511
492 String result = m_codec->decode( 512 String result = m_codec->decode(
493 m_buffer.data(), m_buffer.size(), FetchEOF, 513 m_buffer.data(), m_buffer.size(), FetchEOF,
494 m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError); 514 m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);
495 m_buffer.clear(); 515 m_buffer.clear();
496 m_codec.reset(); 516 m_codec.reset();
497 m_checkedForBOM = false; // Skip BOM again when re-decoding. 517 m_checkedForBOM = false; // Skip BOM again when re-decoding.
498 return result; 518 return result;
499 } 519 }
500 520
501 } // namespace blink 521 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698