third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp - Issue 1888083002: Revert of UTF-8 detector for pages missing encoding info

Side by Side Diff: third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp

Issue 1888083002: Revert of UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« no previous file with comments | « third_party/WebKit/Source/core/html/parser/TextResourceDecoder.h ('k') | third_party/WebKit/Source/core/xmlhttprequest/XMLHttpRequest.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)	2 Copyright (C) 1999 Lars Knoll (knoll@mpi-hd.mpg.de)

3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.	3 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2012 Apple Inc. All rights reserved.

4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)	4 Copyright (C) 2005, 2006, 2007 Alexey Proskuryakov (ap@nypop.com)

5	5

6 This library is free software; you can redistribute it and/or	6 This library is free software; you can redistribute it and/or

7 modify it under the terms of the GNU Library General Public	7 modify it under the terms of the GNU Library General Public

8 License as published by the Free Software Foundation; either	8 License as published by the Free Software Foundation; either

9 version 2 of the License, or (at your option) any later version.	9 version 2 of the License, or (at your option) any later version.

10	10

(...skipping 10 matching lines...) Expand all Loading...
21	21

22 #include "core/html/parser/TextResourceDecoder.h"	22 #include "core/html/parser/TextResourceDecoder.h"

23	23

24 #include "core/HTMLNames.h"	24 #include "core/HTMLNames.h"

25 #include "core/dom/DOMImplementation.h"	25 #include "core/dom/DOMImplementation.h"

26 #include "core/html/parser/HTMLMetaCharsetParser.h"	26 #include "core/html/parser/HTMLMetaCharsetParser.h"

27 #include "platform/text/TextEncodingDetector.h"	27 #include "platform/text/TextEncodingDetector.h"

28 #include "wtf/StringExtras.h"	28 #include "wtf/StringExtras.h"

29 #include "wtf/text/TextCodec.h"	29 #include "wtf/text/TextCodec.h"

30 #include "wtf/text/TextEncodingRegistry.h"	30 #include "wtf/text/TextEncodingRegistry.h"

31 #include "wtf/text/UTF8.h"

32	31

33 using namespace WTF;	32 using namespace WTF;

34	33

35 namespace blink {	34 namespace blink {

36	35

37 using namespace HTMLNames;	36 using namespace HTMLNames;

38	37

39 const int minimumLengthOfXMLDeclaration = 8;	38 const int minimumLengthOfXMLDeclaration = 8;

40	39

41 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)	40 static inline bool bytesEqual(const char* p, char b0, char b1, char b2, char b3, char b4)

(...skipping 301 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
343	342

344 if (!m_charsetParser->checkForMetaCharset(data, length))	343 if (!m_charsetParser->checkForMetaCharset(data, length))

345 return;	344 return;

346	345

347 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);	346 setEncoding(m_charsetParser->encoding(), EncodingFromMetaTag);

348 m_charsetParser.clear();	347 m_charsetParser.clear();

349 m_checkedForMetaCharset = true;	348 m_checkedForMetaCharset = true;

350 return;	349 return;

351 }	350 }

352	351

353 String TextResourceDecoder::decode(const char* data, size_t len)	352 // We use the encoding detector in two cases:

354 {

355 size_t lengthOfBOM = 0;

356 if (!m_checkedForBOM)

357 lengthOfBOM = checkForBOM(data, len);

358

359 bool movedDataToBuffer = false;

360

361 if (m_contentType == CSSContent && !m_checkedForCSSCharset) {

362 if (!checkForCSSCharset(data, len, movedDataToBuffer))

363 return emptyString();

364 }

365

366 // We check XML declaration in HTML content only if there is enough data available

367 if (((m_contentType == HTMLContent && len >= minimumLengthOfXMLDeclaration) \|\| m_contentType == XMLContent) && !m_checkedForXMLCharset) {

368 if (!checkForXMLCharset(data, len, movedDataToBuffer))

369 return emptyString();

370 }

371

372 const char* dataForDecode = data + lengthOfBOM;

373 size_t lengthForDecode = len - lengthOfBOM;

374

375 if (!m_buffer.isEmpty()) {

376 if (!movedDataToBuffer) {

377 size_t oldSize = m_buffer.size();

378 m_buffer.grow(oldSize + len);

379 memcpy(m_buffer.data() + oldSize, data, len);

380 }

381

382 dataForDecode = m_buffer.data() + lengthOfBOM;

383 lengthForDecode = m_buffer.size() - lengthOfBOM;

384 }

385

386 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)

387 checkForMetaCharset(dataForDecode, lengthForDecode);

388

389 detectTextEncoding(data, len);

390

391 ASSERT(m_encoding.isValid());

392

393 if (!m_codec)

394 m_codec = newTextCodec(m_encoding);

395

396 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

397

398 m_buffer.clear();

399 return result;

400 }

401

402 // We use the encoding detector in following cases:

403 // 1. Encoding detector is turned ON and no other encoding source is	353 // 1. Encoding detector is turned ON and no other encoding source is

404 // available (that is, it's DefaultEncoding).	354 // available (that is, it's DefaultEncoding).

405 // 2. Encoding detector is turned ON and the encoding is set to	355 // 2. Encoding detector is turned ON and the encoding is set to

406 // the encoding of the parent frame, which is also auto-detected.	356 // the encoding of the parent frame, which is also auto-detected.

407 // Note that condition #2 is NOT satisfied unless parent-child frame	357 // Note that condition #2 is NOT satisfied unless parent-child frame

408 // relationship is compliant to the same-origin policy. If they're from	358 // relationship is compliant to the same-origin policy. If they're from

409 // different domains, \|m_source\| would not be set to EncodingFromParentFrame	359 // different domains, \|m_source\| would not be set to EncodingFromParentFrame

410 // in the first place.	360 // in the first place.

411 void TextResourceDecoder::detectTextEncoding(const char* data, size_t len)	361 bool TextResourceDecoder::shouldAutoDetect() const

412 {

413 if (!shouldDetectEncoding())

414 return;

415

416 if (WTF::Unicode::isUTF8andNotASCII(data, len)) {

417 setEncoding(UTF8Encoding(), EncodingFromContentSniffing);

418 return;

419 }

420 if (m_encodingDetectionOption == UseAllAutoDetection) {

421 WTF::TextEncoding detectedEncoding;

422 if (detectTextEncodingUniversal(data, len, m_hintEncoding, &detectedEncoding))

423 setEncoding(detectedEncoding, EncodingFromContentSniffing);

424 }

425 }

426

427 bool TextResourceDecoder::shouldDetectEncoding() const

428 {	362 {

429 // Just checking m_hintEncoding suffices here because it's only set	363 // Just checking m_hintEncoding suffices here because it's only set

430 // in setHintEncoding when the source is AutoDetectedEncoding.	364 // in setHintEncoding when the source is AutoDetectedEncoding.

431 return m_source == DefaultEncoding \|\| (m_source == EncodingFromParentFrame && m_hintEncoding);	365 return m_encodingDetectionOption == UseAllAutoDetection

	366 && (m_source == DefaultEncoding \|\| (m_source == EncodingFromParentFrame && m_hintEncoding));

	367 }

	368

	369 String TextResourceDecoder::decode(const char* data, size_t len)

	370 {

	371 size_t lengthOfBOM = 0;

	372 if (!m_checkedForBOM)

	373 lengthOfBOM = checkForBOM(data, len);

	374

	375 bool movedDataToBuffer = false;

	376

	377 if (m_contentType == CSSContent && !m_checkedForCSSCharset) {

	378 if (!checkForCSSCharset(data, len, movedDataToBuffer))

	379 return emptyString();

	380 }

	381

	382 // We check XML declaration in HTML content only if there is enough data available

	383 if (((m_contentType == HTMLContent && len >= minimumLengthOfXMLDeclaration) \|\| m_contentType == XMLContent) && !m_checkedForXMLCharset) {

	384 if (!checkForXMLCharset(data, len, movedDataToBuffer))

	385 return emptyString();

	386 }

	387

	388 const char* dataForDecode = data + lengthOfBOM;

	389 size_t lengthForDecode = len - lengthOfBOM;

	390

	391 if (!m_buffer.isEmpty()) {

	392 if (!movedDataToBuffer) {

	393 size_t oldSize = m_buffer.size();

	394 m_buffer.grow(oldSize + len);

	395 memcpy(m_buffer.data() + oldSize, data, len);

	396 }

	397

	398 dataForDecode = m_buffer.data() + lengthOfBOM;

	399 lengthForDecode = m_buffer.size() - lengthOfBOM;

	400 }

	401

	402 if (m_contentType == HTMLContent && !m_checkedForMetaCharset)

	403 checkForMetaCharset(dataForDecode, lengthForDecode);

	404

	405 if (shouldAutoDetect()) {

	406 WTF::TextEncoding detectedEncoding;

	407 if (detectTextEncoding(data, len, m_hintEncoding, &detectedEncoding))

	408 setEncoding(detectedEncoding, EncodingFromContentSniffing);

	409 }

	410

	411 ASSERT(m_encoding.isValid());

	412

	413 if (!m_codec)

	414 m_codec = newTextCodec(m_encoding);

	415

	416 String result = m_codec->decode(dataForDecode, lengthForDecode, DoNotFlush, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

	417

	418 m_buffer.clear();

	419 return result;

432 }	420 }

433	421

434 String TextResourceDecoder::flush()	422 String TextResourceDecoder::flush()

435 {	423 {

436 // If we can not identify the encoding even after a document is completely	424 // If we can not identify the encoding even after a document is completely

437 // loaded, we need to detect the encoding if other conditions for	425 // loaded, we need to detect the encoding if other conditions for

438 // autodetection is satisfied.	426 // autodetection is satisfied.

439 if (m_buffer.size()	427 if (m_buffer.size() && shouldAutoDetect()

440 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent \|\| m_contentType == XMLContent)) \|\| (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {	428 && ((!m_checkedForXMLCharset && (m_contentType == HTMLContent \|\| m_contentType == XMLContent)) \|\| (!m_checkedForCSSCharset && (m_contentType == CSSContent)))) {

441 detectTextEncoding(m_buffer.data(), m_buffer.size());	429 WTF::TextEncoding detectedEncoding;

	430 if (detectTextEncoding(m_buffer.data(), m_buffer.size(), m_hintEncoding, &detectedEncoding))

	431 setEncoding(detectedEncoding, EncodingFromContentSniffing);

442 }	432 }

443	433

444 if (!m_codec)	434 if (!m_codec)

445 m_codec = newTextCodec(m_encoding);	435 m_codec = newTextCodec(m_encoding);

446	436

447 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);	437 String result = m_codec->decode(m_buffer.data(), m_buffer.size(), FetchEOF, m_contentType == XMLContent && !m_useLenientXMLDecoding, m_sawError);

448 m_buffer.clear();	438 m_buffer.clear();

449 m_codec.clear();	439 m_codec.clear();

450 m_checkedForBOM = false; // Skip BOM again when re-decoding.	440 m_checkedForBOM = false; // Skip BOM again when re-decoding.

451 return result;	441 return result;

452 }	442 }

453	443

454 } // namespace blink	444 } // namespace blink

OLD	NEW