third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp - Issue 1721373002: UTF-8 detector for pages missing encoding info

Side by Side Diff: third_party/WebKit/Source/platform/text/TextEncodingDetector.cpp

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« third_party/WebKit/Source/core/html/parser/TextResourceDecoder.cpp ('K') | « third_party/WebKit/Source/platform/text/TextEncodingDetector.h ('k') | third_party/WebKit/Source/platform/text/TextEncodingDetectorTest.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.	2 * Copyright (C) 2008, 2009 Google Inc. All rights reserved.

3 *	3 *

4 * Redistribution and use in source and binary forms, with or without	4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions are	5 * modification, are permitted provided that the following conditions are

6 * met:	6 * met:

7 *	7 *

8 * * Redistributions of source code must retain the above copyright	8 * * Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.	9 * notice, this list of conditions and the following disclaimer.

10 * * Redistributions in binary form must reproduce the above	10 * * Redistributions in binary form must reproduce the above

(...skipping 15 matching lines...) Expand all Loading...
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT	26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE	27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.	28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29 */	29 */

30	30

31 #include "platform/text/TextEncodingDetector.h"	31 #include "platform/text/TextEncodingDetector.h"

32	32

33 #include "wtf/text/TextEncoding.h"	33 #include "wtf/text/TextEncoding.h"

34 #include <unicode/ucnv.h>	34 #include <unicode/ucnv.h>

35 #include <unicode/ucsdet.h>	35 #include <unicode/ucsdet.h>

	36 #include <unicode/utf8.h>

36	37

37 namespace blink {	38 namespace blink {

38	39

39 bool detectTextEncoding(const char* data, size_t length,	40 bool detectTextEncoding(const char* data, size_t length,

40 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)	41 const char* hintEncodingName, WTF::TextEncoding* detectedEncoding)

41 {	42 {

42 *detectedEncoding = WTF::TextEncoding();	43 *detectedEncoding = WTF::TextEncoding();

43 int matchesCount = 0;	44 int matchesCount = 0;

44 UErrorCode status = U_ZERO_ERROR;	45 UErrorCode status = U_ZERO_ERROR;

45 UCharsetDetector* detector = ucsdet_open(&status);	46 UCharsetDetector* detector = ucsdet_open(&status);

(...skipping 59 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
105 encoding = ucsdet_getName(matches[0], &status);	106 encoding = ucsdet_getName(matches[0], &status);

106 if (U_SUCCESS(status)) {	107 if (U_SUCCESS(status)) {

107 *detectedEncoding = WTF::TextEncoding(encoding);	108 *detectedEncoding = WTF::TextEncoding(encoding);

108 ucsdet_close(detector);	109 ucsdet_close(detector);

109 return true;	110 return true;

110 }	111 }

111 ucsdet_close(detector);	112 ucsdet_close(detector);

112 return false;	113 return false;

113 }	114 }

114	115

	116 bool isUTF8Encoded(const char* data, size_t length)

	117 {

	118 int32_t srcLen = static_cast<int32_t>(length);

	119 int32_t charIndex = 0;

	120 bool markDetected = false;

	121

	122 while (charIndex < srcLen) {

	123 int32_t codePoint;

	124 if ((uint8_t)(data[charIndex]) >= 0x80)

	125 markDetected = true;

	126 U8_NEXT(data, charIndex, srcLen, codePoint);

	127 if (!U_IS_UNICODE_CHAR(codePoint))
	aelias_OOO_until_Jul13 2016/02/24 04:37:54
	According to the http://icu-project.org/apiref/icu4c/utf8_8h.html reference for U8_NEXT, "If the offset points to a trail byte or an illegal UTF-8 sequence, then c is set to a negative value.", so how about just checking for < 0 instead?
	Jinsuk Kim 2016/02/24 06:54:54
	On 2016/02/24 04:37:54, aelias wrote:
	> According to http://icu-project.org/apiref/icu4c/utf8_8h.html reference for
	> U8_NEXT, "If the offset points to a trail byte or an illegal UTF-8 sequence,
	> then c is set to a negative value.", so how about just checking for < 0 instead?
	Thanks for looking into the detail. I ran the unittests again with codePoint < 0 and found that it works in almost all cases, except for the following:
	// Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF>
	EXPECT_FALSE(isUTF8Encoded("\xef\xbf\xbe", 3)); // U+FFFE
	EXPECT_FALSE(isUTF8Encoded("\xf3\xbf\xbf\xbf", 4)); // U+10FFFF
	EXPECT_FALSE(isUTF8Encoded("\xef\xb7\x90", 3)); // U+FDD0
	EXPECT_FALSE(isUTF8Encoded("\xef\xb7\xaf", 3)); // U+FDEF
	Not sure how important/practical these cases are.
	128 return false;

	129 }

	130 return markDetected;

	131 }

	132

115 } // namespace blink	133 } // namespace blink

OLD	NEW