third_party/WebKit/Source/wtf/text/UTF8Test.cpp - Issue 1721373002: UTF-8 detector for pages missing encoding info

Side by Side Diff

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Side by Side Diff: third_party/WebKit/Source/wtf/text/UTF8Test.cpp

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 4 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2016 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "wtf/text/UTF8.h"

	6

	7 #include "testing/gtest/include/gtest/gtest.h"

	8

	9 namespace WTF {

	10 namespace Unicode {

	11 // Checks isUTF8Encoded() on well-formed UTF-8, ill-formed sequences and legacy-encoded bytes.

	12 TEST(UTF8Test, testIsUTF8Encoded)

	13 {

	14 EXPECT_TRUE(isUTF8Encoded("\xc2\x81", 2)); // U+0081 (2 bytes)

	15 EXPECT_TRUE(isUTF8Encoded("\xe1\x80\xbf", 3)); // U+103F (3 bytes)

	16 EXPECT_TRUE(isUTF8Encoded("\xf1\x80\xa0\xbf", 4)); // U+4083F (4 bytes)

	17 EXPECT_TRUE(isUTF8Encoded("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf", 10)); // ASCII + the three above

	18

	19 // Surrogate code points (U+D800..U+DFFF) must not appear in UTF-8.

	20 EXPECT_FALSE(isUTF8Encoded("\xed\xa0\x80\xed\xbf\xbf", 6)); // U+D800 U+DFFF

	21 EXPECT_FALSE(isUTF8Encoded("\xed\xa0\x8f", 3)); // U+D80F

	22 EXPECT_FALSE(isUTF8Encoded("\xed\xbf\xbf", 3)); // U+DFFF

	23

	24 // Overlong sequences (more bytes than the shortest form needs)

	25 EXPECT_FALSE(isUTF8Encoded("\xc0\x80", 2)); // U+0000

	26 EXPECT_FALSE(isUTF8Encoded("\xc1\x80\xc1\x81", 4)); // "@A" (U+0040 U+0041)

	27 EXPECT_FALSE(isUTF8Encoded("\xe0\x80\x80", 3)); // U+0000

	28 EXPECT_FALSE(isUTF8Encoded("\xe0\x82\x80", 3)); // U+0080

	29 EXPECT_FALSE(isUTF8Encoded("\xe0\x9f\xbf", 3)); // U+07FF

	30 EXPECT_FALSE(isUTF8Encoded("\xf0\x80\x80\x8D", 4)); // U+000D

	31 EXPECT_FALSE(isUTF8Encoded("\xf0\x80\x82\x91", 4)); // U+0091

	32 EXPECT_FALSE(isUTF8Encoded("\xf0\x80\xa0\x80", 4)); // U+0800

	33 EXPECT_FALSE(isUTF8Encoded("\xf0\x8f\xbb\xbf", 4)); // U+FEFF (BOM)

	34 EXPECT_FALSE(isUTF8Encoded("\xf8\x80\x80\x80\xbf", 5)); // U+003F

	35 EXPECT_FALSE(isUTF8Encoded("\xfc\x80\x80\x80\xa0\xa5", 6)); // U+0825

	36

	37 // Beyond U+10FFFF (the upper limit of Unicode codespace)

	38 EXPECT_FALSE(isUTF8Encoded("\xf4\x90\x80\x80", 4)); // U+110000

	39 EXPECT_FALSE(isUTF8Encoded("\xf8\xa0\xbf\x80\xbf", 5)); // 5 bytes

	40 EXPECT_FALSE(isUTF8Encoded("\xfc\x9c\xbf\x80\xbf\x80", 6)); // 6 bytes

	41

	42 // Non-characters: U+xxFFFE and U+xxFFFF where xx is 0x00 through 0x10, and U+FDD0..U+FDEF

	43 EXPECT_FALSE(isUTF8Encoded("\xef\xbf\xbe", 3)); // U+FFFE

	44 EXPECT_FALSE(isUTF8Encoded("\xf0\x9f\xbf\xbe", 4)); // U+1FFFE (shortest form, so not rejected merely as overlong)

	45 EXPECT_FALSE(isUTF8Encoded("\xf4\x8f\xbf\xbf", 4)); // U+10FFFF

	46 EXPECT_FALSE(isUTF8Encoded("\xef\xb7\x90", 3)); // U+FDD0

	47 EXPECT_FALSE(isUTF8Encoded("\xef\xb7\xaf", 3)); // U+FDEF

	48

	49 // Strings in legacy encodings.

	50 EXPECT_FALSE(isUTF8Encoded("caf\xe9", 4)); // cafe with U+00E9 in ISO-8859-1

	51 EXPECT_FALSE(isUTF8Encoded("\xb0\xa1\xb0\xa2", 4)); // U+AC00, U+AC01 in EUC-KR

	52 EXPECT_FALSE(isUTF8Encoded("\xa7\x41\xa6\x6e", 4)); // U+4F60 U+597D in Big5

	53 // "abc" with U+201[CD] in windows-125[0-8]; the literal is split so "\x93" does not absorb 'a'.

	54 EXPECT_FALSE(isUTF8Encoded("\x93" "abc\x94", 5));

	55 // U+0639 U+064E U+0644 U+064E in ISO-8859-6

	56 EXPECT_FALSE(isUTF8Encoded("\xd9\xee\xe4\xee", 4));

	57 // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7

	58 EXPECT_FALSE(isUTF8Encoded("\xe3\xe5\xe9\xdC", 4));

	59 EXPECT_FALSE(isUTF8Encoded("abc", 3)); // plain ASCII is expected to yield false; NOTE(review): presumably only non-ASCII UTF-8 counts as detected - confirm against UTF8.h

	60 }

	61

	62 } // namespace Unicode

	63 } // namespace WTF

OLD	NEW

« third_party/WebKit/Source/wtf/text/UTF8.cpp ('K') | « third_party/WebKit/Source/wtf/text/UTF8.cpp ('k') | third_party/WebKit/Source/wtf/wtf.gypi » ('j') | no next file with comments »