Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(6)

Side by Side Diff: third_party/WebKit/Source/platform/text/TextEncodingDetectorTest.cpp

Issue 1721373002: UTF-8 detector for pages missing encoding info (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "platform/text/TextEncodingDetector.h"
6
7 #include "testing/gtest/include/gtest/gtest.h"
8
9 namespace blink {
10
11 // Checks isUTF8Encoded(data, length): the expectations below require true for
12 // well-formed multi-byte UTF-8 and false for surrogates, overlong forms, values
13 // beyond U+10FFFF, noncharacters, legacy-encoded text and plain ASCII.
14 // Each "\x.." escape is one byte; the second argument is the byte count.
15 TEST(TextEncodingDetectorTest, testIsUTF8Encoded)
16 {
17     // Well-formed 2-, 3- and 4-byte sequences, alone and after an ASCII byte.
18     EXPECT_TRUE(isUTF8Encoded("\xc2\x81", 2));
19     EXPECT_TRUE(isUTF8Encoded("\xe1\x80\xbf", 3));
20     EXPECT_TRUE(isUTF8Encoded("\xf1\x80\xa0\xbf", 4));
21     EXPECT_TRUE(isUTF8Encoded("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf", 10));
22
23     // Surrogate code points (U+D800..U+DFFF)
24     EXPECT_FALSE(isUTF8Encoded("\xed\xa0\x80\xed\xbf\xbf", 6)); // U+D800 U+DFFF
25     EXPECT_FALSE(isUTF8Encoded("\xed\xa0\x8f", 3)); // U+D80F
26     EXPECT_FALSE(isUTF8Encoded("\xed\xbf\xbf", 3)); // U+DFFF
27
28     // Overlong sequences: the comment names the code point the payload bits
29     // decode to, which has a shorter encoding than the one used here.
30     EXPECT_FALSE(isUTF8Encoded("\xc0\x80", 2)); // U+0000
31     EXPECT_FALSE(isUTF8Encoded("\xc1\x80\xc1\x81", 4)); // U+0040 U+0041 ("@A")
32     EXPECT_FALSE(isUTF8Encoded("\xe0\x80\x80", 3)); // U+0000
33     EXPECT_FALSE(isUTF8Encoded("\xe0\x82\x80", 3)); // U+0080
34     EXPECT_FALSE(isUTF8Encoded("\xe0\x9f\xbf", 3)); // U+07FF
35     EXPECT_FALSE(isUTF8Encoded("\xf0\x80\x80\x8D", 4)); // U+000D
36     EXPECT_FALSE(isUTF8Encoded("\xf0\x80\x82\x91", 4)); // U+0091
37     EXPECT_FALSE(isUTF8Encoded("\xf0\x80\xa0\x80", 4)); // U+0800
38     EXPECT_FALSE(isUTF8Encoded("\xf0\x8f\xbb\xbf", 4)); // U+FEFF (BOM)
39     EXPECT_FALSE(isUTF8Encoded("\xf0\x8f\xbf\xbe", 4)); // U+FFFE
40     EXPECT_FALSE(isUTF8Encoded("\xf8\x80\x80\x80\xbf", 5)); // U+003F
41     EXPECT_FALSE(isUTF8Encoded("\xfc\x80\x80\x80\xa0\xa5", 6)); // U+0825
42
43     // Beyond U+10FFFF (the upper limit of Unicode codespace)
44     EXPECT_FALSE(isUTF8Encoded("\xf4\x90\x80\x80", 4)); // U+110000
45     EXPECT_FALSE(isUTF8Encoded("\xf8\xa0\xbf\x80\xbf", 5)); // 5 bytes
46     EXPECT_FALSE(isUTF8Encoded("\xfc\x9c\xbf\x80\xbf\x80", 6)); // 6 bytes
47
48     // Noncharacters: U+xxFFFE and U+xxFFFF where xx is 0x00 through 0x10,
49     // and the range U+FDD0..U+FDEF. These use shortest-form encodings.
50     EXPECT_FALSE(isUTF8Encoded("\xef\xbf\xbe", 3)); // U+FFFE
51     EXPECT_FALSE(isUTF8Encoded("\xf0\x9f\xbf\xbe", 4)); // U+1FFFE
52     EXPECT_FALSE(isUTF8Encoded("\xf3\xbf\xbf\xbf", 4)); // U+FFFFF
53     EXPECT_FALSE(isUTF8Encoded("\xf4\x8f\xbf\xbf", 4)); // U+10FFFF
54     EXPECT_FALSE(isUTF8Encoded("\xef\xb7\x90", 3)); // U+FDD0
55     EXPECT_FALSE(isUTF8Encoded("\xef\xb7\xaf", 3)); // U+FDEF
56
57     // Strings in legacy encodings.
58     EXPECT_FALSE(isUTF8Encoded("caf\xe9", 4)); // "cafe" with U+00E9 in ISO-8859-1
59     EXPECT_FALSE(isUTF8Encoded("\xb0\xa1\xb0\xa2", 4)); // U+AC00 U+AC01 in EUC-KR
60     EXPECT_FALSE(isUTF8Encoded("\xa7\x41\xa6\x6e", 4)); // U+4F60 U+597D in Big5
61     // "abc" between U+201C and U+201D in windows-125[0-8]. The literal is split
62     // so the hex escape "\x93" does not swallow the hex digits of "abc"; the
63     // whole string is 5 bytes long.
64     EXPECT_FALSE(isUTF8Encoded("\x93" "abc\x94", 5));
65     // U+0639 U+064E U+0644 U+064E in ISO-8859-6
66     EXPECT_FALSE(isUTF8Encoded("\xd9\xee\xe4\xee", 4));
67     // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
68     EXPECT_FALSE(isUTF8Encoded("\xe3\xe5\xe9\xdC", 4));
69
70     // Plain ASCII contains no multi-byte sequence; the expected result is false.
71     EXPECT_FALSE(isUTF8Encoded("abc", 3));
72 }
73
74 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698