OLD | NEW |
(Empty) | |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 #include "wtf/text/UTF8.h" |
| 6 |
| 7 #include "testing/gtest/include/gtest/gtest.h" |
| 8 |
| 9 namespace WTF { |
| 10 namespace Unicode { |
| 11 |
| 12 TEST(UTF8Test, testIsUTF8Encoded) |
| 13 { |
| 14 EXPECT_TRUE(isUTF8Encoded("\xc2\x81", 2)); |
| 15 EXPECT_TRUE(isUTF8Encoded("\xe1\x80\xbf", 3)); |
| 16 EXPECT_TRUE(isUTF8Encoded("\xf1\x80\xa0\xbf", 4)); |
| 17 EXPECT_TRUE(isUTF8Encoded("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf", 10)); |
| 18 |
| 19 // Surrogate code points |
| 20 EXPECT_FALSE(isUTF8Encoded("\xed\xa0\x80\xed\xbf\xbf", 6)); |
| 21 EXPECT_FALSE(isUTF8Encoded("\xed\xa0\x8f", 3)); |
| 22 EXPECT_FALSE(isUTF8Encoded("\xed\xbf\xbf", 3)); |
| 23 |
| 24 // Overlong sequences |
| 25 EXPECT_FALSE(isUTF8Encoded("\xc0\x80", 2)); // U+0000 |
| 26 EXPECT_FALSE(isUTF8Encoded("\xc1\x80\xc1\x81", 4)); // "AB" |
| 27 EXPECT_FALSE(isUTF8Encoded("\xe0\x80\x80", 3)); // U+0000 |
| 28 EXPECT_FALSE(isUTF8Encoded("\xe0\x82\x80", 3)); // U+0080 |
| 29 EXPECT_FALSE(isUTF8Encoded("\xe0\x9f\xbf", 3)); // U+07ff |
| 30 EXPECT_FALSE(isUTF8Encoded("\xf0\x80\x80\x8D", 4)); // U+000D |
| 31 EXPECT_FALSE(isUTF8Encoded("\xf0\x80\x82\x91", 4)); // U+0091 |
| 32 EXPECT_FALSE(isUTF8Encoded("\xf0\x80\xa0\x80", 4)); // U+0800 |
| 33 EXPECT_FALSE(isUTF8Encoded("\xf0\x8f\xbb\xbf", 4)); // U+FEFF (BOM) |
| 34 EXPECT_FALSE(isUTF8Encoded("\xf8\x80\x80\x80\xbf", 5)); // U+003F |
| 35 EXPECT_FALSE(isUTF8Encoded("\xfc\x80\x80\x80\xa0\xa5", 6)); // U+00A5 |
| 36 |
| 37 // Beyond U+10FFFF (the upper limit of Unicode codespace) |
| 38 EXPECT_FALSE(isUTF8Encoded("\xf4\x90\x80\x80", 4)); // U+110000 |
| 39 EXPECT_FALSE(isUTF8Encoded("\xf8\xa0\xbf\x80\xbf", 5)); // 5 bytes |
| 40 EXPECT_FALSE(isUTF8Encoded("\xfc\x9c\xbf\x80\xbf\x80", 6)); // 6 bytes |
| 41 |
| 42 // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF
> |
| 43 EXPECT_FALSE(isUTF8Encoded("\xef\xbf\xbe", 3)); // U+FFFE |
| 44 EXPECT_FALSE(isUTF8Encoded("\xf0\x8f\xbf\xbe", 4)); // U+1FFFE |
| 45 EXPECT_FALSE(isUTF8Encoded("\xf3\xbf\xbf\xbf", 4)); // U+10FFFF |
| 46 EXPECT_FALSE(isUTF8Encoded("\xef\xb7\x90", 3)); // U+FDD0 |
| 47 EXPECT_FALSE(isUTF8Encoded("\xef\xb7\xaf", 3)); // U+FDEF |
| 48 |
| 49 // Strings in legacy encodings. |
| 50 EXPECT_FALSE(isUTF8Encoded("caf\xe9", 4)); // cafe with U+00E9 in ISO-8859-1 |
| 51 EXPECT_FALSE(isUTF8Encoded("\xb0\xa1\xb0\xa2", 4)); // U+AC00, U+AC001 in EU
C-KR |
| 52 EXPECT_FALSE(isUTF8Encoded("\xa7\x41\xa6\x6e", 4)); // U+4F60 U+597D in Big5 |
| 53 // "abc" with U+201[CD] in windows-125[0-8] |
| 54 EXPECT_FALSE(isUTF8Encoded("\x93" "abc\x94", 4)); |
| 55 // U+0639 U+064E U+0644 U+064E in ISO-8859-6 |
| 56 EXPECT_FALSE(isUTF8Encoded("\xd9\xee\xe4\xee", 4)); |
| 57 // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 |
| 58 EXPECT_FALSE(isUTF8Encoded("\xe3\xe5\xe9\xdC", 4)); |
| 59 EXPECT_FALSE(isUTF8Encoded("abc", 3)); // plain ASCII |
| 60 } |
| 61 |
| 62 } // namespace Unicode |
| 63 } // namespace WTF |
OLD | NEW |