base/string_util_unittest.cc - Issue 4268: IsStringUTF8 unittest and enforcing UTF-8 in JSON deserialization

Unified Diff: base/string_util_unittest.cc

Issue 4268: IsStringUTF8 unittest and enforcing UTF-8 in JSON deserialization (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 12 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: base/string_util_unittest.cc

===================================================================

--- base/string_util_unittest.cc (revision 2506)

+++ base/string_util_unittest.cc (working copy)

@@ -113,6 +113,71 @@

}

+TEST(StringUtilTest, IsStringUTF8) {

+ EXPECT_TRUE(IsStringUTF8("abc"));

+ EXPECT_TRUE(IsStringUTF8("\xc2\x81"));

+ EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));

+ EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));

+ EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));

+ EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM

+ // surrogate code points

+ EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));

+ EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));

+ EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));

+ // overlong sequences

+ EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000

+ EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "AB"

+ EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000

+ EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080

+ EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff

+ EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D

+ EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091

+ EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800

+ EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM)

+ EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F

+ EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+0825 (overlong 6-byte form)

+ // Beyond U+10FFFF (the upper limit of Unicode codespace)

+ EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000

+ EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes

+ EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes

+ // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE)

+ EXPECT_FALSE(IsStringUTF8("\xfe\xff"));

+ EXPECT_FALSE(IsStringUTF8("\xff\xfe"));

+ EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));

+ EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00"));

+ // Noncharacters: U+xxFFF[EF], where xx is 0x00 through 0x10, and the range U+FDD0..U+FDEF

+ EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE

+ EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // overlong U+FFFE (U+1FFFE would be f0 9f bf be)

+ EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+FFFFF

+ // These should also be false, but currently we pass them through.

+ // Disable them for now.

+#if 0

+ EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0

+ EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF

+#endif

+ // Strings in legacy encodings. We can certainly make up strings

+ // in a legacy encoding that are valid in UTF-8, but in real data,

+ // most of them are invalid as UTF-8.

+ EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1

+ EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR

+ EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5

+ // "abc" with U+201[CD] in windows-125[0-8]

+ EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));

+ // U+0639 U+064E U+0644 U+064E in ISO-8859-6

+ EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));

+ // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7

+ EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));

static const wchar_t* const kConvertRoundtripCases[] = {

L"Google Video",

// "网页图片资讯更多 »"

« no previous file with comments | « base/string_util.cc ('k') | net/base/net_util.cc » ('j') | no next file with comments »