base/string_util_unittest.cc - Issue 4268: IsStringUTF8 unittest and enforcing UTF-8 in JSON deserialization

Side by Side Diff: base/string_util_unittest.cc

Issue 4268: IsStringUTF8 unittest and enforcing UTF-8 in JSON deserialization (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: '' Created 12 years, 3 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.	1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include <math.h>	5 #include <math.h>

6 #include <stdarg.h>	6 #include <stdarg.h>

7	7

8 #include <limits>	8 #include <limits>

9 #include <sstream>	9 #include <sstream>

10	10

(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
106 {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"},	106 {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"},

107 };	107 };

108	108

109 TEST(StringUtilTest, CollapseWhitespace) {	109 TEST(StringUtilTest, CollapseWhitespace) {

110 for (size_t i = 0; i < arraysize(collapse_cases); ++i) {	110 for (size_t i = 0; i < arraysize(collapse_cases); ++i) {

111 const collapse_case& value = collapse_cases[i];	111 const collapse_case& value = collapse_cases[i];

112 EXPECT_EQ(value.output, CollapseWhitespace(value.input, value.trim));	112 EXPECT_EQ(value.output, CollapseWhitespace(value.input, value.trim));

113 }	113 }

114 }	114 }

115	115

	116

	117 TEST(StringUtilTest, IsStringUTF8) {

	118 EXPECT_TRUE(IsStringUTF8("abc"));

	119 EXPECT_TRUE(IsStringUTF8("\xc2\x81"));

	120 EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));

	121 EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));

	122 EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));

	123 EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM

	124

	125

	126 // surrogate code points

	127 EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));

	128 EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));

	129 EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));

	130

	131 // overlong sequences

	132 EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000

	133 EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "@A"

	134 EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000

	135 EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080

	136 EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff

	137 EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D

	138 EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091

	139 EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800

	140 EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM)

	141 EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F

	142 EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+0825

	143

	144 // Beyond U+10FFFF (the upper limit of Unicode codespace)

	145 EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000

	146 EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes

	147 EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes

	148

	149 // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE)

	150 EXPECT_FALSE(IsStringUTF8("\xfe\xff"));

	151 EXPECT_FALSE(IsStringUTF8("\xff\xfe"));

	152 EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));

	153 EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00"));

	154

	155 // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF>

	156 EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE

	157 EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // overlong U+FFFE (U+1FFFE is \xf0\x9f\xbf\xbe)

	158 EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+FFFFF

	159

	160 // These should also be false, but currently we pass them through.

	161 // Disable them for now.

	162 #if 0

	163 EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0

	164 EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF

	165 #endif

	166

	167 // Strings in legacy encodings. We can certainly make up strings

	168 // in a legacy encoding that are valid in UTF-8, but in real data,

	169 // most of them are invalid as UTF-8.

	170 EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1

	171 EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR

	172 EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5

	173 // "abc" with U+201[CD] in windows-125[0-8]

	174 EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));

	175 // U+0639 U+064E U+0644 U+064E in ISO-8859-6

	176 EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));

	177 // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7

	178 EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));

	179 }

	180

116 static const wchar_t* const kConvertRoundtripCases[] = {	181 static const wchar_t* const kConvertRoundtripCases[] = {

117 L"Google Video",	182 L"Google Video",

118 // "网页图片资讯更多 »"	183 // "网页图片资讯更多 »"

119 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",	184 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",

120 // "Παγκόσμιος Ιστός"	185 // "Παγκόσμιος Ιστός"

121 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"	186 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"

122 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",	187 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",

123 // "Поиск страниц на русском"	188 // "Поиск страниц на русском"

124 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"	189 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"

125 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"	190 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"

(...skipping 1182 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1308 { L"%f %d %o %u", true },	1373 { L"%f %d %o %u", true },

1309 { L"%-8d (%02.1f%)", true },	1374 { L"%-8d (%02.1f%)", true },

1310 { L"% 10s", false },	1375 { L"% 10s", false },

1311 { L"% 10ls", true }	1376 { L"% 10ls", true }

1312 };	1377 };

1313 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) {	1378 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) {

1314 EXPECT_EQ(cases[i].portable, base::IsWprintfFormatPortable(cases[i].input));	1379 EXPECT_EQ(cases[i].portable, base::IsWprintfFormatPortable(cases[i].input));

1315 }	1380 }

1316 }	1381 }

1317	1382

OLD	NEW

« no previous file with comments | « base/string_util.cc ('k') | net/base/net_util.cc » ('j') | no next file with comments »