Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(42)

Side by Side Diff: base/string_util_unittest.cc

Issue 4268: IsStringUTF8 unittest and enforcing UTF-8 in JSON deserialization (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/
Patch Set: '' Created 12 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « base/string_util.cc ('k') | net/base/net_util.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include <math.h> 5 #include <math.h>
6 #include <stdarg.h> 6 #include <stdarg.h>
7 7
8 #include <limits> 8 #include <limits>
9 #include <sstream> 9 #include <sstream>
10 10
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after
106 {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"}, 106 {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"},
107 }; 107 };
108 108
109 TEST(StringUtilTest, CollapseWhitespace) { 109 TEST(StringUtilTest, CollapseWhitespace) {
110 for (size_t i = 0; i < arraysize(collapse_cases); ++i) { 110 for (size_t i = 0; i < arraysize(collapse_cases); ++i) {
111 const collapse_case& value = collapse_cases[i]; 111 const collapse_case& value = collapse_cases[i];
112 EXPECT_EQ(value.output, CollapseWhitespace(value.input, value.trim)); 112 EXPECT_EQ(value.output, CollapseWhitespace(value.input, value.trim));
113 } 113 }
114 } 114 }
115 115
116
117 TEST(StringUtilTest, IsStringUTF8) {
118 EXPECT_TRUE(IsStringUTF8("abc"));
119 EXPECT_TRUE(IsStringUTF8("\xc2\x81"));
120 EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf"));
121 EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf"));
122 EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"));
123 EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM
124
125
126 // surrogate code points
127 EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf"));
128 EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f"));
129 EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf"));
130
131 // overlong sequences
132 EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000
133 EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "@A" (U+0040 U+0041)
134 EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000
135 EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080
136 EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff
137 EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D
138 EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091
139 EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800
140 EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM)
141 EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F
142 EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+0825
143
144 // Beyond U+10FFFF (the upper limit of Unicode codespace)
145 EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000
146 EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes
147 EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes
148
149 // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE)
150 EXPECT_FALSE(IsStringUTF8("\xfe\xff"));
151 EXPECT_FALSE(IsStringUTF8("\xff\xfe"));
152 EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4)));
153 EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00"));
154
155 // Non-characters: U+xxFFF[EF] where xx is 0x00 through 0x10, and U+FDD0..U+FDEF
156 EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE
157 EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // overlong 4-byte form of U+FFFE (U+1FFFE would be \xf0\x9f\xbf\xbe)
158 EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+FFFFF (U+10FFFF would be \xf4\x8f\xbf\xbf)
159
160 // These should also be false, but currently we pass them through.
161 // They are disabled for now.
162 #if 0
163 EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0
164 EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF
165 #endif
166
167 // Strings in legacy encodings. We can certainly make up strings
168 // in a legacy encoding that are valid in UTF-8, but in real data,
169 // most of them are invalid as UTF-8.
170 EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1
171 EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC01 in EUC-KR
172 EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5
173 // "abc" with U+201[CD] in windows-125[0-8]
174 EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94"));
175 // U+0639 U+064E U+0644 U+064E in ISO-8859-6
176 EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee"));
177 // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7
178 EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC"));
179 }
180
116 static const wchar_t* const kConvertRoundtripCases[] = { 181 static const wchar_t* const kConvertRoundtripCases[] = {
117 L"Google Video", 182 L"Google Video",
118 // "网页 图片 资讯更多 »" 183 // "网页 图片 资讯更多 »"
119 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", 184 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb",
120 // "Παγκόσμιος Ιστός" 185 // "Παγκόσμιος Ιστός"
121 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" 186 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
122 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", 187 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2",
123 // "Поиск страниц на русском" 188 // "Поиск страниц на русском"
124 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" 189 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442"
125 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" 190 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430"
(...skipping 1182 matching lines...) Expand 10 before | Expand all | Expand 10 after
1308 { L"%f %d %o %u", true }, 1373 { L"%f %d %o %u", true },
1309 { L"%-8d (%02.1f%)", true }, 1374 { L"%-8d (%02.1f%)", true },
1310 { L"% 10s", false }, 1375 { L"% 10s", false },
1311 { L"% 10ls", true } 1376 { L"% 10ls", true }
1312 }; 1377 };
1313 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) { 1378 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) {
1314 EXPECT_EQ(cases[i].portable, base::IsWprintfFormatPortable(cases[i].input)); 1379 EXPECT_EQ(cases[i].portable, base::IsWprintfFormatPortable(cases[i].input));
1315 } 1380 }
1316 } 1381 }
1317 1382
OLDNEW
« no previous file with comments | « base/string_util.cc ('k') | net/base/net_util.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698