OLD | NEW |
1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2006-2008 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include <math.h> | 5 #include <math.h> |
6 #include <stdarg.h> | 6 #include <stdarg.h> |
7 | 7 |
8 #include <limits> | 8 #include <limits> |
9 #include <sstream> | 9 #include <sstream> |
10 | 10 |
(...skipping 95 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
106 {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"}, | 106 {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"}, |
107 }; | 107 }; |
108 | 108 |
109 TEST(StringUtilTest, CollapseWhitespace) { | 109 TEST(StringUtilTest, CollapseWhitespace) { |
110 for (size_t i = 0; i < arraysize(collapse_cases); ++i) { | 110 for (size_t i = 0; i < arraysize(collapse_cases); ++i) { |
111 const collapse_case& value = collapse_cases[i]; | 111 const collapse_case& value = collapse_cases[i]; |
112 EXPECT_EQ(value.output, CollapseWhitespace(value.input, value.trim)); | 112 EXPECT_EQ(value.output, CollapseWhitespace(value.input, value.trim)); |
113 } | 113 } |
114 } | 114 } |
115 | 115 |
| 116 |
| 117 TEST(StringUtilTest, IsStringUTF8) { |
| 118 EXPECT_TRUE(IsStringUTF8("abc")); |
| 119 EXPECT_TRUE(IsStringUTF8("\xc2\x81")); |
| 120 EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf")); |
| 121 EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf")); |
| 122 EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf")); |
| 123 EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM |
| 124 |
| 125 |
| 126 // surrogate code points |
| 127 EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf")); |
| 128 EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f")); |
| 129 EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf")); |
| 130 |
| 131 // overlong sequences |
| 132 EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000 |
| 133 EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "AB" |
| 134 EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000 |
| 135 EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080 |
| 136 EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff |
| 137 EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D |
| 138 EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091 |
| 139 EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800 |
| 140 EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM) |
| 141 EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F |
| 142 EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+00A5 |
| 143 |
| 144 // Beyond U+10FFFF (the upper limit of Unicode codespace) |
| 145 EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000 |
| 146 EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes |
| 147 EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes |
| 148 |
| 149 // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE) |
| 150 EXPECT_FALSE(IsStringUTF8("\xfe\xff")); |
| 151 EXPECT_FALSE(IsStringUTF8("\xff\xfe")); |
| 152 EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4))); |
| 153 EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00")); |
| 154 |
| 155 // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF> |
| 156 EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE) |
| 157 EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // U+1FFFE |
| 158 EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+10FFFF |
| 159 |
| 160 // This should also be false, but currently we pass them through. |
| 161 // Disable them for now. |
| 162 #if 0 |
| 163 EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0 |
| 164 EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF |
| 165 #endif |
| 166 |
| 167 // Strings in legacy encodings. We can certainly make up strings |
| 168 // in a legacy encoding that are valid in UTF-8, but in real data, |
| 169 // most of them are invalid as UTF-8. |
| 170 EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1 |
| 171 EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC001 in EUC-KR |
| 172 EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5 |
| 173 // "abc" with U+201[CD] in windows-125[0-8] |
| 174 EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94")); |
| 175 // U+0639 U+064E U+0644 U+064E in ISO-8859-6 |
| 176 EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee")); |
| 177 // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 |
| 178 EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); |
| 179 } |
| 180 |
116 static const wchar_t* const kConvertRoundtripCases[] = { | 181 static const wchar_t* const kConvertRoundtripCases[] = { |
117 L"Google Video", | 182 L"Google Video", |
118 // "网页 图片 资讯更多 »" | 183 // "网页 图片 资讯更多 »" |
119 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", | 184 L"\x7f51\x9875\x0020\x56fe\x7247\x0020\x8d44\x8baf\x66f4\x591a\x0020\x00bb", |
120 // "Παγκόσμιος Ιστός" | 185 // "Παγκόσμιος Ιστός" |
121 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" | 186 L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9" |
122 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", | 187 L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2", |
123 // "Поиск страниц на русском" | 188 // "Поиск страниц на русском" |
124 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" | 189 L"\x041f\x043e\x0438\x0441\x043a\x0020\x0441\x0442" |
125 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" | 190 L"\x0440\x0430\x043d\x0438\x0446\x0020\x043d\x0430" |
(...skipping 1182 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1308 { L"%f %d %o %u", true }, | 1373 { L"%f %d %o %u", true }, |
1309 { L"%-8d (%02.1f%)", true }, | 1374 { L"%-8d (%02.1f%)", true }, |
1310 { L"% 10s", false }, | 1375 { L"% 10s", false }, |
1311 { L"% 10ls", true } | 1376 { L"% 10ls", true } |
1312 }; | 1377 }; |
1313 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) { | 1378 for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) { |
1314 EXPECT_EQ(cases[i].portable, base::IsWprintfFormatPortable(cases[i].input)); | 1379 EXPECT_EQ(cases[i].portable, base::IsWprintfFormatPortable(cases[i].input)); |
1315 } | 1380 } |
1316 } | 1381 } |
1317 | 1382 |
OLD | NEW |