| OLD | NEW |
| (Empty) |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 #include "base/i18n/streaming_utf8_validator.h" | |
| 6 | |
| 7 #include <stdio.h> | |
| 8 #include <string.h> | |
| 9 | |
| 10 #include <string> | |
| 11 | |
| 12 #include "base/strings/string_piece.h" | |
| 13 #include "testing/gtest/include/gtest/gtest.h" | |
| 14 | |
| 15 // Define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST to verify that this class | |
| 16 // accepts exactly the same set of 4-byte strings as ICU-based validation. This | |
| 17 // tests every possible 4-byte string, so it is too slow to run routinely on | |
| 18 // low-powered machines. | |
| 19 // | |
| 20 // #define BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | |
| 21 | |
| 22 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | |
| 23 | |
| 24 #include "base/basictypes.h" | |
| 25 #include "base/bind.h" | |
| 26 #include "base/location.h" | |
| 27 #include "base/logging.h" | |
| 28 #include "base/memory/ref_counted.h" | |
| 29 #include "base/strings/string_util.h" | |
| 30 #include "base/strings/stringprintf.h" | |
| 31 #include "base/strings/utf_string_conversion_utils.h" | |
| 32 #include "base/synchronization/condition_variable.h" | |
| 33 #include "base/synchronization/lock.h" | |
| 34 #include "base/threading/sequenced_worker_pool.h" | |
| 35 #include "third_party/icu/source/common/unicode/utf8.h" | |
| 36 | |
| 37 #endif // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | |
| 38 | |
| 39 namespace base { | |
| 40 namespace { | |
| 41 | |
| 42 // File-local aliases for the StreamingUtf8Validator::State values, to avoid having to qualify the enum values in the tests. | |
| 43 const StreamingUtf8Validator::State VALID_ENDPOINT = | |
| 44 StreamingUtf8Validator::VALID_ENDPOINT; | |
| 45 const StreamingUtf8Validator::State VALID_MIDPOINT = | |
| 46 StreamingUtf8Validator::VALID_MIDPOINT; | |
| 47 const StreamingUtf8Validator::State INVALID = StreamingUtf8Validator::INVALID; | |
| 48 | |
| 49 #ifdef BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | |
| 50 | |
| 51 const uint32 kThoroughTestChunkSize = 1 << 24; /* Count of consecutive 32-bit values handed to each TestRange() task. */ | |
| 52 /* Fixture for the exhaustive 4-byte test. Worker tasks run TestRange() and report completion through |lock_|, |all_done_| and the two task counters. */ | |
| 53 class StreamingUtf8ValidatorThoroughTest : public ::testing::Test { | |
| 54 protected: | |
| 55 StreamingUtf8ValidatorThoroughTest() | |
| 56 : all_done_(&lock_), tasks_dispatched_(0), tasks_finished_(0) {} | |
| 57 | |
| 58 // Reference implementation: decodes the |src_len| bytes at |src| with U8_NEXT and returns false as soon as base::IsValidCodepoint() rejects a decoded value. This uses the same logic as base::IsStringUTF8 except it considers | |
| 59 // non-characters valid (and doesn't require a string as input). | |
| 60 static bool IsStringUtf8(const char* src, int32 src_len) { | |
| 61 int32 char_index = 0; | |
| 62 | |
| 63 while (char_index < src_len) { | |
| 64 int32 code_point; | |
| 65 U8_NEXT(src, char_index, src_len, code_point); | |
| 66 if (!base::IsValidCodepoint(code_point)) | |
| 67 return false; | |
| 68 } | |
| 69 return true; | |
| 70 } | |
| 71 | |
| 72 // Converts the passed-in integer to a 4 byte string (the in-memory bytes of |n|, copied with memcpy) and then | |
| 73 // verifies that IsStringUtf8 and StreamingUtf8Validator agree on | |
| 74 // whether it is valid UTF-8 or not. The validator counts as accepting the string only if AddBytes() returns VALID_ENDPOINT; on disagreement the four bytes are printed as \xNN escapes. | |
| 75 void TestNumber(uint32 n) const { | |
| 76 char test[sizeof n]; | |
| 77 memcpy(test, &n, sizeof n); | |
| 78 StreamingUtf8Validator validator; | |
| 79 EXPECT_EQ(IsStringUtf8(test, sizeof n), | |
| 80 validator.AddBytes(test, sizeof n) == VALID_ENDPOINT) | |
| 81 << "Difference of opinion for \"" | |
| 82 << base::StringPrintf("\\x%02X\\x%02X\\x%02X\\x%02X", | |
| 83 test[0] & 0xFF, | |
| 84 test[1] & 0xFF, | |
| 85 test[2] & 0xFF, | |
| 86 test[3] & 0xFF) << "\""; | |
| 87 } | |
| 88 | |
| 89 public: | |
| 90 // Tests the 4-byte sequences corresponding to the |size| integers | |
| 91 // starting at |begin|. This is intended to be run from a worker | |
| 92 // pool. After the loop it takes |lock_|, increments |tasks_finished_|, logs progress and signals |all_done_| at the end if it thinks all tasks are | |
| 93 // finished. | |
| 94 void TestRange(uint32 begin, uint32 size) { | |
| 95 for (uint32 i = 0; i < size; ++i) { | |
| 96 TestNumber(begin + i); | |
| 97 } | |
| 98 base::AutoLock al(lock_); | |
| 99 ++tasks_finished_; | |
| 100 LOG(INFO) << tasks_finished_ << " / " << tasks_dispatched_ | |
| 101 << " tasks done\n"; | |
| 102 if (tasks_finished_ >= tasks_dispatched_) { | |
| 103 all_done_.Signal(); | |
| 104 } | |
| 105 } | |
| 106 | |
| 107 protected: | |
| 108 base::Lock lock_; /* Held while |tasks_dispatched_| and |tasks_finished_| are read or written. */ | |
| 109 base::ConditionVariable all_done_; | |
| 110 int tasks_dispatched_; | |
| 111 int tasks_finished_; | |
| 112 }; | |
| 113 /* Posts one TestRange() task per slice of kThoroughTestChunkSize values, stopping when |begin| wraps around to 0, then waits until every task has reported in. |lock_| is held throughout the dispatch loop, so workers cannot update |tasks_finished_| before Wait() is reached. NOTE(review): relies on base::ConditionVariable::Wait() releasing |lock_| while blocked - confirm. */ | |
| 114 TEST_F(StreamingUtf8ValidatorThoroughTest, TestEverything) { | |
| 115 scoped_refptr<base::SequencedWorkerPool> pool = | |
| 116 new base::SequencedWorkerPool(32, "TestEverything"); | |
| 117 base::AutoLock al(lock_); | |
| 118 uint32 begin = 0; | |
| 119 do { | |
| 120 pool->PostWorkerTask( | |
| 121 FROM_HERE, | |
| 122 base::Bind(&StreamingUtf8ValidatorThoroughTest::TestRange, | |
| 123 base::Unretained(this), | |
| 124 begin, | |
| 125 kThoroughTestChunkSize)); | |
| 126 ++tasks_dispatched_; | |
| 127 begin += kThoroughTestChunkSize; | |
| 128 } while (begin != 0); | |
| 129 while (tasks_finished_ < tasks_dispatched_) | |
| 130 all_done_.Wait(); | |
| 131 } | |
| 132 | |
| 133 #endif // BASE_I18N_UTF8_VALIDATOR_THOROUGH_TEST | |
| 134 | |
| 135 // These valid and invalid UTF-8 sequences are based on the tests from | |
| 136 // base/strings/string_util_unittest.cc | |
| 137 | |
| 138 // All of the strings in |valid| must represent a single codepoint, because | |
| 139 // partial sequences are constructed by taking non-empty prefixes of these | |
| 140 // strings. PartialIterator measures them with strlen(), so none of them may contain an embedded NUL. | |
| 141 const char* const valid[] = {"\r", "\n", "a", | |
| 142 "\xc2\x81", "\xe1\x80\xbf", "\xf1\x80\xa0\xbf", | |
| 143 "\xef\xbb\xbf", // UTF-8 BOM | |
| 144 }; | |
| 145 /* One past the last element of |valid|; passed as the end iterator of the range. */ | |
| 146 const char* const* const valid_end = valid + arraysize(valid); | |
| 147 // Byte sequences that the validator must reject. Keep a comma after every literal: adjacent string literals are concatenated by the compiler, which would silently merge two test cases into one. | |
| 148 const char* const invalid[] = { | |
| 149 // always invalid bytes | |
| 150 "\xc0", "\xc1", | |
| 151 "\xf5", "\xf6", "\xf7", | |
| 152 "\xf8", "\xf9", "\xfa", "\xfb", "\xfc", "\xfd", "\xfe", "\xff", | |
| 153 // surrogate code points (U+D800, U+D80F, U+DFFF) | |
| 154 "\xed\xa0\x80", "\xed\xa0\x8f", "\xed\xbf\xbf", | |
| 155 // | |
| 156 // overlong sequences | |
| 157 "\xc0\x80", // U+0000 | |
| 158 "\xc1\x80", // U+0040 "@" | |
| 159 "\xc1\x81", // U+0041 "A" | |
| 160 "\xe0\x80\x80", // U+0000 | |
| 161 "\xe0\x82\x80", // U+0080 | |
| 162 "\xe0\x9f\xbf", // U+07ff | |
| 163 "\xf0\x80\x80\x8D", // U+000D | |
| 164 "\xf0\x80\x82\x91", // U+0091 | |
| 165 "\xf0\x80\xa0\x80", // U+0800 | |
| 166 "\xf0\x8f\xbb\xbf", // U+FEFF (BOM) | |
| 167 "\xf8\x80\x80\x80\xbf", // U+003F | |
| 168 "\xfc\x80\x80\x80\xa0\xa5", | |
| 169 // | |
| 170 // Beyond U+10FFFF | |
| 171 "\xf4\x90\x80\x80", // U+110000 | |
| 172 "\xf8\xa0\xbf\x80\xbf", // 5 bytes | |
| 173 "\xfc\x9c\xbf\x80\xbf\x80", // 6 bytes | |
| 174 // | |
| 175 // BOMs in UTF-16(BE|LE) | |
| 176 "\xfe\xff", "\xff\xfe", | |
| 177 }; | |
| 178 | |
| 179 const char* const* const invalid_end = invalid + arraysize(invalid); | |
| 180 | |
| 181 // A ForwardIterator which returns all the non-empty proper prefixes (that is, the truncated, incomplete forms) of the elements of | |
| 182 // "valid". Single-byte elements have no such prefix and are skipped. | |
| 183 class PartialIterator { | |
| 184 public: | |
| 185 // The constructor returns the first iterator, ie. it is equivalent to | |
| 186 // begin(). It starts before the first prefix and calls Advance() once to reach it. | |
| 187 PartialIterator() : index_(0), prefix_length_(0) { Advance(); } | |
| 188 // The trivial destructor left intentionally undefined. | |
| 189 // This is a value type; the default copy constructor and assignment operator | |
| 190 // generated by the compiler are used. | |
| 191 /* The past-the-end position: |index_| equal to the array size with |prefix_length_| of 1, which is the state Advance() leaves behind after the last prefix. */ | |
| 192 static PartialIterator end() { return PartialIterator(arraysize(valid), 1); } | |
| 193 | |
| 194 PartialIterator& operator++() { | |
| 195 Advance(); | |
| 196 return *this; | |
| 197 } | |
| 198 | |
| 199 base::StringPiece operator*() const { | |
| 200 return base::StringPiece(valid[index_], prefix_length_); | |
| 201 } | |
| 202 | |
| 203 bool operator==(const PartialIterator& rhs) const { | |
| 204 return index_ == rhs.index_ && prefix_length_ == rhs.prefix_length_; | |
| 205 } | |
| 206 | |
| 207 bool operator!=(const PartialIterator& rhs) const { return !(rhs == *this); } | |
| 208 | |
| 209 private: | |
| 210 // This constructor is used by the end() method. | |
| 211 PartialIterator(size_t index, size_t prefix_length) | |
| 212 : index_(index), prefix_length_(prefix_length) {} | |
| 213 /* Steps to the next prefix: lengthens the current prefix by one byte, then, for as long as the prefix would cover the whole of valid[index_], moves on to the next element and restarts at a length of 1. */ | |
| 214 void Advance() { | |
| 215 if (index_ < arraysize(valid) && prefix_length_ < strlen(valid[index_])) | |
| 216 ++prefix_length_; | |
| 217 while (index_ < arraysize(valid) && | |
| 218 prefix_length_ == strlen(valid[index_])) { | |
| 219 ++index_; | |
| 220 prefix_length_ = 1; | |
| 221 } | |
| 222 } | |
| 223 | |
| 224 // The UTF-8 sequence, as an offset into the |valid| array. | |
| 225 size_t index_; | |
| 226 size_t prefix_length_; /* Number of leading bytes of valid[index_] returned when the iterator is dereferenced. */ | |
| 227 }; | |
| 228 | |
| 229 // A test fixture for tests which test one UTF-8 sequence (or invalid | |
| 230 // byte sequence) at a time. Each sequence is checked with its own, freshly constructed validator. | |
| 231 class StreamingUtf8ValidatorSingleSequenceTest : public ::testing::Test { | |
| 232 protected: | |
| 233 // Passes every element of [begin, end) to AddBytes() in a single call and expects the returned state to equal |expected|. Iterator must be convertible when de-referenced to StringPiece. | |
| 234 template <typename Iterator> | |
| 235 void CheckRange(Iterator begin, | |
| 236 Iterator end, | |
| 237 StreamingUtf8Validator::State expected) { | |
| 238 for (Iterator it = begin; it != end; ++it) { | |
| 239 StreamingUtf8Validator validator; | |
| 240 base::StringPiece sequence = *it; | |
| 241 EXPECT_EQ(expected, | |
| 242 validator.AddBytes(sequence.data(), sequence.size())) | |
| 243 << "Failed for \"" << sequence << "\""; | |
| 244 } | |
| 245 } | |
| 246 | |
| 247 // Adding input a byte at a time should make absolutely no difference. Same check as CheckRange(), except that each element is fed one byte per AddBytes() call and only the state returned for the last byte is compared with |expected|. | |
| 248 template <typename Iterator> | |
| 249 void CheckRangeByteAtATime(Iterator begin, | |
| 250 Iterator end, | |
| 251 StreamingUtf8Validator::State expected) { | |
| 252 for (Iterator it = begin; it != end; ++it) { | |
| 253 StreamingUtf8Validator validator; | |
| 254 base::StringPiece sequence = *it; | |
| 255 StreamingUtf8Validator::State state = VALID_ENDPOINT; | |
| 256 for (base::StringPiece::const_iterator cit = sequence.begin(); | |
| 257 cit != sequence.end(); | |
| 258 ++cit) { | |
| 259 state = validator.AddBytes(&*cit, 1); | |
| 260 } | |
| 261 EXPECT_EQ(expected, state) << "Failed for \"" << sequence << "\""; | |
| 262 } | |
| 263 } | |
| 264 }; | |
| 265 | |
| 266 // A test fixture for tests which test the concatenation of byte sequences. | |
| 267 class StreamingUtf8ValidatorDoubleSequenceTest : public ::testing::Test { | |
| 268 protected: | |
| 269 // Check every possible concatenation of byte sequences from two | |
| 270 // ranges, and verify that the combination matches the expected | |
| 271 // state. The two halves are added with separate AddBytes() calls and only the state returned by the second call is checked; one validator is reused and Reset() after each pair. | |
| 272 template <typename Iterator1, typename Iterator2> | |
| 273 void CheckCombinations(Iterator1 begin1, | |
| 274 Iterator1 end1, | |
| 275 Iterator2 begin2, | |
| 276 Iterator2 end2, | |
| 277 StreamingUtf8Validator::State expected) { | |
| 278 StreamingUtf8Validator validator; | |
| 279 for (Iterator1 it1 = begin1; it1 != end1; ++it1) { | |
| 280 base::StringPiece c1 = *it1; | |
| 281 for (Iterator2 it2 = begin2; it2 != end2; ++it2) { | |
| 282 base::StringPiece c2 = *it2; | |
| 283 validator.AddBytes(c1.data(), c1.size()); | |
| 284 EXPECT_EQ(expected, validator.AddBytes(c2.data(), c2.size())) | |
| 285 << "Failed for \"" << c1 << c2 << "\""; | |
| 286 validator.Reset(); | |
| 287 } | |
| 288 } | |
| 289 } | |
| 290 }; | |
| 291 /* Adding zero bytes to a freshly constructed validator is expected to report VALID_ENDPOINT. */ | |
| 292 TEST(StreamingUtf8ValidatorTest, NothingIsValid) { | |
| 293 static const char kNothing[] = ""; | |
| 294 EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNothing, 0)); | |
| 295 } | |
| 296 | |
| 297 // Because the members of the |valid| array need to be non-zero length | |
| 298 // sequences and are measured with strlen(), |valid| cannot be used | |
| 299 // to test the NUL character '\0', so the NUL character gets its own | |
| 300 // test. The length is passed explicitly as 1 so that the NUL byte itself is validated. | |
| 301 TEST(StreamingUtf8ValidatorTest, NulIsValid) { | |
| 302 static const char kNul[] = "\x00"; | |
| 303 EXPECT_EQ(VALID_ENDPOINT, StreamingUtf8Validator().AddBytes(kNul, 1)); | |
| 304 } | |
| 305 | |
| 306 // Just a basic sanity test before we start getting fancy: a plain ASCII string added in one call must end at VALID_ENDPOINT. | |
| 307 TEST(StreamingUtf8ValidatorTest, HelloWorld) { | |
| 308 static const char kHelloWorld[] = "Hello, World!"; | |
| 309 EXPECT_EQ( | |
| 310 VALID_ENDPOINT, | |
| 311 StreamingUtf8Validator().AddBytes(kHelloWorld, strlen(kHelloWorld))); | |
| 312 } | |
| 313 | |
| 314 // Check that the Reset() method works: after an invalid byte the validator is expected to keep reporting INVALID, even for valid input, until Reset() is called. | |
| 315 TEST(StreamingUtf8ValidatorTest, ResetWorks) { | |
| 316 StreamingUtf8Validator validator; | |
| 317 EXPECT_EQ(INVALID, validator.AddBytes("\xC0", 1)); | |
| 318 EXPECT_EQ(INVALID, validator.AddBytes("a", 1)); | |
| 319 validator.Reset(); | |
| 320 EXPECT_EQ(VALID_ENDPOINT, validator.AddBytes("a", 1)); | |
| 321 } | |
| 322 /* Whole-buffer checks: complete sequences must end at VALID_ENDPOINT, truncated ones at VALID_MIDPOINT, and the |invalid| entries at INVALID. */ | |
| 323 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Valid) { | |
| 324 CheckRange(valid, valid_end, VALID_ENDPOINT); | |
| 325 } | |
| 326 | |
| 327 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Partial) { | |
| 328 CheckRange(PartialIterator(), PartialIterator::end(), VALID_MIDPOINT); | |
| 329 } | |
| 330 | |
| 331 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, Invalid) { | |
| 332 CheckRange(invalid, invalid_end, INVALID); | |
| 333 } | |
| 334 /* The same three expectations as the whole-buffer tests, with the input delivered one byte per AddBytes() call. */ | |
| 335 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, ValidByByte) { | |
| 336 CheckRangeByteAtATime(valid, valid_end, VALID_ENDPOINT); | |
| 337 } | |
| 338 | |
| 339 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, PartialByByte) { | |
| 340 CheckRangeByteAtATime( | |
| 341 PartialIterator(), PartialIterator::end(), VALID_MIDPOINT); | |
| 342 } | |
| 343 | |
| 344 TEST_F(StreamingUtf8ValidatorSingleSequenceTest, InvalidByByte) { | |
| 345 CheckRangeByteAtATime(invalid, invalid_end, INVALID); | |
| 346 } | |
| 347 /* Concatenations without invalid input: after a complete sequence the result is expected to follow the second part, whereas anything appended to a truncated sequence is expected to be INVALID. */ | |
| 348 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusValidIsValid) { | |
| 349 CheckCombinations(valid, valid_end, valid, valid_end, VALID_ENDPOINT); | |
| 350 } | |
| 351 | |
| 352 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusPartialIsPartial) { | |
| 353 CheckCombinations(valid, | |
| 354 valid_end, | |
| 355 PartialIterator(), | |
| 356 PartialIterator::end(), | |
| 357 VALID_MIDPOINT); | |
| 358 } | |
| 359 | |
| 360 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusValidIsInvalid) { | |
| 361 CheckCombinations( | |
| 362 PartialIterator(), PartialIterator::end(), valid, valid_end, INVALID); | |
| 363 } | |
| 364 | |
| 365 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusPartialIsInvalid) { | |
| 366 CheckCombinations(PartialIterator(), | |
| 367 PartialIterator::end(), | |
| 368 PartialIterator(), | |
| 369 PartialIterator::end(), | |
| 370 INVALID); | |
| 371 } | |
| 372 /* Any concatenation that includes an element of |invalid| is expected to be INVALID, whichever side the invalid part is on. */ | |
| 373 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, ValidPlusInvalidIsInvalid) { | |
| 374 CheckCombinations(valid, valid_end, invalid, invalid_end, INVALID); | |
| 375 } | |
| 376 | |
| 377 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusValidIsInvalid) { | |
| 378 CheckCombinations(invalid, invalid_end, valid, valid_end, INVALID); | |
| 379 } | |
| 380 | |
| 381 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusInvalidIsInvalid) { | |
| 382 CheckCombinations(invalid, invalid_end, invalid, invalid_end, INVALID); | |
| 383 } | |
| 384 | |
| 385 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, InvalidPlusPartialIsInvalid) { | |
| 386 CheckCombinations( | |
| 387 invalid, invalid_end, PartialIterator(), PartialIterator::end(), INVALID); | |
| 388 } | |
| 389 | |
| 390 TEST_F(StreamingUtf8ValidatorDoubleSequenceTest, PartialPlusInvalidIsInvalid) { | |
| 391 CheckCombinations( | |
| 392 PartialIterator(), PartialIterator::end(), invalid, invalid_end, INVALID); | |
| 393 } | |
| 394 /* StreamingUtf8Validator::Validate() is given a complete string: these cases expect true for empty and well-formed input, and false for an overlong or a truncated sequence. */ | |
| 395 TEST(StreamingUtf8ValidatorValidateTest, EmptyIsValid) { | |
| 396 EXPECT_TRUE(StreamingUtf8Validator::Validate(std::string())); | |
| 397 } | |
| 398 | |
| 399 TEST(StreamingUtf8ValidatorValidateTest, SimpleValidCase) { | |
| 400 EXPECT_TRUE(StreamingUtf8Validator::Validate("\xc2\x81")); | |
| 401 } | |
| 402 | |
| 403 TEST(StreamingUtf8ValidatorValidateTest, SimpleInvalidCase) { | |
| 404 EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc0\x80")); | |
| 405 } | |
| 406 | |
| 407 TEST(StreamingUtf8ValidatorValidateTest, TruncatedIsInvalid) { | |
| 408 EXPECT_FALSE(StreamingUtf8Validator::Validate("\xc2")); | |
| 409 } | |
| 410 | |
| 411 } // namespace | |
| 412 } // namespace base | |
| OLD | NEW |