| OLD | NEW |
| (Empty) |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 // Create a state machine for validating UTF-8. The algorithm in brief: | |
| 6 // 1. Convert the complete unicode range of code points, except for the | |
| 7 // surrogate code points, to an ordered array of sequences of bytes in | |
| 8 // UTF-8. | |
| 9 // 2. Convert individual bytes to ranges, starting from the right of each byte | |
| 10 // sequence. For each range, ensure the bytes on the left and the ranges | |
| 11 // on the right are the identical. | |
| 12 // 3. Convert the resulting list of ranges into a state machine, collapsing | |
| 13 // identical states. | |
| 14 // 4. Convert the state machine to an array of bytes. | |
| 15 // 5. Output as a C++ file. | |
| 16 // | |
| 17 // To use: | |
| 18 // $ ninja -C out/Release build_utf8_validator_tables | |
| 19 // $ out/Release/build_utf8_validator_tables | |
| 20 // --output=base/i18n/utf8_validator_tables.cc | |
| 21 // $ git add base/i18n/utf8_validator_tables.cc | |
| 22 // | |
| 23 // Because the table is not expected to ever change, it is checked into the | |
| 24 // repository rather than being regenerated at build time. | |
| 25 // | |
| 26 // This code uses type uint8 throughout to represent bytes, to avoid | |
| 27 // signed/unsigned char confusion. | |
| 28 | |
| 29 #include <stdio.h> | |
| 30 #include <stdlib.h> | |
| 31 #include <string.h> | |
| 32 | |
| 33 #include <algorithm> | |
| 34 #include <map> | |
| 35 #include <string> | |
| 36 #include <vector> | |
| 37 | |
| 38 #include "base/basictypes.h" | |
| 39 #include "base/command_line.h" | |
| 40 #include "base/files/file_path.h" | |
| 41 #include "base/files/file_util.h" | |
| 42 #include "base/logging.h" | |
| 43 #include "base/numerics/safe_conversions.h" | |
| 44 #include "base/strings/stringprintf.h" | |
| 45 #include "third_party/icu/source/common/unicode/utf8.h" | |
| 46 | |
| 47 namespace { | |
| 48 | |
| 49 const char kHelpText[] = | |
| 50 "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n"; | |
| 51 | |
| 52 const char kProlog[] = | |
| 53 "// Copyright 2013 The Chromium Authors. All rights reserved.\n" | |
| 54 "// Use of this source code is governed by a BSD-style license that can " | |
| 55 "be\n" | |
| 56 "// found in the LICENSE file.\n" | |
| 57 "\n" | |
| 58 "// This file is auto-generated by build_utf8_validator_tables.\n" | |
| 59 "// DO NOT EDIT.\n" | |
| 60 "\n" | |
| 61 "#include \"base/i18n/utf8_validator_tables.h\"\n" | |
| 62 "\n" | |
| 63 "namespace base {\n" | |
| 64 "namespace internal {\n" | |
| 65 "\n" | |
| 66 "const uint8 kUtf8ValidatorTables[] = {\n"; | |
| 67 | |
| 68 const char kEpilog[] = | |
| 69 "};\n" | |
| 70 "\n" | |
| 71 "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n" | |
| 72 "\n" | |
| 73 "} // namespace internal\n" | |
| 74 "} // namespace base\n"; | |
| 75 | |
| 76 // Ranges are inclusive at both ends--they represent [from, to] | |
| 77 class Range { | |
| 78 public: | |
| 79 // Ranges always start with just one byte. | |
| 80 explicit Range(uint8 value) : from_(value), to_(value) {} | |
| 81 | |
| 82 // Range objects are copyable and assignable to be used in STL | |
| 83 // containers. Since they only contain non-pointer POD types, the default copy | |
| 84 // constructor, assignment operator and destructor will work. | |
| 85 | |
| 86 // Add a byte to the range. We intentionally only support adding a byte at the | |
| 87 // end, since that is the only operation the code needs. | |
| 88 void AddByte(uint8 to) { | |
| 89 CHECK(to == to_ + 1); | |
| 90 to_ = to; | |
| 91 } | |
| 92 | |
| 93 uint8 from() const { return from_; } | |
| 94 uint8 to() const { return to_; } | |
| 95 | |
| 96 bool operator<(const Range& rhs) const { | |
| 97 return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to())); | |
| 98 } | |
| 99 | |
| 100 bool operator==(const Range& rhs) const { | |
| 101 return from() == rhs.from() && to() == rhs.to(); | |
| 102 } | |
| 103 | |
| 104 private: | |
| 105 uint8 from_; | |
| 106 uint8 to_; | |
| 107 }; | |
| 108 | |
| 109 // A vector of Ranges is like a simple regular expression--it corresponds to | |
| 110 // a set of strings of the same length that have bytes in each position in | |
| 111 // the appropriate range. | |
| 112 typedef std::vector<Range> StringSet; | |
| 113 | |
| 114 // A UTF-8 "character" is represented by a sequence of bytes. | |
| 115 typedef std::vector<uint8> Character; | |
| 116 | |
| 117 // In the second stage of the algorithm, we want to convert a large list of | |
| 118 // Characters into a small list of StringSets. | |
| 119 struct Pair { | |
| 120 Character character; | |
| 121 StringSet set; | |
| 122 }; | |
| 123 | |
| 124 typedef std::vector<Pair> PairVector; | |
| 125 | |
| 126 // A class to print a table of numbers in the same style as clang-format. | |
| 127 class TablePrinter { | |
| 128 public: | |
| 129 explicit TablePrinter(FILE* stream) | |
| 130 : stream_(stream), values_on_this_line_(0), current_offset_(0) {} | |
| 131 | |
| 132 void PrintValue(uint8 value) { | |
| 133 if (values_on_this_line_ == 0) { | |
| 134 fputs(" ", stream_); | |
| 135 } else if (values_on_this_line_ == kMaxValuesPerLine) { | |
| 136 fprintf(stream_, " // 0x%02x\n ", current_offset_); | |
| 137 values_on_this_line_ = 0; | |
| 138 } | |
| 139 fprintf(stream_, " 0x%02x,", static_cast<int>(value)); | |
| 140 ++values_on_this_line_; | |
| 141 ++current_offset_; | |
| 142 } | |
| 143 | |
| 144 void NewLine() { | |
| 145 while (values_on_this_line_ < kMaxValuesPerLine) { | |
| 146 fputs(" ", stream_); | |
| 147 ++values_on_this_line_; | |
| 148 } | |
| 149 fprintf(stream_, " // 0x%02x\n", current_offset_); | |
| 150 values_on_this_line_ = 0; | |
| 151 } | |
| 152 | |
| 153 private: | |
| 154 // stdio stream. Not owned. | |
| 155 FILE* stream_; | |
| 156 | |
| 157 // Number of values so far printed on this line. | |
| 158 int values_on_this_line_; | |
| 159 | |
| 160 // Total values printed so far. | |
| 161 int current_offset_; | |
| 162 | |
| 163 static const int kMaxValuesPerLine = 8; | |
| 164 | |
| 165 DISALLOW_COPY_AND_ASSIGN(TablePrinter); | |
| 166 }; | |
| 167 | |
| 168 // Start by filling a PairVector with characters. The resulting vector goes from | |
| 169 // "\x00" to "\xf4\x8f\xbf\xbf". | |
| 170 PairVector InitializeCharacters() { | |
| 171 PairVector vector; | |
| 172 for (int i = 0; i <= 0x10FFFF; ++i) { | |
| 173 if (i >= 0xD800 && i < 0xE000) { | |
| 174 // Surrogate codepoints are not permitted. Non-character code points are | |
| 175 // explicitly permitted. | |
| 176 continue; | |
| 177 } | |
| 178 uint8 bytes[4]; | |
| 179 unsigned int offset = 0; | |
| 180 UBool is_error = false; | |
| 181 U8_APPEND(bytes, offset, arraysize(bytes), i, is_error); | |
| 182 DCHECK(!is_error); | |
| 183 DCHECK_GT(offset, 0u); | |
| 184 DCHECK_LE(offset, arraysize(bytes)); | |
| 185 Pair pair = {Character(bytes, bytes + offset), StringSet()}; | |
| 186 vector.push_back(pair); | |
| 187 } | |
| 188 return vector; | |
| 189 } | |
| 190 | |
| 191 // Construct a new Pair from |character| and the concatenation of |new_range| | |
| 192 // and |existing_set|, and append it to |pairs|. | |
| 193 void ConstructPairAndAppend(const Character& character, | |
| 194 const Range& new_range, | |
| 195 const StringSet& existing_set, | |
| 196 PairVector* pairs) { | |
| 197 Pair new_pair = {character, StringSet(1, new_range)}; | |
| 198 new_pair.set.insert( | |
| 199 new_pair.set.end(), existing_set.begin(), existing_set.end()); | |
| 200 pairs->push_back(new_pair); | |
| 201 } | |
| 202 | |
| 203 // Each pass over the PairVector strips one byte off the right-hand-side of the | |
| 204 // characters and adds a range to the set on the right. For example, the first | |
| 205 // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", | |
| 206 // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", | |
| 207 // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", | |
| 208 // [\xa0-\xbf][\x80-\xbf]). | |
| 209 void MoveRightMostCharToSet(PairVector* pairs) { | |
| 210 PairVector new_pairs; | |
| 211 PairVector::const_iterator it = pairs->begin(); | |
| 212 while (it != pairs->end() && it->character.empty()) { | |
| 213 new_pairs.push_back(*it); | |
| 214 ++it; | |
| 215 } | |
| 216 CHECK(it != pairs->end()); | |
| 217 Character unconverted_bytes(it->character.begin(), it->character.end() - 1); | |
| 218 Range new_range(it->character.back()); | |
| 219 StringSet converted = it->set; | |
| 220 ++it; | |
| 221 while (it != pairs->end()) { | |
| 222 const Pair& current_pair = *it++; | |
| 223 if (current_pair.character.size() == unconverted_bytes.size() + 1 && | |
| 224 std::equal(unconverted_bytes.begin(), | |
| 225 unconverted_bytes.end(), | |
| 226 current_pair.character.begin()) && | |
| 227 converted == current_pair.set) { | |
| 228 // The particular set of UTF-8 codepoints we are validating guarantees | |
| 229 // that each byte range will be contiguous. This would not necessarily be | |
| 230 // true for an arbitrary set of UTF-8 codepoints. | |
| 231 DCHECK_EQ(new_range.to() + 1, current_pair.character.back()); | |
| 232 new_range.AddByte(current_pair.character.back()); | |
| 233 continue; | |
| 234 } | |
| 235 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); | |
| 236 unconverted_bytes = Character(current_pair.character.begin(), | |
| 237 current_pair.character.end() - 1); | |
| 238 new_range = Range(current_pair.character.back()); | |
| 239 converted = current_pair.set; | |
| 240 } | |
| 241 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); | |
| 242 new_pairs.swap(*pairs); | |
| 243 } | |
| 244 | |
| 245 void MoveAllCharsToSets(PairVector* pairs) { | |
| 246 // Since each pass of the function moves one character, and UTF-8 sequences | |
| 247 // are at most 4 characters long, this simply runs the algorithm four times. | |
| 248 for (int i = 0; i < 4; ++i) { | |
| 249 MoveRightMostCharToSet(pairs); | |
| 250 } | |
| 251 #if DCHECK_IS_ON() | |
| 252 for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); | |
| 253 ++it) { | |
| 254 DCHECK(it->character.empty()); | |
| 255 } | |
| 256 #endif | |
| 257 } | |
| 258 | |
| 259 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], | |
| 260 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the | |
| 261 // algorithm is working. Use the command-line option | |
| 262 // --vmodule=build_utf8_validator_tables=1 to see this output. | |
| 263 void LogStringSets(const PairVector& pairs) { | |
| 264 for (PairVector::const_iterator pair_it = pairs.begin(); | |
| 265 pair_it != pairs.end(); | |
| 266 ++pair_it) { | |
| 267 std::string set_as_string; | |
| 268 for (StringSet::const_iterator set_it = pair_it->set.begin(); | |
| 269 set_it != pair_it->set.end(); | |
| 270 ++set_it) { | |
| 271 set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]", | |
| 272 static_cast<int>(set_it->from()), | |
| 273 static_cast<int>(set_it->to())); | |
| 274 } | |
| 275 VLOG(1) << set_as_string; | |
| 276 } | |
| 277 } | |
| 278 | |
| 279 // A single state in the state machine is represented by a sorted vector of | |
| 280 // start bytes and target states. All input bytes in the range between the start | |
| 281 // byte and the next entry in the vector (or 0xFF) result in a transition to the | |
| 282 // target state. | |
| 283 struct StateRange { | |
| 284 uint8 from; | |
| 285 uint8 target_state; | |
| 286 }; | |
| 287 | |
| 288 typedef std::vector<StateRange> State; | |
| 289 | |
| 290 // Generates a state where all bytes go to state 1 (invalid). This is also used | |
| 291 // as an initialiser for other states (since bytes from outside the desired | |
| 292 // range are invalid). | |
| 293 State GenerateInvalidState() { | |
| 294 const StateRange range = {0, 1}; | |
| 295 return State(1, range); | |
| 296 } | |
| 297 | |
| 298 // A map from a state (ie. a set of strings which will match from this state) to | |
| 299 // a number (which is an index into the array of states). | |
| 300 typedef std::map<StringSet, uint8> StateMap; | |
| 301 | |
| 302 // Create a new state corresponding to |set|, add it |states| and |state_map| | |
| 303 // and return the index it was given in |states|. | |
| 304 uint8 MakeState(const StringSet& set, | |
| 305 std::vector<State>* states, | |
| 306 StateMap* state_map) { | |
| 307 DCHECK(!set.empty()); | |
| 308 const Range& range = set.front(); | |
| 309 const StringSet rest(set.begin() + 1, set.end()); | |
| 310 const StateMap::const_iterator where = state_map->find(rest); | |
| 311 const uint8 target_state = where == state_map->end() | |
| 312 ? MakeState(rest, states, state_map) | |
| 313 : where->second; | |
| 314 DCHECK_LT(0, range.from()); | |
| 315 DCHECK_LT(range.to(), 0xFF); | |
| 316 const StateRange new_state_initializer[] = { | |
| 317 {0, 1}, {range.from(), target_state}, | |
| 318 {static_cast<uint8>(range.to() + 1), 1}}; | |
| 319 states->push_back( | |
| 320 State(new_state_initializer, | |
| 321 new_state_initializer + arraysize(new_state_initializer))); | |
| 322 const uint8 new_state_number = | |
| 323 base::checked_cast<uint8>(states->size() - 1); | |
| 324 CHECK(state_map->insert(std::make_pair(set, new_state_number)).second); | |
| 325 return new_state_number; | |
| 326 } | |
| 327 | |
| 328 std::vector<State> GenerateStates(const PairVector& pairs) { | |
| 329 // States 0 and 1 are the initial/valid state and invalid state, respectively. | |
| 330 std::vector<State> states(2, GenerateInvalidState()); | |
| 331 StateMap state_map; | |
| 332 state_map.insert(std::make_pair(StringSet(), 0)); | |
| 333 for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) { | |
| 334 DCHECK(it->character.empty()); | |
| 335 DCHECK(!it->set.empty()); | |
| 336 const Range& range = it->set.front(); | |
| 337 const StringSet rest(it->set.begin() + 1, it->set.end()); | |
| 338 const StateMap::const_iterator where = state_map.find(rest); | |
| 339 const uint8 target_state = where == state_map.end() | |
| 340 ? MakeState(rest, &states, &state_map) | |
| 341 : where->second; | |
| 342 if (states[0].back().from == range.from()) { | |
| 343 DCHECK_EQ(1, states[0].back().target_state); | |
| 344 states[0].back().target_state = target_state; | |
| 345 DCHECK_LT(range.to(), 0xFF); | |
| 346 const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1}; | |
| 347 states[0].push_back(new_range); | |
| 348 } else { | |
| 349 DCHECK_LT(range.to(), 0xFF); | |
| 350 const StateRange new_range_initializer[] = {{range.from(), target_state}, | |
| 351 {static_cast<uint8>(range.to() + 1), 1}}; | |
| 352 states[0] | |
| 353 .insert(states[0].end(), | |
| 354 new_range_initializer, | |
| 355 new_range_initializer + arraysize(new_range_initializer)); | |
| 356 } | |
| 357 } | |
| 358 return states; | |
| 359 } | |
| 360 | |
| 361 // Output the generated states as a C++ table. Two tricks are used to compact | |
| 362 // the table: each state in the table starts with a shift value which indicates | |
| 363 // how many bits we can discard from the right-hand-side of the byte before | |
| 364 // doing the table lookup. Secondly, only the state-transitions for bytes | |
| 365 // with the top-bit set are included in the table; bytes without the top-bit set | |
| 366 // are just ASCII and are handled directly by the code. | |
| 367 void PrintStates(const std::vector<State>& states, FILE* stream) { | |
| 368 // First calculate the start-offset of each state. This allows the state | |
| 369 // machine to jump directly to the correct offset, avoiding an extra | |
| 370 // indirection. State 0 starts at offset 0. | |
| 371 std::vector<uint8> state_offset(1, 0); | |
| 372 std::vector<uint8> shifts; | |
| 373 uint8 pos = 0; | |
| 374 | |
| 375 for (std::vector<State>::const_iterator state_it = states.begin(); | |
| 376 state_it != states.end(); | |
| 377 ++state_it) { | |
| 378 // We want to set |shift| to the (0-based) index of the least-significant | |
| 379 // set bit in any of the ranges for this state, since this tells us how many | |
| 380 // bits we can discard and still determine what range a byte lies in. Sadly | |
| 381 // it appears that ffs() is not portable, so we do it clumsily. | |
| 382 uint8 shift = 7; | |
| 383 for (State::const_iterator range_it = state_it->begin(); | |
| 384 range_it != state_it->end(); | |
| 385 ++range_it) { | |
| 386 while (shift > 0 && range_it->from % (1 << shift) != 0) { | |
| 387 --shift; | |
| 388 } | |
| 389 } | |
| 390 shifts.push_back(shift); | |
| 391 pos += 1 + (1 << (7 - shift)); | |
| 392 state_offset.push_back(pos); | |
| 393 } | |
| 394 | |
| 395 DCHECK_EQ(129, state_offset[1]); | |
| 396 | |
| 397 fputs(kProlog, stream); | |
| 398 TablePrinter table_printer(stream); | |
| 399 | |
| 400 for (uint8 state_index = 0; state_index < states.size(); ++state_index) { | |
| 401 const uint8 shift = shifts[state_index]; | |
| 402 uint8 next_range = 0; | |
| 403 uint8 target_state = 1; | |
| 404 fprintf(stream, | |
| 405 " // State %d, offset 0x%02x\n", | |
| 406 static_cast<int>(state_index), | |
| 407 static_cast<int>(state_offset[state_index])); | |
| 408 table_printer.PrintValue(shift); | |
| 409 for (int i = 0; i < 0x100; i += (1 << shift)) { | |
| 410 if (next_range < states[state_index].size() && | |
| 411 states[state_index][next_range].from == i) { | |
| 412 target_state = states[state_index][next_range].target_state; | |
| 413 ++next_range; | |
| 414 } | |
| 415 if (i >= 0x80) { | |
| 416 table_printer.PrintValue(state_offset[target_state]); | |
| 417 } | |
| 418 } | |
| 419 table_printer.NewLine(); | |
| 420 } | |
| 421 | |
| 422 fputs(kEpilog, stream); | |
| 423 } | |
| 424 | |
| 425 } // namespace | |
| 426 | |
| 427 int main(int argc, char* argv[]) { | |
| 428 base::CommandLine::Init(argc, argv); | |
| 429 logging::LoggingSettings settings; | |
| 430 settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; | |
| 431 logging::InitLogging(settings); | |
| 432 if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) { | |
| 433 fwrite(kHelpText, 1, arraysize(kHelpText), stdout); | |
| 434 exit(EXIT_SUCCESS); | |
| 435 } | |
| 436 base::FilePath filename = | |
| 437 base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output"); | |
| 438 | |
| 439 FILE* output = stdout; | |
| 440 if (!filename.empty()) { | |
| 441 output = base::OpenFile(filename, "wb"); | |
| 442 if (!output) | |
| 443 PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe() | |
| 444 << "' for writing"; | |
| 445 } | |
| 446 | |
| 447 // Step 1: Enumerate the characters | |
| 448 PairVector pairs = InitializeCharacters(); | |
| 449 // Step 2: Convert to sets. | |
| 450 MoveAllCharsToSets(&pairs); | |
| 451 if (VLOG_IS_ON(1)) { | |
| 452 LogStringSets(pairs); | |
| 453 } | |
| 454 // Step 3: Generate states. | |
| 455 std::vector<State> states = GenerateStates(pairs); | |
| 456 // Step 4/5: Print output | |
| 457 PrintStates(states, output); | |
| 458 | |
| 459 if (!filename.empty()) { | |
| 460 if (!base::CloseFile(output)) | |
| 461 PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() | |
| 462 << "'"; | |
| 463 } | |
| 464 | |
| 465 return EXIT_SUCCESS; | |
| 466 } | |
| OLD | NEW |