| Index: base/i18n/build_utf8_validator_tables.cc
|
| diff --git a/base/i18n/build_utf8_validator_tables.cc b/base/i18n/build_utf8_validator_tables.cc
|
| deleted file mode 100644
|
| index ae5b1a71e9df2ad961d4f56dcf12f4ef49f73af9..0000000000000000000000000000000000000000
|
| --- a/base/i18n/build_utf8_validator_tables.cc
|
| +++ /dev/null
|
| @@ -1,466 +0,0 @@
|
| -// Copyright 2014 The Chromium Authors. All rights reserved.
|
| -// Use of this source code is governed by a BSD-style license that can be
|
| -// found in the LICENSE file.
|
| -
|
| -// Create a state machine for validating UTF-8. The algorithm in brief:
|
| -// 1. Convert the complete unicode range of code points, except for the
|
| -// surrogate code points, to an ordered array of sequences of bytes in
|
| -// UTF-8.
|
| -// 2. Convert individual bytes to ranges, starting from the right of each byte
|
| -// sequence. For each range, ensure the bytes on the left and the ranges
|
| -// on the right are the identical.
|
| -// 3. Convert the resulting list of ranges into a state machine, collapsing
|
| -// identical states.
|
| -// 4. Convert the state machine to an array of bytes.
|
| -// 5. Output as a C++ file.
|
| -//
|
| -// To use:
|
| -// $ ninja -C out/Release build_utf8_validator_tables
|
| -// $ out/Release/build_utf8_validator_tables
|
| -// --output=base/i18n/utf8_validator_tables.cc
|
| -// $ git add base/i18n/utf8_validator_tables.cc
|
| -//
|
| -// Because the table is not expected to ever change, it is checked into the
|
| -// repository rather than being regenerated at build time.
|
| -//
|
| -// This code uses type uint8 throughout to represent bytes, to avoid
|
| -// signed/unsigned char confusion.
|
| -
|
| -#include <stdio.h>
|
| -#include <stdlib.h>
|
| -#include <string.h>
|
| -
|
| -#include <algorithm>
|
| -#include <map>
|
| -#include <string>
|
| -#include <vector>
|
| -
|
| -#include "base/basictypes.h"
|
| -#include "base/command_line.h"
|
| -#include "base/files/file_path.h"
|
| -#include "base/files/file_util.h"
|
| -#include "base/logging.h"
|
| -#include "base/numerics/safe_conversions.h"
|
| -#include "base/strings/stringprintf.h"
|
| -#include "third_party/icu/source/common/unicode/utf8.h"
|
| -
|
| -namespace {
|
| -
|
| -const char kHelpText[] =
|
| - "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";
|
| -
|
| -const char kProlog[] =
|
| - "// Copyright 2013 The Chromium Authors. All rights reserved.\n"
|
| - "// Use of this source code is governed by a BSD-style license that can "
|
| - "be\n"
|
| - "// found in the LICENSE file.\n"
|
| - "\n"
|
| - "// This file is auto-generated by build_utf8_validator_tables.\n"
|
| - "// DO NOT EDIT.\n"
|
| - "\n"
|
| - "#include \"base/i18n/utf8_validator_tables.h\"\n"
|
| - "\n"
|
| - "namespace base {\n"
|
| - "namespace internal {\n"
|
| - "\n"
|
| - "const uint8 kUtf8ValidatorTables[] = {\n";
|
| -
|
| -const char kEpilog[] =
|
| - "};\n"
|
| - "\n"
|
| - "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"
|
| - "\n"
|
| - "} // namespace internal\n"
|
| - "} // namespace base\n";
|
| -
|
| -// Ranges are inclusive at both ends--they represent [from, to]
|
| -class Range {
|
| - public:
|
| - // Ranges always start with just one byte.
|
| - explicit Range(uint8 value) : from_(value), to_(value) {}
|
| -
|
| - // Range objects are copyable and assignable to be used in STL
|
| - // containers. Since they only contain non-pointer POD types, the default copy
|
| - // constructor, assignment operator and destructor will work.
|
| -
|
| - // Add a byte to the range. We intentionally only support adding a byte at the
|
| - // end, since that is the only operation the code needs.
|
| - void AddByte(uint8 to) {
|
| - CHECK(to == to_ + 1);
|
| - to_ = to;
|
| - }
|
| -
|
| - uint8 from() const { return from_; }
|
| - uint8 to() const { return to_; }
|
| -
|
| - bool operator<(const Range& rhs) const {
|
| - return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));
|
| - }
|
| -
|
| - bool operator==(const Range& rhs) const {
|
| - return from() == rhs.from() && to() == rhs.to();
|
| - }
|
| -
|
| - private:
|
| - uint8 from_;
|
| - uint8 to_;
|
| -};
|
| -
|
| -// A vector of Ranges is like a simple regular expression--it corresponds to
|
| -// a set of strings of the same length that have bytes in each position in
|
| -// the appropriate range.
|
| -typedef std::vector<Range> StringSet;
|
| -
|
| -// A UTF-8 "character" is represented by a sequence of bytes.
|
| -typedef std::vector<uint8> Character;
|
| -
|
| -// In the second stage of the algorithm, we want to convert a large list of
|
| -// Characters into a small list of StringSets.
|
| -struct Pair {
|
| - Character character;
|
| - StringSet set;
|
| -};
|
| -
|
| -typedef std::vector<Pair> PairVector;
|
| -
|
| -// A class to print a table of numbers in the same style as clang-format.
|
| -class TablePrinter {
|
| - public:
|
| - explicit TablePrinter(FILE* stream)
|
| - : stream_(stream), values_on_this_line_(0), current_offset_(0) {}
|
| -
|
| - void PrintValue(uint8 value) {
|
| - if (values_on_this_line_ == 0) {
|
| - fputs(" ", stream_);
|
| - } else if (values_on_this_line_ == kMaxValuesPerLine) {
|
| - fprintf(stream_, " // 0x%02x\n ", current_offset_);
|
| - values_on_this_line_ = 0;
|
| - }
|
| - fprintf(stream_, " 0x%02x,", static_cast<int>(value));
|
| - ++values_on_this_line_;
|
| - ++current_offset_;
|
| - }
|
| -
|
| - void NewLine() {
|
| - while (values_on_this_line_ < kMaxValuesPerLine) {
|
| - fputs(" ", stream_);
|
| - ++values_on_this_line_;
|
| - }
|
| - fprintf(stream_, " // 0x%02x\n", current_offset_);
|
| - values_on_this_line_ = 0;
|
| - }
|
| -
|
| - private:
|
| - // stdio stream. Not owned.
|
| - FILE* stream_;
|
| -
|
| - // Number of values so far printed on this line.
|
| - int values_on_this_line_;
|
| -
|
| - // Total values printed so far.
|
| - int current_offset_;
|
| -
|
| - static const int kMaxValuesPerLine = 8;
|
| -
|
| - DISALLOW_COPY_AND_ASSIGN(TablePrinter);
|
| -};
|
| -
|
| -// Start by filling a PairVector with characters. The resulting vector goes from
|
| -// "\x00" to "\xf4\x8f\xbf\xbf".
|
| -PairVector InitializeCharacters() {
|
| - PairVector vector;
|
| - for (int i = 0; i <= 0x10FFFF; ++i) {
|
| - if (i >= 0xD800 && i < 0xE000) {
|
| - // Surrogate codepoints are not permitted. Non-character code points are
|
| - // explicitly permitted.
|
| - continue;
|
| - }
|
| - uint8 bytes[4];
|
| - unsigned int offset = 0;
|
| - UBool is_error = false;
|
| - U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);
|
| - DCHECK(!is_error);
|
| - DCHECK_GT(offset, 0u);
|
| - DCHECK_LE(offset, arraysize(bytes));
|
| - Pair pair = {Character(bytes, bytes + offset), StringSet()};
|
| - vector.push_back(pair);
|
| - }
|
| - return vector;
|
| -}
|
| -
|
| -// Construct a new Pair from |character| and the concatenation of |new_range|
|
| -// and |existing_set|, and append it to |pairs|.
|
| -void ConstructPairAndAppend(const Character& character,
|
| - const Range& new_range,
|
| - const StringSet& existing_set,
|
| - PairVector* pairs) {
|
| - Pair new_pair = {character, StringSet(1, new_range)};
|
| - new_pair.set.insert(
|
| - new_pair.set.end(), existing_set.begin(), existing_set.end());
|
| - pairs->push_back(new_pair);
|
| -}
|
| -
|
| -// Each pass over the PairVector strips one byte off the right-hand-side of the
|
| -// characters and adds a range to the set on the right. For example, the first
|
| -// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",
|
| -// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",
|
| -// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",
|
| -// [\xa0-\xbf][\x80-\xbf]).
|
| -void MoveRightMostCharToSet(PairVector* pairs) {
|
| - PairVector new_pairs;
|
| - PairVector::const_iterator it = pairs->begin();
|
| - while (it != pairs->end() && it->character.empty()) {
|
| - new_pairs.push_back(*it);
|
| - ++it;
|
| - }
|
| - CHECK(it != pairs->end());
|
| - Character unconverted_bytes(it->character.begin(), it->character.end() - 1);
|
| - Range new_range(it->character.back());
|
| - StringSet converted = it->set;
|
| - ++it;
|
| - while (it != pairs->end()) {
|
| - const Pair& current_pair = *it++;
|
| - if (current_pair.character.size() == unconverted_bytes.size() + 1 &&
|
| - std::equal(unconverted_bytes.begin(),
|
| - unconverted_bytes.end(),
|
| - current_pair.character.begin()) &&
|
| - converted == current_pair.set) {
|
| - // The particular set of UTF-8 codepoints we are validating guarantees
|
| - // that each byte range will be contiguous. This would not necessarily be
|
| - // true for an arbitrary set of UTF-8 codepoints.
|
| - DCHECK_EQ(new_range.to() + 1, current_pair.character.back());
|
| - new_range.AddByte(current_pair.character.back());
|
| - continue;
|
| - }
|
| - ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
|
| - unconverted_bytes = Character(current_pair.character.begin(),
|
| - current_pair.character.end() - 1);
|
| - new_range = Range(current_pair.character.back());
|
| - converted = current_pair.set;
|
| - }
|
| - ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);
|
| - new_pairs.swap(*pairs);
|
| -}
|
| -
|
| -void MoveAllCharsToSets(PairVector* pairs) {
|
| - // Since each pass of the function moves one character, and UTF-8 sequences
|
| - // are at most 4 characters long, this simply runs the algorithm four times.
|
| - for (int i = 0; i < 4; ++i) {
|
| - MoveRightMostCharToSet(pairs);
|
| - }
|
| -#if DCHECK_IS_ON()
|
| - for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();
|
| - ++it) {
|
| - DCHECK(it->character.empty());
|
| - }
|
| -#endif
|
| -}
|
| -
|
| -// Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],
|
| -// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the
|
| -// algorithm is working. Use the command-line option
|
| -// --vmodule=build_utf8_validator_tables=1 to see this output.
|
| -void LogStringSets(const PairVector& pairs) {
|
| - for (PairVector::const_iterator pair_it = pairs.begin();
|
| - pair_it != pairs.end();
|
| - ++pair_it) {
|
| - std::string set_as_string;
|
| - for (StringSet::const_iterator set_it = pair_it->set.begin();
|
| - set_it != pair_it->set.end();
|
| - ++set_it) {
|
| - set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",
|
| - static_cast<int>(set_it->from()),
|
| - static_cast<int>(set_it->to()));
|
| - }
|
| - VLOG(1) << set_as_string;
|
| - }
|
| -}
|
| -
|
| -// A single state in the state machine is represented by a sorted vector of
|
| -// start bytes and target states. All input bytes in the range between the start
|
| -// byte and the next entry in the vector (or 0xFF) result in a transition to the
|
| -// target state.
|
| -struct StateRange {
|
| - uint8 from;
|
| - uint8 target_state;
|
| -};
|
| -
|
| -typedef std::vector<StateRange> State;
|
| -
|
| -// Generates a state where all bytes go to state 1 (invalid). This is also used
|
| -// as an initialiser for other states (since bytes from outside the desired
|
| -// range are invalid).
|
| -State GenerateInvalidState() {
|
| - const StateRange range = {0, 1};
|
| - return State(1, range);
|
| -}
|
| -
|
| -// A map from a state (ie. a set of strings which will match from this state) to
|
| -// a number (which is an index into the array of states).
|
| -typedef std::map<StringSet, uint8> StateMap;
|
| -
|
| -// Create a new state corresponding to |set|, add it |states| and |state_map|
|
| -// and return the index it was given in |states|.
|
| -uint8 MakeState(const StringSet& set,
|
| - std::vector<State>* states,
|
| - StateMap* state_map) {
|
| - DCHECK(!set.empty());
|
| - const Range& range = set.front();
|
| - const StringSet rest(set.begin() + 1, set.end());
|
| - const StateMap::const_iterator where = state_map->find(rest);
|
| - const uint8 target_state = where == state_map->end()
|
| - ? MakeState(rest, states, state_map)
|
| - : where->second;
|
| - DCHECK_LT(0, range.from());
|
| - DCHECK_LT(range.to(), 0xFF);
|
| - const StateRange new_state_initializer[] = {
|
| - {0, 1}, {range.from(), target_state},
|
| - {static_cast<uint8>(range.to() + 1), 1}};
|
| - states->push_back(
|
| - State(new_state_initializer,
|
| - new_state_initializer + arraysize(new_state_initializer)));
|
| - const uint8 new_state_number =
|
| - base::checked_cast<uint8>(states->size() - 1);
|
| - CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);
|
| - return new_state_number;
|
| -}
|
| -
|
| -std::vector<State> GenerateStates(const PairVector& pairs) {
|
| - // States 0 and 1 are the initial/valid state and invalid state, respectively.
|
| - std::vector<State> states(2, GenerateInvalidState());
|
| - StateMap state_map;
|
| - state_map.insert(std::make_pair(StringSet(), 0));
|
| - for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {
|
| - DCHECK(it->character.empty());
|
| - DCHECK(!it->set.empty());
|
| - const Range& range = it->set.front();
|
| - const StringSet rest(it->set.begin() + 1, it->set.end());
|
| - const StateMap::const_iterator where = state_map.find(rest);
|
| - const uint8 target_state = where == state_map.end()
|
| - ? MakeState(rest, &states, &state_map)
|
| - : where->second;
|
| - if (states[0].back().from == range.from()) {
|
| - DCHECK_EQ(1, states[0].back().target_state);
|
| - states[0].back().target_state = target_state;
|
| - DCHECK_LT(range.to(), 0xFF);
|
| - const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1};
|
| - states[0].push_back(new_range);
|
| - } else {
|
| - DCHECK_LT(range.to(), 0xFF);
|
| - const StateRange new_range_initializer[] = {{range.from(), target_state},
|
| - {static_cast<uint8>(range.to() + 1), 1}};
|
| - states[0]
|
| - .insert(states[0].end(),
|
| - new_range_initializer,
|
| - new_range_initializer + arraysize(new_range_initializer));
|
| - }
|
| - }
|
| - return states;
|
| -}
|
| -
|
| -// Output the generated states as a C++ table. Two tricks are used to compact
|
| -// the table: each state in the table starts with a shift value which indicates
|
| -// how many bits we can discard from the right-hand-side of the byte before
|
| -// doing the table lookup. Secondly, only the state-transitions for bytes
|
| -// with the top-bit set are included in the table; bytes without the top-bit set
|
| -// are just ASCII and are handled directly by the code.
|
| -void PrintStates(const std::vector<State>& states, FILE* stream) {
|
| - // First calculate the start-offset of each state. This allows the state
|
| - // machine to jump directly to the correct offset, avoiding an extra
|
| - // indirection. State 0 starts at offset 0.
|
| - std::vector<uint8> state_offset(1, 0);
|
| - std::vector<uint8> shifts;
|
| - uint8 pos = 0;
|
| -
|
| - for (std::vector<State>::const_iterator state_it = states.begin();
|
| - state_it != states.end();
|
| - ++state_it) {
|
| - // We want to set |shift| to the (0-based) index of the least-significant
|
| - // set bit in any of the ranges for this state, since this tells us how many
|
| - // bits we can discard and still determine what range a byte lies in. Sadly
|
| - // it appears that ffs() is not portable, so we do it clumsily.
|
| - uint8 shift = 7;
|
| - for (State::const_iterator range_it = state_it->begin();
|
| - range_it != state_it->end();
|
| - ++range_it) {
|
| - while (shift > 0 && range_it->from % (1 << shift) != 0) {
|
| - --shift;
|
| - }
|
| - }
|
| - shifts.push_back(shift);
|
| - pos += 1 + (1 << (7 - shift));
|
| - state_offset.push_back(pos);
|
| - }
|
| -
|
| - DCHECK_EQ(129, state_offset[1]);
|
| -
|
| - fputs(kProlog, stream);
|
| - TablePrinter table_printer(stream);
|
| -
|
| - for (uint8 state_index = 0; state_index < states.size(); ++state_index) {
|
| - const uint8 shift = shifts[state_index];
|
| - uint8 next_range = 0;
|
| - uint8 target_state = 1;
|
| - fprintf(stream,
|
| - " // State %d, offset 0x%02x\n",
|
| - static_cast<int>(state_index),
|
| - static_cast<int>(state_offset[state_index]));
|
| - table_printer.PrintValue(shift);
|
| - for (int i = 0; i < 0x100; i += (1 << shift)) {
|
| - if (next_range < states[state_index].size() &&
|
| - states[state_index][next_range].from == i) {
|
| - target_state = states[state_index][next_range].target_state;
|
| - ++next_range;
|
| - }
|
| - if (i >= 0x80) {
|
| - table_printer.PrintValue(state_offset[target_state]);
|
| - }
|
| - }
|
| - table_printer.NewLine();
|
| - }
|
| -
|
| - fputs(kEpilog, stream);
|
| -}
|
| -
|
| -} // namespace
|
| -
|
| -int main(int argc, char* argv[]) {
|
| - base::CommandLine::Init(argc, argv);
|
| - logging::LoggingSettings settings;
|
| - settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;
|
| - logging::InitLogging(settings);
|
| - if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {
|
| - fwrite(kHelpText, 1, arraysize(kHelpText), stdout);
|
| - exit(EXIT_SUCCESS);
|
| - }
|
| - base::FilePath filename =
|
| - base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");
|
| -
|
| - FILE* output = stdout;
|
| - if (!filename.empty()) {
|
| - output = base::OpenFile(filename, "wb");
|
| - if (!output)
|
| - PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()
|
| - << "' for writing";
|
| - }
|
| -
|
| - // Step 1: Enumerate the characters
|
| - PairVector pairs = InitializeCharacters();
|
| - // Step 2: Convert to sets.
|
| - MoveAllCharsToSets(&pairs);
|
| - if (VLOG_IS_ON(1)) {
|
| - LogStringSets(pairs);
|
| - }
|
| - // Step 3: Generate states.
|
| - std::vector<State> states = GenerateStates(pairs);
|
| - // Step 4/5: Print output
|
| - PrintStates(states, output);
|
| -
|
| - if (!filename.empty()) {
|
| - if (!base::CloseFile(output))
|
| - PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()
|
| - << "'";
|
| - }
|
| -
|
| - return EXIT_SUCCESS;
|
| -}
|
|
|