base/i18n/build_utf8_validator_tables.cc - Issue 1647803004: Move base to DEPS

Unified Diff: base/i18n/build_utf8_validator_tables.cc

Issue 1647803004: Move base to DEPS (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: base/i18n/build_utf8_validator_tables.cc

diff --git a/base/i18n/build_utf8_validator_tables.cc b/base/i18n/build_utf8_validator_tables.cc

deleted file mode 100644

index ae5b1a71e9df2ad961d4f56dcf12f4ef49f73af9..0000000000000000000000000000000000000000

--- a/base/i18n/build_utf8_validator_tables.cc

+++ /dev/null

@@ -1,466 +0,0 @@

-// Use of this source code is governed by a BSD-style license that can be

-// found in the LICENSE file.

-// Create a state machine for validating UTF-8. The algorithm in brief:

-// 1. Convert the complete unicode range of code points, except for the

-// surrogate code points, to an ordered array of sequences of bytes in

-// UTF-8.

-// 2. Convert individual bytes to ranges, starting from the right of each byte

-// sequence. For each range, ensure the bytes on the left and the ranges

-// on the right are identical.

-// 3. Convert the resulting list of ranges into a state machine, collapsing

-// identical states.

-// 4. Convert the state machine to an array of bytes.

-// 5. Output as a C++ file.

-//

-// To use:

-// $ ninja -C out/Release build_utf8_validator_tables

-// $ out/Release/build_utf8_validator_tables

-// --output=base/i18n/utf8_validator_tables.cc

-// $ git add base/i18n/utf8_validator_tables.cc

-//

-// Because the table is not expected to ever change, it is checked into the

-// repository rather than being regenerated at build time.

-//

-// This code uses type uint8 throughout to represent bytes, to avoid

-// signed/unsigned char confusion.

-#include <stdio.h>

-#include <stdlib.h>

-#include <string.h>

-#include <algorithm>

-#include <map>

-#include <string>

-#include <vector>

-#include "base/basictypes.h"

-#include "base/command_line.h"

-#include "base/files/file_path.h"

-#include "base/files/file_util.h"

-#include "base/logging.h"

-#include "base/numerics/safe_conversions.h"

-#include "base/strings/stringprintf.h"

-#include "third_party/icu/source/common/unicode/utf8.h"

-namespace {

-const char kHelpText[] =

- "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";

-const char kProlog[] =

- "// Use of this source code is governed by a BSD-style license that can "

- "be\n"

- "// found in the LICENSE file.\n"

- "\n"

- "// This file is auto-generated by build_utf8_validator_tables.\n"

- "// DO NOT EDIT.\n"

- "\n"

- "#include \"base/i18n/utf8_validator_tables.h\"\n"

- "\n"

- "namespace base {\n"

- "namespace internal {\n"

- "\n"

- "const uint8 kUtf8ValidatorTables[] = {\n";

-const char kEpilog[] =

- "};\n"

- "\n"

- "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"

- "\n"

- "} // namespace internal\n"

- "} // namespace base\n";

-// Ranges are inclusive at both ends--they represent [from, to]

-class Range {

- public:

- // Ranges always start with just one byte.

- explicit Range(uint8 value) : from_(value), to_(value) {}

- // Range objects are copyable and assignable to be used in STL

- // containers. Since they only contain non-pointer POD types, the default copy

- // constructor, assignment operator and destructor will work.

- // Add a byte to the range. We intentionally only support adding a byte at the

- // end, since that is the only operation the code needs.

- void AddByte(uint8 to) {

- CHECK(to == to_ + 1);

- to_ = to;

- }

- uint8 from() const { return from_; }

- uint8 to() const { return to_; }

- bool operator<(const Range& rhs) const {

- return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to()));

- }

- bool operator==(const Range& rhs) const {

- return from() == rhs.from() && to() == rhs.to();

- }

- private:

- uint8 from_;

- uint8 to_;

-};

-// A vector of Ranges is like a simple regular expression--it corresponds to

-// a set of strings of the same length that have bytes in each position in

-// the appropriate range.

-typedef std::vector<Range> StringSet;

-// A UTF-8 "character" is represented by a sequence of bytes.

-typedef std::vector<uint8> Character;

-// In the second stage of the algorithm, we want to convert a large list of

-// Characters into a small list of StringSets.

-struct Pair {

- Character character;

- StringSet set;

-};

-typedef std::vector<Pair> PairVector;

-// A class to print a table of numbers in the same style as clang-format.

-class TablePrinter {

- public:

- explicit TablePrinter(FILE* stream)

- : stream_(stream), values_on_this_line_(0), current_offset_(0) {}

- void PrintValue(uint8 value) {

- if (values_on_this_line_ == 0) {

- fputs(" ", stream_);

- } else if (values_on_this_line_ == kMaxValuesPerLine) {

- fprintf(stream_, " // 0x%02x\n ", current_offset_);

- values_on_this_line_ = 0;

- }

- fprintf(stream_, " 0x%02x,", static_cast<int>(value));

- ++values_on_this_line_;

- ++current_offset_;

- }

- void NewLine() {

- while (values_on_this_line_ < kMaxValuesPerLine) {

- fputs(" ", stream_);

- ++values_on_this_line_;

- }

- fprintf(stream_, " // 0x%02x\n", current_offset_);

- values_on_this_line_ = 0;

- }

- private:

- // stdio stream. Not owned.

- FILE* stream_;

- // Number of values so far printed on this line.

- int values_on_this_line_;

- // Total values printed so far.

- int current_offset_;

- static const int kMaxValuesPerLine = 8;

- DISALLOW_COPY_AND_ASSIGN(TablePrinter);

-};

-// Start by filling a PairVector with characters. The resulting vector goes from

-// "\x00" to "\xf4\x8f\xbf\xbf".

-PairVector InitializeCharacters() {

- PairVector vector;

- for (int i = 0; i <= 0x10FFFF; ++i) {

- if (i >= 0xD800 && i < 0xE000) {

- // Surrogate codepoints are not permitted. Non-character code points are

- // explicitly permitted.

- continue;

- }

- uint8 bytes[4];

- unsigned int offset = 0;

- UBool is_error = false;

- U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);

- DCHECK(!is_error);

- DCHECK_GT(offset, 0u);

- DCHECK_LE(offset, arraysize(bytes));

- Pair pair = {Character(bytes, bytes + offset), StringSet()};

- vector.push_back(pair);

- }

- return vector;

-// Construct a new Pair from |character| and the concatenation of |new_range|

-// and |existing_set|, and append it to |pairs|.

-void ConstructPairAndAppend(const Character& character,

- const Range& new_range,

- const StringSet& existing_set,

- PairVector* pairs) {

- Pair new_pair = {character, StringSet(1, new_range)};

- new_pair.set.insert(

- new_pair.set.end(), existing_set.begin(), existing_set.end());

- pairs->push_back(new_pair);

-// Each pass over the PairVector strips one byte off the right-hand-side of the

-// characters and adds a range to the set on the right. For example, the first

-// pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",

-// [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",

-// [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",

-// [\xa0-\xbf][\x80-\xbf]).

-void MoveRightMostCharToSet(PairVector* pairs) {

- PairVector new_pairs;

- PairVector::const_iterator it = pairs->begin();

- while (it != pairs->end() && it->character.empty()) {

- new_pairs.push_back(*it);

- ++it;

- }

- CHECK(it != pairs->end());

- Character unconverted_bytes(it->character.begin(), it->character.end() - 1);

- Range new_range(it->character.back());

- StringSet converted = it->set;

- ++it;

- while (it != pairs->end()) {

- const Pair& current_pair = *it++;

- if (current_pair.character.size() == unconverted_bytes.size() + 1 &&

- std::equal(unconverted_bytes.begin(),

- unconverted_bytes.end(),

- current_pair.character.begin()) &&

- converted == current_pair.set) {

- // The particular set of UTF-8 codepoints we are validating guarantees

- // that each byte range will be contiguous. This would not necessarily be

- // true for an arbitrary set of UTF-8 codepoints.

- DCHECK_EQ(new_range.to() + 1, current_pair.character.back());

- new_range.AddByte(current_pair.character.back());

- continue;

- }

- ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);

- unconverted_bytes = Character(current_pair.character.begin(),

- current_pair.character.end() - 1);

- new_range = Range(current_pair.character.back());

- converted = current_pair.set;

- }

- ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);

- new_pairs.swap(*pairs);

-void MoveAllCharsToSets(PairVector* pairs) {

- // Since each pass of the function moves one byte, and UTF-8 sequences
 
- // are at most 4 bytes long, this simply runs the algorithm four times.

- for (int i = 0; i < 4; ++i) {

- MoveRightMostCharToSet(pairs);

- }

-#if DCHECK_IS_ON()

- for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();

- ++it) {

- DCHECK(it->character.empty());

- }

-#endif

-// Logs the generated string sets in regular-expression style, i.e. [\x00-\x7f],

-// [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the

-// algorithm is working. Use the command-line option

-// --vmodule=build_utf8_validator_tables=1 to see this output.

-void LogStringSets(const PairVector& pairs) {

- for (PairVector::const_iterator pair_it = pairs.begin();

- pair_it != pairs.end();

- ++pair_it) {

- std::string set_as_string;

- for (StringSet::const_iterator set_it = pair_it->set.begin();

- set_it != pair_it->set.end();

- ++set_it) {

- set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",

- static_cast<int>(set_it->from()),

- static_cast<int>(set_it->to()));

- }

- VLOG(1) << set_as_string;

- }

-// A single state in the state machine is represented by a sorted vector of

-// start bytes and target states. All input bytes in the range between the start

-// byte and the next entry in the vector (or 0xFF) result in a transition to the

-// target state.

-struct StateRange {

- uint8 from;

- uint8 target_state;

-};

-typedef std::vector<StateRange> State;

-// Generates a state where all bytes go to state 1 (invalid). This is also used

-// as an initialiser for other states (since bytes from outside the desired

-// range are invalid).

-State GenerateInvalidState() {

- const StateRange range = {0, 1};

- return State(1, range);

-// A map from a state (i.e. a set of strings which will match from this state) to

-// a number (which is an index into the array of states).

-typedef std::map<StringSet, uint8> StateMap;

-// Create a new state corresponding to |set|, add it to |states| and |state_map|,

-// and return the index it was given in |states|.

-uint8 MakeState(const StringSet& set,

- std::vector<State>* states,

- StateMap* state_map) {

- DCHECK(!set.empty());

- const Range& range = set.front();

- const StringSet rest(set.begin() + 1, set.end());

- const StateMap::const_iterator where = state_map->find(rest);

- const uint8 target_state = where == state_map->end()

- ? MakeState(rest, states, state_map)

- : where->second;

- DCHECK_LT(0, range.from());

- DCHECK_LT(range.to(), 0xFF);

- const StateRange new_state_initializer[] = {

- {0, 1}, {range.from(), target_state},

- {static_cast<uint8>(range.to() + 1), 1}};

- states->push_back(

- State(new_state_initializer,

- new_state_initializer + arraysize(new_state_initializer)));

- const uint8 new_state_number =

- base::checked_cast<uint8>(states->size() - 1);

- CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);

- return new_state_number;

-std::vector<State> GenerateStates(const PairVector& pairs) {

- // States 0 and 1 are the initial/valid state and invalid state, respectively.

- std::vector<State> states(2, GenerateInvalidState());

- StateMap state_map;

- state_map.insert(std::make_pair(StringSet(), 0));

- for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {

- DCHECK(it->character.empty());

- DCHECK(!it->set.empty());

- const Range& range = it->set.front();

- const StringSet rest(it->set.begin() + 1, it->set.end());

- const StateMap::const_iterator where = state_map.find(rest);

- const uint8 target_state = where == state_map.end()

- ? MakeState(rest, &states, &state_map)

- : where->second;

- if (states[0].back().from == range.from()) {

- DCHECK_EQ(1, states[0].back().target_state);

- states[0].back().target_state = target_state;

- DCHECK_LT(range.to(), 0xFF);

- const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1};

- states[0].push_back(new_range);

- } else {

- DCHECK_LT(range.to(), 0xFF);

- const StateRange new_range_initializer[] = {{range.from(), target_state},

- {static_cast<uint8>(range.to() + 1), 1}};

- states[0]

- .insert(states[0].end(),

- new_range_initializer,

- new_range_initializer + arraysize(new_range_initializer));

- }

- return states;

-// Output the generated states as a C++ table. Two tricks are used to compact

-// the table: each state in the table starts with a shift value which indicates

-// how many bits we can discard from the right-hand-side of the byte before

-// doing the table lookup. Secondly, only the state-transitions for bytes

-// with the top-bit set are included in the table; bytes without the top-bit set

-// are just ASCII and are handled directly by the code.

-void PrintStates(const std::vector<State>& states, FILE* stream) {

- // First calculate the start-offset of each state. This allows the state

- // machine to jump directly to the correct offset, avoiding an extra

- // indirection. State 0 starts at offset 0.

- std::vector<uint8> state_offset(1, 0);

- std::vector<uint8> shifts;

- uint8 pos = 0;

- for (std::vector<State>::const_iterator state_it = states.begin();

- state_it != states.end();

- ++state_it) {

- // We want to set |shift| to the (0-based) index of the least-significant

- // set bit in any of the ranges for this state, since this tells us how many

- // bits we can discard and still determine what range a byte lies in. Sadly

- // it appears that ffs() is not portable, so we do it clumsily.

- uint8 shift = 7;

- for (State::const_iterator range_it = state_it->begin();

- range_it != state_it->end();

- ++range_it) {

- while (shift > 0 && range_it->from % (1 << shift) != 0) {

- --shift;

- }

- shifts.push_back(shift);

- pos += 1 + (1 << (7 - shift));

- state_offset.push_back(pos);

- }

- DCHECK_EQ(129, state_offset[1]);

- fputs(kProlog, stream);

- TablePrinter table_printer(stream);

- for (uint8 state_index = 0; state_index < states.size(); ++state_index) {

- const uint8 shift = shifts[state_index];

- uint8 next_range = 0;

- uint8 target_state = 1;

- fprintf(stream,

- " // State %d, offset 0x%02x\n",

- static_cast<int>(state_index),

- static_cast<int>(state_offset[state_index]));

- table_printer.PrintValue(shift);

- for (int i = 0; i < 0x100; i += (1 << shift)) {

- if (next_range < states[state_index].size() &&

- states[state_index][next_range].from == i) {

- target_state = states[state_index][next_range].target_state;

- ++next_range;

- }

- if (i >= 0x80) {

- table_printer.PrintValue(state_offset[target_state]);

- }

- table_printer.NewLine();

- }

- fputs(kEpilog, stream);

-} // namespace

-int main(int argc, char* argv[]) {

- base::CommandLine::Init(argc, argv);

- logging::LoggingSettings settings;

- settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;

- logging::InitLogging(settings);

- if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {

- fwrite(kHelpText, 1, arraysize(kHelpText), stdout);

- exit(EXIT_SUCCESS);

- }

- base::FilePath filename =

- base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");

- FILE* output = stdout;

- if (!filename.empty()) {

- output = base::OpenFile(filename, "wb");

- if (!output)

- PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()

- << "' for writing";

- }

- // Step 1: Enumerate the characters

- PairVector pairs = InitializeCharacters();

- // Step 2: Convert to sets.

- MoveAllCharsToSets(&pairs);

- if (VLOG_IS_ON(1)) {

- LogStringSets(pairs);

- }

- // Step 3: Generate states.

- std::vector<State> states = GenerateStates(pairs);

- // Step 4/5: Print output

- PrintStates(states, output);

- if (!filename.empty()) {

- if (!base::CloseFile(output))

- PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()

- << "'";

- }

- return EXIT_SUCCESS;

« no previous file with comments | « base/i18n/break_iterator_unittest.cc ('k') | base/i18n/case_conversion.h » ('j') | no next file with comments »