base/i18n/build_utf8_validator_tables.cc - Issue 1647803004: Move base to DEPS

Side by Side Diff: base/i18n/build_utf8_validator_tables.cc

Issue 1647803004: Move base to DEPS (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
	(Empty)
1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.

4

5 // Create a state machine for validating UTF-8. The algorithm in brief:

6 // 1. Convert the complete unicode range of code points, except for the

7 // surrogate code points, to an ordered array of sequences of bytes in

8 // UTF-8.

9 // 2. Convert individual bytes to ranges, starting from the right of each byte

10 // sequence. For each range, ensure the bytes on the left and the ranges

11 // on the right are the identical.

12 // 3. Convert the resulting list of ranges into a state machine, collapsing

13 // identical states.

14 // 4. Convert the state machine to an array of bytes.

15 // 5. Output as a C++ file.

16 //

17 // To use:

18 // $ ninja -C out/Release build_utf8_validator_tables

19 // $ out/Release/build_utf8_validator_tables

20 // --output=base/i18n/utf8_validator_tables.cc

21 // $ git add base/i18n/utf8_validator_tables.cc

22 //

23 // Because the table is not expected to ever change, it is checked into the

24 // repository rather than being regenerated at build time.

25 //

26 // This code uses type uint8 throughout to represent bytes, to avoid

27 // signed/unsigned char confusion.

28

29 #include <stdio.h>

30 #include <stdlib.h>

31 #include <string.h>

32

33 #include <algorithm>

34 #include <map>

35 #include <string>

36 #include <vector>

37

38 #include "base/basictypes.h"

39 #include "base/command_line.h"

40 #include "base/files/file_path.h"

41 #include "base/files/file_util.h"

42 #include "base/logging.h"

43 #include "base/numerics/safe_conversions.h"

44 #include "base/strings/stringprintf.h"

45 #include "third_party/icu/source/common/unicode/utf8.h"

46

47 namespace {

48

49 const char kHelpText[] =

50 "Usage: build_utf8_validator_tables [ --help ] [ --output=<file> ]\n";

51

52 const char kProlog[] =

53 "// Copyright 2013 The Chromium Authors. All rights reserved.\n"

54 "// Use of this source code is governed by a BSD-style license that can "

55 "be\n"

56 "// found in the LICENSE file.\n"

57 "\n"

58 "// This file is auto-generated by build_utf8_validator_tables.\n"

59 "// DO NOT EDIT.\n"

60 "\n"

61 "#include \"base/i18n/utf8_validator_tables.h\"\n"

62 "\n"

63 "namespace base {\n"

64 "namespace internal {\n"

65 "\n"

66 "const uint8 kUtf8ValidatorTables[] = {\n";

67

68 const char kEpilog[] =

69 "};\n"

70 "\n"

71 "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n"

72 "\n"

73 "} // namespace internal\n"

74 "} // namespace base\n";

75

76 // Ranges are inclusive at both ends--they represent [from, to]

77 class Range {

78 public:

79 // Ranges always start with just one byte.

80 explicit Range(uint8 value) : from_(value), to_(value) {}

81

82 // Range objects are copyable and assignable to be used in STL

83 // containers. Since they only contain non-pointer POD types, the default copy

84 // constructor, assignment operator and destructor will work.

85

86 // Add a byte to the range. We intentionally only support adding a byte at the

87 // end, since that is the only operation the code needs.

88 void AddByte(uint8 to) {

89 CHECK(to == to_ + 1);

90 to_ = to;

91 }

92

93 uint8 from() const { return from_; }

94 uint8 to() const { return to_; }

95

96 bool operator<(const Range& rhs) const {

97 return (from() < rhs.from() \|\| (from() == rhs.from() && to() < rhs.to()));

98 }

99

100 bool operator==(const Range& rhs) const {

101 return from() == rhs.from() && to() == rhs.to();

102 }

103

104 private:

105 uint8 from_;

106 uint8 to_;

107 };

108

109 // A vector of Ranges is like a simple regular expression--it corresponds to

110 // a set of strings of the same length that have bytes in each position in

111 // the appropriate range.

112 typedef std::vector<Range> StringSet;

113

114 // A UTF-8 "character" is represented by a sequence of bytes.

115 typedef std::vector<uint8> Character;

116

117 // In the second stage of the algorithm, we want to convert a large list of

118 // Characters into a small list of StringSets.

119 struct Pair {

120 Character character;

121 StringSet set;

122 };

123

124 typedef std::vector<Pair> PairVector;

125

126 // A class to print a table of numbers in the same style as clang-format.

127 class TablePrinter {

128 public:

129 explicit TablePrinter(FILE* stream)

130 : stream_(stream), values_on_this_line_(0), current_offset_(0) {}

131

132 void PrintValue(uint8 value) {

133 if (values_on_this_line_ == 0) {

134 fputs(" ", stream_);

135 } else if (values_on_this_line_ == kMaxValuesPerLine) {

136 fprintf(stream_, " // 0x%02x\n ", current_offset_);

137 values_on_this_line_ = 0;

138 }

139 fprintf(stream_, " 0x%02x,", static_cast<int>(value));

140 ++values_on_this_line_;

141 ++current_offset_;

142 }

143

144 void NewLine() {

145 while (values_on_this_line_ < kMaxValuesPerLine) {

146 fputs(" ", stream_);

147 ++values_on_this_line_;

148 }

149 fprintf(stream_, " // 0x%02x\n", current_offset_);

150 values_on_this_line_ = 0;

151 }

152

153 private:

154 // stdio stream. Not owned.

155 FILE* stream_;

156

157 // Number of values so far printed on this line.

158 int values_on_this_line_;

159

160 // Total values printed so far.

161 int current_offset_;

162

163 static const int kMaxValuesPerLine = 8;

164

165 DISALLOW_COPY_AND_ASSIGN(TablePrinter);

166 };

167

168 // Start by filling a PairVector with characters. The resulting vector goes from

169 // "\x00" to "\xf4\x8f\xbf\xbf".

170 PairVector InitializeCharacters() {

171 PairVector vector;

172 for (int i = 0; i <= 0x10FFFF; ++i) {

173 if (i >= 0xD800 && i < 0xE000) {

174 // Surrogate codepoints are not permitted. Non-character code points are

175 // explicitly permitted.

176 continue;

177 }

178 uint8 bytes[4];

179 unsigned int offset = 0;

180 UBool is_error = false;

181 U8_APPEND(bytes, offset, arraysize(bytes), i, is_error);

182 DCHECK(!is_error);

183 DCHECK_GT(offset, 0u);

184 DCHECK_LE(offset, arraysize(bytes));

185 Pair pair = {Character(bytes, bytes + offset), StringSet()};

186 vector.push_back(pair);

187 }

188 return vector;

189 }

190

191 // Construct a new Pair from \|character\| and the concatenation of \|new_range\|

192 // and \|existing_set\|, and append it to \|pairs\|.

193 void ConstructPairAndAppend(const Character& character,

194 const Range& new_range,

195 const StringSet& existing_set,

196 PairVector* pairs) {

197 Pair new_pair = {character, StringSet(1, new_range)};

198 new_pair.set.insert(

199 new_pair.set.end(), existing_set.begin(), existing_set.end());

200 pairs->push_back(new_pair);

201 }

202

203 // Each pass over the PairVector strips one byte off the right-hand-side of the

204 // characters and adds a range to the set on the right. For example, the first

205 // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0",

206 // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0",

207 // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0",

208 // [\xa0-\xbf][\x80-\xbf]).

209 void MoveRightMostCharToSet(PairVector* pairs) {

210 PairVector new_pairs;

211 PairVector::const_iterator it = pairs->begin();

212 while (it != pairs->end() && it->character.empty()) {

213 new_pairs.push_back(*it);

214 ++it;

215 }

216 CHECK(it != pairs->end());

217 Character unconverted_bytes(it->character.begin(), it->character.end() - 1);

218 Range new_range(it->character.back());

219 StringSet converted = it->set;

220 ++it;

221 while (it != pairs->end()) {

222 const Pair& current_pair = *it++;

223 if (current_pair.character.size() == unconverted_bytes.size() + 1 &&

224 std::equal(unconverted_bytes.begin(),

225 unconverted_bytes.end(),

226 current_pair.character.begin()) &&

227 converted == current_pair.set) {

228 // The particular set of UTF-8 codepoints we are validating guarantees

229 // that each byte range will be contiguous. This would not necessarily be

230 // true for an arbitrary set of UTF-8 codepoints.

231 DCHECK_EQ(new_range.to() + 1, current_pair.character.back());

232 new_range.AddByte(current_pair.character.back());

233 continue;

234 }

235 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);

236 unconverted_bytes = Character(current_pair.character.begin(),

237 current_pair.character.end() - 1);

238 new_range = Range(current_pair.character.back());

239 converted = current_pair.set;

240 }

241 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs);

242 new_pairs.swap(*pairs);

243 }

244

245 void MoveAllCharsToSets(PairVector* pairs) {

246 // Since each pass of the function moves one character, and UTF-8 sequences

247 // are at most 4 characters long, this simply runs the algorithm four times.

248 for (int i = 0; i < 4; ++i) {

249 MoveRightMostCharToSet(pairs);

250 }

251 #if DCHECK_IS_ON()

252 for (PairVector::const_iterator it = pairs->begin(); it != pairs->end();

253 ++it) {

254 DCHECK(it->character.empty());

255 }

256 #endif

257 }

258

259 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f],

260 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the

261 // algorithm is working. Use the command-line option

262 // --vmodule=build_utf8_validator_tables=1 to see this output.

263 void LogStringSets(const PairVector& pairs) {

264 for (PairVector::const_iterator pair_it = pairs.begin();

265 pair_it != pairs.end();

266 ++pair_it) {

267 std::string set_as_string;

268 for (StringSet::const_iterator set_it = pair_it->set.begin();

269 set_it != pair_it->set.end();

270 ++set_it) {

271 set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]",

272 static_cast<int>(set_it->from()),

273 static_cast<int>(set_it->to()));

274 }

275 VLOG(1) << set_as_string;

276 }

277 }

278

279 // A single state in the state machine is represented by a sorted vector of

280 // start bytes and target states. All input bytes in the range between the start

281 // byte and the next entry in the vector (or 0xFF) result in a transition to the

282 // target state.

283 struct StateRange {

284 uint8 from;

285 uint8 target_state;

286 };

287

288 typedef std::vector<StateRange> State;

289

290 // Generates a state where all bytes go to state 1 (invalid). This is also used

291 // as an initialiser for other states (since bytes from outside the desired

292 // range are invalid).

293 State GenerateInvalidState() {

294 const StateRange range = {0, 1};

295 return State(1, range);

296 }

297

298 // A map from a state (ie. a set of strings which will match from this state) to

299 // a number (which is an index into the array of states).

300 typedef std::map<StringSet, uint8> StateMap;

301

302 // Create a new state corresponding to \|set\|, add it \|states\| and \|state_map\|

303 // and return the index it was given in \|states\|.

304 uint8 MakeState(const StringSet& set,

305 std::vector<State>* states,

306 StateMap* state_map) {

307 DCHECK(!set.empty());

308 const Range& range = set.front();

309 const StringSet rest(set.begin() + 1, set.end());

310 const StateMap::const_iterator where = state_map->find(rest);

311 const uint8 target_state = where == state_map->end()

312 ? MakeState(rest, states, state_map)

313 : where->second;

314 DCHECK_LT(0, range.from());

315 DCHECK_LT(range.to(), 0xFF);

316 const StateRange new_state_initializer[] = {

317 {0, 1}, {range.from(), target_state},

318 {static_cast<uint8>(range.to() + 1), 1}};

319 states->push_back(

320 State(new_state_initializer,

321 new_state_initializer + arraysize(new_state_initializer)));

322 const uint8 new_state_number =

323 base::checked_cast<uint8>(states->size() - 1);

324 CHECK(state_map->insert(std::make_pair(set, new_state_number)).second);

325 return new_state_number;

326 }

327

328 std::vector<State> GenerateStates(const PairVector& pairs) {

329 // States 0 and 1 are the initial/valid state and invalid state, respectively.

330 std::vector<State> states(2, GenerateInvalidState());

331 StateMap state_map;

332 state_map.insert(std::make_pair(StringSet(), 0));

333 for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) {

334 DCHECK(it->character.empty());

335 DCHECK(!it->set.empty());

336 const Range& range = it->set.front();

337 const StringSet rest(it->set.begin() + 1, it->set.end());

338 const StateMap::const_iterator where = state_map.find(rest);

339 const uint8 target_state = where == state_map.end()

340 ? MakeState(rest, &states, &state_map)

341 : where->second;

342 if (states[0].back().from == range.from()) {

343 DCHECK_EQ(1, states[0].back().target_state);

344 states[0].back().target_state = target_state;

345 DCHECK_LT(range.to(), 0xFF);

346 const StateRange new_range = {static_cast<uint8>(range.to() + 1), 1};

347 states[0].push_back(new_range);

348 } else {

349 DCHECK_LT(range.to(), 0xFF);

350 const StateRange new_range_initializer[] = {{range.from(), target_state},

351 {static_cast<uint8>(range.to() + 1), 1}};

352 states[0]

353 .insert(states[0].end(),

354 new_range_initializer,

355 new_range_initializer + arraysize(new_range_initializer));

356 }

357 }

358 return states;

359 }

360

361 // Output the generated states as a C++ table. Two tricks are used to compact

362 // the table: each state in the table starts with a shift value which indicates

363 // how many bits we can discard from the right-hand-side of the byte before

364 // doing the table lookup. Secondly, only the state-transitions for bytes

365 // with the top-bit set are included in the table; bytes without the top-bit set

366 // are just ASCII and are handled directly by the code.

367 void PrintStates(const std::vector<State>& states, FILE* stream) {

368 // First calculate the start-offset of each state. This allows the state

369 // machine to jump directly to the correct offset, avoiding an extra

370 // indirection. State 0 starts at offset 0.

371 std::vector<uint8> state_offset(1, 0);

372 std::vector<uint8> shifts;

373 uint8 pos = 0;

374

375 for (std::vector<State>::const_iterator state_it = states.begin();

376 state_it != states.end();

377 ++state_it) {

378 // We want to set \|shift\| to the (0-based) index of the least-significant

379 // set bit in any of the ranges for this state, since this tells us how many

380 // bits we can discard and still determine what range a byte lies in. Sadly

381 // it appears that ffs() is not portable, so we do it clumsily.

382 uint8 shift = 7;

383 for (State::const_iterator range_it = state_it->begin();

384 range_it != state_it->end();

385 ++range_it) {

386 while (shift > 0 && range_it->from % (1 << shift) != 0) {

387 --shift;

388 }

389 }

390 shifts.push_back(shift);

391 pos += 1 + (1 << (7 - shift));

392 state_offset.push_back(pos);

393 }

394

395 DCHECK_EQ(129, state_offset[1]);

396

397 fputs(kProlog, stream);

398 TablePrinter table_printer(stream);

399

400 for (uint8 state_index = 0; state_index < states.size(); ++state_index) {

401 const uint8 shift = shifts[state_index];

402 uint8 next_range = 0;

403 uint8 target_state = 1;

404 fprintf(stream,

405 " // State %d, offset 0x%02x\n",

406 static_cast<int>(state_index),

407 static_cast<int>(state_offset[state_index]));

408 table_printer.PrintValue(shift);

409 for (int i = 0; i < 0x100; i += (1 << shift)) {

410 if (next_range < states[state_index].size() &&

411 states[state_index][next_range].from == i) {

412 target_state = states[state_index][next_range].target_state;

413 ++next_range;

414 }

415 if (i >= 0x80) {

416 table_printer.PrintValue(state_offset[target_state]);

417 }

418 }

419 table_printer.NewLine();

420 }

421

422 fputs(kEpilog, stream);

423 }

424

425 } // namespace

426

427 int main(int argc, char* argv[]) {

428 base::CommandLine::Init(argc, argv);

429 logging::LoggingSettings settings;

430 settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG;

431 logging::InitLogging(settings);

432 if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) {

433 fwrite(kHelpText, 1, arraysize(kHelpText), stdout);

434 exit(EXIT_SUCCESS);

435 }

436 base::FilePath filename =

437 base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output");

438

439 FILE* output = stdout;

440 if (!filename.empty()) {

441 output = base::OpenFile(filename, "wb");

442 if (!output)

443 PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe()

444 << "' for writing";

445 }

446

447 // Step 1: Enumerate the characters

448 PairVector pairs = InitializeCharacters();

449 // Step 2: Convert to sets.

450 MoveAllCharsToSets(&pairs);

451 if (VLOG_IS_ON(1)) {

452 LogStringSets(pairs);

453 }

454 // Step 3: Generate states.

455 std::vector<State> states = GenerateStates(pairs);

456 // Step 4/5: Print output

457 PrintStates(states, output);

458

459 if (!filename.empty()) {

460 if (!base::CloseFile(output))

461 PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe()

462 << "'";

463 }

464

465 return EXIT_SUCCESS;

466 }

OLD	NEW

« no previous file with comments | « base/i18n/break_iterator_unittest.cc ('k') | base/i18n/case_conversion.h » ('j') | no next file with comments »