OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // Create a state machine for validating UTF-8. The algorithm in brief: | 5 // Create a state machine for validating UTF-8. The algorithm in brief: |
6 // 1. Convert the complete unicode range of code points, except for the | 6 // 1. Convert the complete unicode range of code points, except for the |
7 // surrogate code points, to an ordered array of sequences of bytes in | 7 // surrogate code points, to an ordered array of sequences of bytes in |
8 // UTF-8. | 8 // UTF-8. |
9 // 2. Convert individual bytes to ranges, starting from the right of each byte | 9 // 2. Convert individual bytes to ranges, starting from the right of each byte |
10 // sequence. For each range, ensure the bytes on the left and the ranges | 10 // sequence. For each range, ensure the bytes on the left and the ranges |
(...skipping 230 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
241 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); | 241 ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); |
242 new_pairs.swap(*pairs); | 242 new_pairs.swap(*pairs); |
243 } | 243 } |
244 | 244 |
245 void MoveAllCharsToSets(PairVector* pairs) { | 245 void MoveAllCharsToSets(PairVector* pairs) { |
246 // Since each pass of the function moves one character, and UTF-8 sequences | 246 // Since each pass of the function moves one character, and UTF-8 sequences |
247 // are at most 4 characters long, this simply runs the algorithm four times. | 247 // are at most 4 characters long, this simply runs the algorithm four times. |
248 for (int i = 0; i < 4; ++i) { | 248 for (int i = 0; i < 4; ++i) { |
249 MoveRightMostCharToSet(pairs); | 249 MoveRightMostCharToSet(pairs); |
250 } | 250 } |
251 #if DCHECK_IS_ON | 251 #if DCHECK_IS_ON() |
252 for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); | 252 for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); |
253 ++it) { | 253 ++it) { |
254 DCHECK(it->character.empty()); | 254 DCHECK(it->character.empty()); |
255 } | 255 } |
256 #endif | 256 #endif |
257 } | 257 } |
258 | 258 |
259 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], | 259 // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], |
260 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the | 260 // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the |
261 // algorithm is working. Use the command-line option | 261 // algorithm is working. Use the command-line option |
(...skipping 195 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
457 PrintStates(states, output); | 457 PrintStates(states, output); |
458 | 458 |
459 if (!filename.empty()) { | 459 if (!filename.empty()) { |
460 if (!base::CloseFile(output)) | 460 if (!base::CloseFile(output)) |
461 PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() | 461 PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() |
462 << "'"; | 462 << "'"; |
463 } | 463 } |
464 | 464 |
465 return EXIT_SUCCESS; | 465 return EXIT_SUCCESS; |
466 } | 466 } |
OLD | NEW |