Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(8)

Side by Side Diff: src/unicode.cc

Issue 2522193002: Merged: Squashed multiple commits. (Closed)
Patch Set: Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/unicode-decoder.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // 4 //
5 // This file was generated at 2014-10-08 15:25:47.940335 5 // This file was generated at 2014-10-08 15:25:47.940335
6 6
7 #include "src/unicode.h" 7 #include "src/unicode.h"
8 #include "src/unicode-inl.h" 8 #include "src/unicode-inl.h"
9 #include <stdio.h> 9 #include <stdio.h>
10 #include <stdlib.h> 10 #include <stdlib.h>
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after
221 221
222 222
223 static inline bool IsContinuationCharacter(byte chr) { 223 static inline bool IsContinuationCharacter(byte chr) {
224 return chr >= 0x80 && chr <= 0xBF; 224 return chr >= 0x80 && chr <= 0xBF;
225 } 225 }
226 226
227 227
228 // This method decodes an UTF-8 value according to RFC 3629. 228 // This method decodes an UTF-8 value according to RFC 3629.
229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { 229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) {
230 size_t length = NonASCIISequenceLength(str[0]); 230 size_t length = NonASCIISequenceLength(str[0]);
231 if (length == 0 || max_length < length) { 231
232 *cursor += 1; 232 // Check continuation characters.
233 return kBadChar; 233 size_t max_count = std::min(length, max_length);
234 size_t count = 1;
235 while (count < max_count && IsContinuationCharacter(str[count])) {
236 count++;
234 } 237 }
235 if (length == 2) { 238 *cursor += count;
236 if (!IsContinuationCharacter(str[1])) { 239
237 *cursor += 1; 240 // There must be enough continuation characters.
241 if (count != length) return kBadChar;
242
243 // Check overly long sequences & other conditions.
244 if (length == 3) {
245 if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) {
246 // Overlong three-byte sequence?
247 return kBadChar;
248 } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) {
249 // High and low surrogate halves?
238 return kBadChar; 250 return kBadChar;
239 } 251 }
240 *cursor += 2; 252 } else if (length == 4) {
241 return ((str[0] << 6) + str[1]) - 0x00003080; 253 if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) {
242 } 254 // Overlong four-byte sequence.
243 if (length == 3) { 255 return kBadChar;
244 switch (str[0]) { 256 } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) {
245 case 0xE0: 257 // Code points outside of the unicode range.
246 // Overlong three-byte sequence.
247 if (str[1] < 0xA0 || str[1] > 0xBF) {
248 *cursor += 1;
249 return kBadChar;
250 }
251 break;
252 case 0xED:
253 // High and low surrogate halves.
254 if (str[1] < 0x80 || str[1] > 0x9F) {
255 *cursor += 1;
256 return kBadChar;
257 }
258 break;
259 default:
260 if (!IsContinuationCharacter(str[1])) {
261 *cursor += 1;
262 return kBadChar;
263 }
264 }
265 if (!IsContinuationCharacter(str[2])) {
266 *cursor += 1;
267 return kBadChar; 258 return kBadChar;
268 } 259 }
269 *cursor += 3;
270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
271 } 260 }
272 DCHECK(length == 4); 261
273 switch (str[0]) { 262 // All errors have been handled, so we only have to assemble the result.
274 case 0xF0: 263 switch (length) {
275 // Overlong four-byte sequence. 264 case 1:
276 if (str[1] < 0x90 || str[1] > 0xBF) { 265 return str[0];
277 *cursor += 1; 266 case 2:
278 return kBadChar; 267 return ((str[0] << 6) + str[1]) - 0x00003080;
279 } 268 case 3:
280 break; 269 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080;
281 case 0xF4: 270 case 4:
282 // Code points outside of the unicode range. 271 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
283 if (str[1] < 0x80 || str[1] > 0x8F) { 272 0x03C82080;
284 *cursor += 1;
285 return kBadChar;
286 }
287 break;
288 default:
289 if (!IsContinuationCharacter(str[1])) {
290 *cursor += 1;
291 return kBadChar;
292 }
293 } 273 }
294 if (!IsContinuationCharacter(str[2])) { 274
295 *cursor += 1; 275 UNREACHABLE();
296 return kBadChar; 276 return kBadChar;
297 }
298 if (!IsContinuationCharacter(str[3])) {
299 *cursor += 1;
300 return kBadChar;
301 }
302 *cursor += 4;
303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) -
304 0x03C82080;
305 } 277 }
306 278
307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { 279 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) {
308 DCHECK_NOT_NULL(buffer); 280 DCHECK_NOT_NULL(buffer);
309 281
310 // The common case: 1-byte Utf8 (and no incomplete char in the buffer) 282 // The common case: 1-byte Utf8 (and no incomplete char in the buffer)
311 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) { 283 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) {
312 return static_cast<uchar>(next); 284 return static_cast<uchar>(next);
313 } 285 }
314 286
315 if (*buffer == 0) { 287 if (*buffer == 0) {
316 // We're at the start of a new character. 288 // We're at the start of a new character.
317 uint32_t kind = NonASCIISequenceLength(next); 289 uint32_t kind = NonASCIISequenceLength(next);
318 if (kind >= 2 && kind <= 4) { 290 if (kind >= 2 && kind <= 4) {
319 // Start of 2..4 byte character, and no buffer. 291 // Start of 2..4 byte character, and no buffer.
320 292
321 // The mask for the lower bits depends on the kind, and is 293 // The mask for the lower bits depends on the kind, and is
322 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that 294 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that
323 // with one shift. 295 // with one shift.
324 uint8_t mask = 0x7f >> kind; 296 uint8_t mask = 0x7f >> kind;
325 297
326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value 298 // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes)
327 // in the bottom three. 299 // in 2nd nibble, and the value in the bottom three. The 2nd nibble is
328 *buffer = (kind - 1) << 24 | (next & mask); 300 // intended as a counter about how many bytes are still needed.
301 *buffer = kind << 28 | (kind - 1) << 24 | (next & mask);
329 return kIncomplete; 302 return kIncomplete;
330 } else { 303 } else {
331 // No buffer, and not the start of a 1-byte char (handled at the 304 // No buffer, and not the start of a 1-byte char (handled at the
332 // beginning), and not the start of a 2..4 byte char? Bad char. 305 // beginning), and not the start of a 2..4 byte char? Bad char.
333 *buffer = 0; 306 *buffer = 0;
334 return kBadChar; 307 return kBadChar;
335 } 308 }
336 } else if (*buffer <= 0xff) { 309 } else if (*buffer <= 0xff) {
337 // We have one unprocessed byte left (from the last else case in this if 310 // We have one unprocessed byte left (from the last else case in this if
338 // statement). 311 // statement).
339 uchar previous = *buffer; 312 uchar previous = *buffer;
340 *buffer = 0; 313 *buffer = 0;
341 uchar t = ValueOfIncremental(previous, buffer); 314 uchar t = ValueOfIncremental(previous, buffer);
342 if (t == kIncomplete) { 315 if (t == kIncomplete) {
343 // If we have an incomplete character, process both the previous and the 316 // If we have an incomplete character, process both the previous and the
344 // next byte at once. 317 // next byte at once.
345 return ValueOfIncremental(next, buffer); 318 return ValueOfIncremental(next, buffer);
346 } else { 319 } else {
347 // Otherwise, process the previous byte and save the next byte for next 320 // Otherwise, process the previous byte and save the next byte for next
348 // time. 321 // time.
349 DCHECK_EQ(0u, *buffer); 322 DCHECK_EQ(0u, *buffer);
350 *buffer = next; 323 *buffer = next;
351 return t; 324 return t;
352 } 325 }
353 } else if (IsContinuationCharacter(next)) { 326 } else if (IsContinuationCharacter(next)) {
354 // We're inside of a character, as described by buffer. 327 // We're inside of a character, as described by buffer.
355 328
356 // How many bytes (excluding this one) do we still expect? 329 // How many bytes (excluding this one) do we still expect?
357 uint8_t count = (*buffer >> 24) - 1; 330 uint8_t bytes_expected = *buffer >> 28;
331 uint8_t bytes_left = (*buffer >> 24) & 0x0f;
332 bytes_left--;
358 // Update the value. 333 // Update the value.
359 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F); 334 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F);
360 if (count) { 335 if (bytes_left) {
361 *buffer = count << 24 | value; 336 *buffer = (bytes_expected << 28 | bytes_left << 24 | value);
362 return kIncomplete; 337 return kIncomplete;
363 } else { 338 } else {
364 *buffer = 0; 339 *buffer = 0;
365 return value; 340 bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) ||
341 (bytes_expected == 3 && value < 0x800);
342 return sequence_was_too_long ? kBadChar : value;
366 } 343 }
367 } else { 344 } else {
368 // Within a character, but not a continuation character? Then the 345 // Within a character, but not a continuation character? Then the
369 // previous char was a bad char. But we need to save the current 346 // previous char was a bad char. But we need to save the current
370 // one. 347 // one.
371 *buffer = next; 348 *buffer = next;
372 return kBadChar; 349 return kBadChar;
373 } 350 }
374 } 351 }
375 352
(...skipping 3150 matching lines...) Expand 10 before | Expand all | Expand 10 after
3526 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3503 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3527 + 3504 +
3528 kCanonicalizationRangeMultiStrings1Size * 3505 kCanonicalizationRangeMultiStrings1Size *
3529 sizeof(MultiCharacterSpecialCase<1>) // NOLINT 3506 sizeof(MultiCharacterSpecialCase<1>) // NOLINT
3530 + 3507 +
3531 kCanonicalizationRangeMultiStrings7Size * 3508 kCanonicalizationRangeMultiStrings7Size *
3532 sizeof(MultiCharacterSpecialCase<1>); // NOLINT 3509 sizeof(MultiCharacterSpecialCase<1>); // NOLINT
3533 } 3510 }
3534 3511
3535 } // namespace unibrow 3512 } // namespace unibrow
OLDNEW
« no previous file with comments | « no previous file | src/unicode-decoder.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698