OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 // | 4 // |
5 // This file was generated at 2014-10-08 15:25:47.940335 | 5 // This file was generated at 2014-10-08 15:25:47.940335 |
6 | 6 |
7 #include "src/unicode.h" | 7 #include "src/unicode.h" |
8 #include "src/unicode-inl.h" | 8 #include "src/unicode-inl.h" |
9 #include <stdio.h> | 9 #include <stdio.h> |
10 #include <stdlib.h> | 10 #include <stdlib.h> |
(...skipping 210 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
221 | 221 |
222 | 222 |
223 static inline bool IsContinuationCharacter(byte chr) { | 223 static inline bool IsContinuationCharacter(byte chr) { |
224 return chr >= 0x80 && chr <= 0xBF; | 224 return chr >= 0x80 && chr <= 0xBF; |
225 } | 225 } |
226 | 226 |
227 | 227 |
228 // This method decodes an UTF-8 value according to RFC 3629. | 228 // This method decodes an UTF-8 value according to RFC 3629. |
229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { | 229 uchar Utf8::CalculateValue(const byte* str, size_t max_length, size_t* cursor) { |
230 size_t length = NonASCIISequenceLength(str[0]); | 230 size_t length = NonASCIISequenceLength(str[0]); |
231 if (length == 0 || max_length < length) { | 231 |
232 *cursor += 1; | 232 // Check continuation characters. |
| 233 size_t max_count = std::min(length, max_length); |
| 234 size_t count = 1; |
| 235 while (count < max_count && IsContinuationCharacter(str[count])) { |
| 236 count++; |
| 237 } |
| 238 |
| 239 // Check overly long sequences & other conditions. Use length as error |
| 240 // indicator. |
| 241 if (length == 3) { |
| 242 if (str[0] == 0xE0 && (str[1] < 0xA0 || str[1] > 0xBF)) { |
| 243 // Overlong three-byte sequence? |
| 244 length = 0; |
| 245 } else if (str[0] == 0xED && (str[1] < 0x80 || str[1] > 0x9F)) { |
| 246 // High and low surrogate halves? |
| 247 length = 0; |
| 248 } |
| 249 } else if (length == 4) { |
| 250 if (str[0] == 0xF0 && (str[1] < 0x90 || str[1] > 0xBF)) { |
| 251 // Overlong four-byte sequence. |
| 252 length = 0; |
| 253 } else if (str[0] == 0xF4 && (str[1] < 0x80 || str[1] > 0x8F)) { |
| 254 // Code points outside of the unicode range. |
| 255 length = 0; |
| 256 } |
| 257 } |
| 258 |
| 259 if (count != length) { |
| 260 // All invalid encodings should land here. |
| 261 *cursor += count; |
233 return kBadChar; | 262 return kBadChar; |
234 } | 263 } |
235 if (length == 2) { | 264 |
236 if (!IsContinuationCharacter(str[1])) { | 265 // All errors have been handled, so we only have to assemble the result. |
237 *cursor += 1; | 266 *cursor += length; |
238 return kBadChar; | 267 switch (length) { |
239 } | 268 case 1: |
240 *cursor += 2; | 269 return str[0]; |
241 return ((str[0] << 6) + str[1]) - 0x00003080; | 270 case 2: |
| 271 return ((str[0] << 6) + str[1]) - 0x00003080; |
| 272 case 3: |
| 273 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; |
| 274 case 4: |
| 275 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - |
| 276 0x03C82080; |
242 } | 277 } |
243 if (length == 3) { | 278 |
244 switch (str[0]) { | 279 UNREACHABLE(); |
245 case 0xE0: | 280 return kBadChar; |
246 // Overlong three-byte sequence. | |
247 if (str[1] < 0xA0 || str[1] > 0xBF) { | |
248 *cursor += 1; | |
249 return kBadChar; | |
250 } | |
251 break; | |
252 case 0xED: | |
253 // High and low surrogate halves. | |
254 if (str[1] < 0x80 || str[1] > 0x9F) { | |
255 *cursor += 1; | |
256 return kBadChar; | |
257 } | |
258 break; | |
259 default: | |
260 if (!IsContinuationCharacter(str[1])) { | |
261 *cursor += 1; | |
262 return kBadChar; | |
263 } | |
264 } | |
265 if (!IsContinuationCharacter(str[2])) { | |
266 *cursor += 1; | |
267 return kBadChar; | |
268 } | |
269 *cursor += 3; | |
270 return ((str[0] << 12) + (str[1] << 6) + str[2]) - 0x000E2080; | |
271 } | |
272 DCHECK(length == 4); | |
273 switch (str[0]) { | |
274 case 0xF0: | |
275 // Overlong four-byte sequence. | |
276 if (str[1] < 0x90 || str[1] > 0xBF) { | |
277 *cursor += 1; | |
278 return kBadChar; | |
279 } | |
280 break; | |
281 case 0xF4: | |
282 // Code points outside of the unicode range. | |
283 if (str[1] < 0x80 || str[1] > 0x8F) { | |
284 *cursor += 1; | |
285 return kBadChar; | |
286 } | |
287 break; | |
288 default: | |
289 if (!IsContinuationCharacter(str[1])) { | |
290 *cursor += 1; | |
291 return kBadChar; | |
292 } | |
293 } | |
294 if (!IsContinuationCharacter(str[2])) { | |
295 *cursor += 1; | |
296 return kBadChar; | |
297 } | |
298 if (!IsContinuationCharacter(str[3])) { | |
299 *cursor += 1; | |
300 return kBadChar; | |
301 } | |
302 *cursor += 4; | |
303 return ((str[0] << 18) + (str[1] << 12) + (str[2] << 6) + str[3]) - | |
304 0x03C82080; | |
305 } | 281 } |
306 | 282 |
307 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { | 283 uchar Utf8::ValueOfIncremental(byte next, Utf8IncrementalBuffer* buffer) { |
308 DCHECK_NOT_NULL(buffer); | 284 DCHECK_NOT_NULL(buffer); |
309 | 285 |
310 // The common case: 1-byte Utf8 (and no incomplete char in the buffer) | 286 // The common case: 1-byte Utf8 (and no incomplete char in the buffer) |
311 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) { | 287 if (V8_LIKELY(next <= kMaxOneByteChar && *buffer == 0)) { |
312 return static_cast<uchar>(next); | 288 return static_cast<uchar>(next); |
313 } | 289 } |
314 | 290 |
315 if (*buffer == 0) { | 291 if (*buffer == 0) { |
316 // We're at the start of a new character. | 292 // We're at the start of a new character. |
317 uint32_t kind = NonASCIISequenceLength(next); | 293 uint32_t kind = NonASCIISequenceLength(next); |
318 if (kind >= 2 && kind <= 4) { | 294 if (kind >= 2 && kind <= 4) { |
319 // Start of 2..4 byte character, and no buffer. | 295 // Start of 2..4 byte character, and no buffer. |
320 | 296 |
321 // The mask for the lower bits depends on the kind, and is | 297 // The mask for the lower bits depends on the kind, and is |
322 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that | 298 // 0x1F, 0x0F, 0x07 for kinds 2, 3, 4 respectively. We can get that |
323 // with one shift. | 299 // with one shift. |
324 uint8_t mask = 0x7f >> kind; | 300 uint8_t mask = 0x7f >> kind; |
325 | 301 |
326 // Store the kind - 1 (i.e., remaining bytes) in the top byte, value | 302 // Store the kind in the top nibble, and kind - 1 (i.e., remaining bytes) |
327 // in the bottom three. | 303 // in 2nd nibble, and the value in the bottom three. The 2nd nibble is |
328 *buffer = (kind - 1) << 24 | (next & mask); | 304 // intended as a counter about how many bytes are still needed. |
| 305 *buffer = kind << 28 | (kind - 1) << 24 | (next & mask); |
329 return kIncomplete; | 306 return kIncomplete; |
330 } else { | 307 } else { |
331 // No buffer, and not the start of a 1-byte char (handled at the | 308 // No buffer, and not the start of a 1-byte char (handled at the |
332 // beginning), and not the start of a 2..4 byte char? Bad char. | 309 // beginning), and not the start of a 2..4 byte char? Bad char. |
333 *buffer = 0; | 310 *buffer = 0; |
334 return kBadChar; | 311 return kBadChar; |
335 } | 312 } |
336 } else if (*buffer <= 0xff) { | 313 } else if (*buffer <= 0xff) { |
337 // We have one unprocessed byte left (from the last else case in this if | 314 // We have one unprocessed byte left (from the last else case in this if |
338 // statement). | 315 // statement). |
339 uchar previous = *buffer; | 316 uchar previous = *buffer; |
340 *buffer = 0; | 317 *buffer = 0; |
341 uchar t = ValueOfIncremental(previous, buffer); | 318 uchar t = ValueOfIncremental(previous, buffer); |
342 if (t == kIncomplete) { | 319 if (t == kIncomplete) { |
343 // If we have an incomplete character, process both the previous and the | 320 // If we have an incomplete character, process both the previous and the |
344 // next byte at once. | 321 // next byte at once. |
345 return ValueOfIncremental(next, buffer); | 322 return ValueOfIncremental(next, buffer); |
346 } else { | 323 } else { |
347 // Otherwise, process the previous byte and save the next byte for next | 324 // Otherwise, process the previous byte and save the next byte for next |
348 // time. | 325 // time. |
349 DCHECK_EQ(0, *buffer); | 326 DCHECK_EQ(0, *buffer); |
350 *buffer = next; | 327 *buffer = next; |
351 return t; | 328 return t; |
352 } | 329 } |
353 } else if (IsContinuationCharacter(next)) { | 330 } else if (IsContinuationCharacter(next)) { |
354 // We're inside of a character, as described by buffer. | 331 // We're inside of a character, as described by buffer. |
355 | 332 |
356 // How many bytes (excluding this one) do we still expect? | 333 // How many bytes (excluding this one) do we still expect? |
357 uint8_t count = (*buffer >> 24) - 1; | 334 uint8_t bytes_expected = *buffer >> 28; |
| 335 uint8_t bytes_left = (*buffer >> 24) & 0x0f; |
| 336 bytes_left--; |
358 // Update the value. | 337 // Update the value. |
359 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F); | 338 uint32_t value = ((*buffer & 0xffffff) << 6) | (next & 0x3F); |
360 if (count) { | 339 if (bytes_left) { |
361 *buffer = count << 24 | value; | 340 *buffer = (bytes_expected << 28 | bytes_left << 24 | value); |
362 return kIncomplete; | 341 return kIncomplete; |
363 } else { | 342 } else { |
364 *buffer = 0; | 343 *buffer = 0; |
365 return value; | 344 bool sequence_was_too_long = (bytes_expected == 2 && value < 0x80) || |
| 345 (bytes_expected == 3 && value < 0x800); |
| 346 return sequence_was_too_long ? kBadChar : value; |
366 } | 347 } |
367 } else { | 348 } else { |
368 // Within a character, but not a continuation character? Then the | 349 // Within a character, but not a continuation character? Then the |
369 // previous char was a bad char. But we need to save the current | 350 // previous char was a bad char. But we need to save the current |
370 // one. | 351 // one. |
371 *buffer = next; | 352 *buffer = next; |
372 return kBadChar; | 353 return kBadChar; |
373 } | 354 } |
374 } | 355 } |
375 | 356 |
(...skipping 3150 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3526 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3507 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3527 + | 3508 + |
3528 kCanonicalizationRangeMultiStrings1Size * | 3509 kCanonicalizationRangeMultiStrings1Size * |
3529 sizeof(MultiCharacterSpecialCase<1>) // NOLINT | 3510 sizeof(MultiCharacterSpecialCase<1>) // NOLINT |
3530 + | 3511 + |
3531 kCanonicalizationRangeMultiStrings7Size * | 3512 kCanonicalizationRangeMultiStrings7Size * |
3532 sizeof(MultiCharacterSpecialCase<1>); // NOLINT | 3513 sizeof(MultiCharacterSpecialCase<1>); // NOLINT |
3533 } | 3514 } |
3534 | 3515 |
3535 } // namespace unibrow | 3516 } // namespace unibrow |
OLD | NEW |