OLD | NEW |
---|---|
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
4 | 4 |
5 part of dart.convert; | 5 part of dart.convert; |
6 | 6 |
7 /** The Unicode Replacement character `U+FFFD` (�). */ | 7 /** The Unicode Replacement character `U+FFFD` (�). */ |
8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; | 8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; |
9 | 9 |
10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ | 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ |
(...skipping 402 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
413 _value = 0; | 413 _value = 0; |
414 _expectedUnits = 0; | 414 _expectedUnits = 0; |
415 _extraUnits = 0; | 415 _extraUnits = 0; |
416 } | 416 } |
417 } | 417 } |
418 | 418 |
419 void convert(List<int> codeUnits, int startIndex, int endIndex) { | 419 void convert(List<int> codeUnits, int startIndex, int endIndex) { |
420 int value = _value; | 420 int value = _value; |
421 int expectedUnits = _expectedUnits; | 421 int expectedUnits = _expectedUnits; |
422 int extraUnits = _extraUnits; | 422 int extraUnits = _extraUnits; |
423 int singleBytesCount = 0; | |
424 _value = 0; | 423 _value = 0; |
425 _expectedUnits = 0; | 424 _expectedUnits = 0; |
426 _extraUnits = 0; | 425 _extraUnits = 0; |
427 | 426 |
427 int scanOneByteCharacters(units, int from) { | |
428 final to = endIndex; | |
429 final mask = ~_ONE_BYTE_LIMIT; | |
430 for (var i = from; i < to; i++) { | |
431 if ((units[i] & mask) != 0) return i - from; | |
432 } | |
433 return to - from; | |
434 } | |
435 | |
428 void addSingleBytes(int from, int to) { | 436 void addSingleBytes(int from, int to) { |
429 assert(singleBytesCount > 0); | |
430 assert(from >= startIndex && from <= endIndex); | 437 assert(from >= startIndex && from <= endIndex); |
431 assert(to >= startIndex && to <= endIndex); | 438 assert(to >= startIndex && to <= endIndex); |
432 if (from == 0 && to == codeUnits.length) { | 439 if (from == 0 && to == codeUnits.length) { |
433 _stringSink.write(new String.fromCharCodes(codeUnits)); | 440 var str = new String.fromCharCodes(codeUnits); |
Florian Schneider
2014/07/31 11:19:49
Why this change?
Anders Johnsen
2014/07/31 12:23:03
Ah, was for debugging. Reverted.
| |
441 _stringSink.write(str); | |
434 } else { | 442 } else { |
435 _stringSink.write( | 443 _stringSink.write( |
436 new String.fromCharCodes(codeUnits.sublist(from, to))); | 444 new String.fromCharCodes(codeUnits.sublist(from, to))); |
437 } | 445 } |
438 singleBytesCount = 0; | |
439 } | 446 } |
440 | 447 |
441 int i = startIndex; | 448 int i = startIndex; |
442 loop: while (true) { | 449 loop: while (true) { |
443 multibyte: if (expectedUnits > 0) { | 450 multibyte: if (expectedUnits > 0) { |
444 do { | 451 do { |
445 if (i == endIndex) { | 452 if (i == endIndex) { |
446 break loop; | 453 break loop; |
447 } | 454 } |
448 int unit = codeUnits[i]; | 455 int unit = codeUnits[i]; |
(...skipping 29 matching lines...) Expand all Loading... | |
478 } | 485 } |
479 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 486 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
480 } | 487 } |
481 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { | 488 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { |
482 _stringSink.writeCharCode(value); | 489 _stringSink.writeCharCode(value); |
483 } | 490 } |
484 _isFirstCharacter = false; | 491 _isFirstCharacter = false; |
485 } | 492 } |
486 | 493 |
487 while (i < endIndex) { | 494 while (i < endIndex) { |
495 int oneBytes = scanOneByteCharacters(codeUnits, i); | |
496 if (oneBytes > 0) { | |
497 _isFirstCharacter = false; | |
498 addSingleBytes(i, i + oneBytes); | |
499 i += oneBytes; | |
500 if (i == endIndex) break; | |
501 } | |
488 int unit = codeUnits[i++]; | 502 int unit = codeUnits[i++]; |
489 // TODO(floitsch): the way we test we could potentially allow | 503 // TODO(floitsch): the way we test we could potentially allow |
490 // units that are too large, if they happen to have the | 504 // units that are too large, if they happen to have the |
491 // right bit-pattern. (Same is true for the multibyte loop above). | 505 // right bit-pattern. (Same is true for the multibyte loop above). |
492 // TODO(floitsch): optimize this loop. See: | 506 // TODO(floitsch): optimize this loop. See: |
493 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 | 507 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 |
494 if (unit < 0) { | 508 if (unit < 0) { |
495 // TODO(floitsch): should this be unit <= 0 ? | 509 // TODO(floitsch): should this be unit <= 0 ? |
496 if (singleBytesCount > 0) { | |
497 int to = i - 1; | |
498 addSingleBytes(to - singleBytesCount, to); | |
499 } | |
500 if (!_allowMalformed) { | 510 if (!_allowMalformed) { |
501 throw new FormatException( | 511 throw new FormatException( |
502 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); | 512 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); |
503 } | 513 } |
504 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | 514 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
505 } else if (unit <= _ONE_BYTE_LIMIT) { | |
506 _isFirstCharacter = false; | |
507 singleBytesCount++; | |
508 } else { | 515 } else { |
509 if (singleBytesCount > 0) { | 516 assert(unit > _ONE_BYTE_LIMIT); |
510 int to = i - 1; | |
511 addSingleBytes(to - singleBytesCount, to); | |
512 } | |
513 if ((unit & 0xE0) == 0xC0) { | 517 if ((unit & 0xE0) == 0xC0) { |
514 value = unit & 0x1F; | 518 value = unit & 0x1F; |
515 expectedUnits = extraUnits = 1; | 519 expectedUnits = extraUnits = 1; |
516 continue loop; | 520 continue loop; |
517 } | 521 } |
518 if ((unit & 0xF0) == 0xE0) { | 522 if ((unit & 0xF0) == 0xE0) { |
519 value = unit & 0x0F; | 523 value = unit & 0x0F; |
520 expectedUnits = extraUnits = 2; | 524 expectedUnits = extraUnits = 2; |
521 continue loop; | 525 continue loop; |
522 } | 526 } |
523 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. | 527 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
524 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { | 528 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
525 value = unit & 0x07; | 529 value = unit & 0x07; |
526 expectedUnits = extraUnits = 3; | 530 expectedUnits = extraUnits = 3; |
527 continue loop; | 531 continue loop; |
528 } | 532 } |
529 if (!_allowMalformed) { | 533 if (!_allowMalformed) { |
530 throw new FormatException( | 534 throw new FormatException( |
531 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | 535 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
532 } | 536 } |
533 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 537 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
534 expectedUnits = extraUnits = 0; | 538 expectedUnits = extraUnits = 0; |
535 _isFirstCharacter = false; | 539 _isFirstCharacter = false; |
536 _stringSink.writeCharCode(value); | 540 _stringSink.writeCharCode(value); |
537 } | 541 } |
538 } | 542 } |
539 break loop; | 543 break loop; |
540 } | 544 } |
541 if (singleBytesCount > 0) { | |
542 addSingleBytes(i - singleBytesCount, endIndex); | |
543 } | |
544 if (expectedUnits > 0) { | 545 if (expectedUnits > 0) { |
545 _value = value; | 546 _value = value; |
546 _expectedUnits = expectedUnits; | 547 _expectedUnits = expectedUnits; |
547 _extraUnits = extraUnits; | 548 _extraUnits = extraUnits; |
548 } | 549 } |
549 } | 550 } |
550 } | 551 } |
OLD | NEW |