Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(321)

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 435553002: Improve utf8 decoding of single-char bytes, by isolating the loop. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 part of dart.convert; 5 part of dart.convert;
6 6
7 /** The Unicode Replacement character `U+FFFD` (�). */ 7 /** The Unicode Replacement character `U+FFFD` (�). */
8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; 8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;
9 9
10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */
(...skipping 402 matching lines...) Expand 10 before | Expand all | Expand 10 after
413 _value = 0; 413 _value = 0;
414 _expectedUnits = 0; 414 _expectedUnits = 0;
415 _extraUnits = 0; 415 _extraUnits = 0;
416 } 416 }
417 } 417 }
418 418
419 void convert(List<int> codeUnits, int startIndex, int endIndex) { 419 void convert(List<int> codeUnits, int startIndex, int endIndex) {
420 int value = _value; 420 int value = _value;
421 int expectedUnits = _expectedUnits; 421 int expectedUnits = _expectedUnits;
422 int extraUnits = _extraUnits; 422 int extraUnits = _extraUnits;
423 int singleBytesCount = 0;
424 _value = 0; 423 _value = 0;
425 _expectedUnits = 0; 424 _expectedUnits = 0;
426 _extraUnits = 0; 425 _extraUnits = 0;
427 426
427 int scanOneByteCharacters(units, int from) {
428 final to = endIndex;
429 final mask = ~_ONE_BYTE_LIMIT;
430 for (var i = from; i < to; i++) {
431 if ((units[i] & mask) != 0) return i - from;
432 }
433 return to - from;
434 }
435
428 void addSingleBytes(int from, int to) { 436 void addSingleBytes(int from, int to) {
429 assert(singleBytesCount > 0);
430 assert(from >= startIndex && from <= endIndex); 437 assert(from >= startIndex && from <= endIndex);
431 assert(to >= startIndex && to <= endIndex); 438 assert(to >= startIndex && to <= endIndex);
432 if (from == 0 && to == codeUnits.length) { 439 if (from == 0 && to == codeUnits.length) {
433 _stringSink.write(new String.fromCharCodes(codeUnits)); 440 var str = new String.fromCharCodes(codeUnits);
Florian Schneider 2014/07/31 11:19:49 Why this change?
Anders Johnsen 2014/07/31 12:23:03 Ah, was for debugging. Reverted.
441 _stringSink.write(str);
434 } else { 442 } else {
435 _stringSink.write( 443 _stringSink.write(
436 new String.fromCharCodes(codeUnits.sublist(from, to))); 444 new String.fromCharCodes(codeUnits.sublist(from, to)));
437 } 445 }
438 singleBytesCount = 0;
439 } 446 }
440 447
441 int i = startIndex; 448 int i = startIndex;
442 loop: while (true) { 449 loop: while (true) {
443 multibyte: if (expectedUnits > 0) { 450 multibyte: if (expectedUnits > 0) {
444 do { 451 do {
445 if (i == endIndex) { 452 if (i == endIndex) {
446 break loop; 453 break loop;
447 } 454 }
448 int unit = codeUnits[i]; 455 int unit = codeUnits[i];
(...skipping 29 matching lines...) Expand all
478 } 485 }
479 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; 486 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
480 } 487 }
481 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { 488 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {
482 _stringSink.writeCharCode(value); 489 _stringSink.writeCharCode(value);
483 } 490 }
484 _isFirstCharacter = false; 491 _isFirstCharacter = false;
485 } 492 }
486 493
487 while (i < endIndex) { 494 while (i < endIndex) {
495 int oneBytes = scanOneByteCharacters(codeUnits, i);
496 if (oneBytes > 0) {
497 _isFirstCharacter = false;
498 addSingleBytes(i, i + oneBytes);
499 i += oneBytes;
500 if (i == endIndex) break;
501 }
488 int unit = codeUnits[i++]; 502 int unit = codeUnits[i++];
489 // TODO(floitsch): the way we test we could potentially allow 503 // TODO(floitsch): the way we test we could potentially allow
490 // units that are too large, if they happen to have the 504 // units that are too large, if they happen to have the
491 // right bit-pattern. (Same is true for the multibyte loop above). 505 // right bit-pattern. (Same is true for the multibyte loop above).
492 // TODO(floitsch): optimize this loop. See: 506 // TODO(floitsch): optimize this loop. See:
493 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 507 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80
494 if (unit < 0) { 508 if (unit < 0) {
495 // TODO(floitsch): should this be unit <= 0 ? 509 // TODO(floitsch): should this be unit <= 0 ?
496 if (singleBytesCount > 0) {
497 int to = i - 1;
498 addSingleBytes(to - singleBytesCount, to);
499 }
500 if (!_allowMalformed) { 510 if (!_allowMalformed) {
501 throw new FormatException( 511 throw new FormatException(
502 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); 512 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");
503 } 513 }
504 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); 514 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);
505 } else if (unit <= _ONE_BYTE_LIMIT) {
506 _isFirstCharacter = false;
507 singleBytesCount++;
508 } else { 515 } else {
509 if (singleBytesCount > 0) { 516 assert(unit > _ONE_BYTE_LIMIT);
510 int to = i - 1;
511 addSingleBytes(to - singleBytesCount, to);
512 }
513 if ((unit & 0xE0) == 0xC0) { 517 if ((unit & 0xE0) == 0xC0) {
514 value = unit & 0x1F; 518 value = unit & 0x1F;
515 expectedUnits = extraUnits = 1; 519 expectedUnits = extraUnits = 1;
516 continue loop; 520 continue loop;
517 } 521 }
518 if ((unit & 0xF0) == 0xE0) { 522 if ((unit & 0xF0) == 0xE0) {
519 value = unit & 0x0F; 523 value = unit & 0x0F;
520 expectedUnits = extraUnits = 2; 524 expectedUnits = extraUnits = 2;
521 continue loop; 525 continue loop;
522 } 526 }
523 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. 527 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.
524 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { 528 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {
525 value = unit & 0x07; 529 value = unit & 0x07;
526 expectedUnits = extraUnits = 3; 530 expectedUnits = extraUnits = 3;
527 continue loop; 531 continue loop;
528 } 532 }
529 if (!_allowMalformed) { 533 if (!_allowMalformed) {
530 throw new FormatException( 534 throw new FormatException(
531 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); 535 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");
532 } 536 }
533 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; 537 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;
534 expectedUnits = extraUnits = 0; 538 expectedUnits = extraUnits = 0;
535 _isFirstCharacter = false; 539 _isFirstCharacter = false;
536 _stringSink.writeCharCode(value); 540 _stringSink.writeCharCode(value);
537 } 541 }
538 } 542 }
539 break loop; 543 break loop;
540 } 544 }
541 if (singleBytesCount > 0) {
542 addSingleBytes(i - singleBytesCount, endIndex);
543 }
544 if (expectedUnits > 0) { 545 if (expectedUnits > 0) {
545 _value = value; 546 _value = value;
546 _expectedUnits = expectedUnits; 547 _expectedUnits = expectedUnits;
547 _extraUnits = extraUnits; 548 _extraUnits = extraUnits;
548 } 549 }
549 } 550 }
550 } 551 }
OLD | NEW
« no previous file with comments | « no previous file | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698