sdk/lib/convert/utf.dart - Issue 435553002: Improve utf8 decoding of single-char bytes, by isolating the loop.

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 435553002: Improve utf8 decoding of single-char bytes, by isolating the loop. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of dart.convert;	5 part of dart.convert;

6	6

7 /** The Unicode Replacement character `U+FFFD` (�). */	7 /** The Unicode Replacement character `U+FFFD` (�). */

8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;	8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

9	9

10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */	10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */

(...skipping 402 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
413 _value = 0;	413 _value = 0;

414 _expectedUnits = 0;	414 _expectedUnits = 0;

415 _extraUnits = 0;	415 _extraUnits = 0;

416 }	416 }

417 }	417 }

418	418

419 void convert(List<int> codeUnits, int startIndex, int endIndex) {	419 void convert(List<int> codeUnits, int startIndex, int endIndex) {

420 int value = _value;	420 int value = _value;

421 int expectedUnits = _expectedUnits;	421 int expectedUnits = _expectedUnits;

422 int extraUnits = _extraUnits;	422 int extraUnits = _extraUnits;

423 int singleBytesCount = 0;

424 _value = 0;	423 _value = 0;

425 _expectedUnits = 0;	424 _expectedUnits = 0;

426 _extraUnits = 0;	425 _extraUnits = 0;

427	426

	427 int scanOneByteCharacters(units, int from) {

	428 final to = endIndex;

	429 final mask = ~_ONE_BYTE_LIMIT;

	430 for (var i = from; i < to; i++) {

	431 if ((units[i] & mask) != 0) return i - from;

	432 }

	433 return to - from;

	434 }

	435

428 void addSingleBytes(int from, int to) {	436 void addSingleBytes(int from, int to) {

429 assert(singleBytesCount > 0);

430 assert(from >= startIndex && from <= endIndex);	437 assert(from >= startIndex && from <= endIndex);

431 assert(to >= startIndex && to <= endIndex);	438 assert(to >= startIndex && to <= endIndex);

432 if (from == 0 && to == codeUnits.length) {	439 if (from == 0 && to == codeUnits.length) {

433 _stringSink.write(new String.fromCharCodes(codeUnits));	440 var str = new String.fromCharCodes(codeUnits);
	Florian Schneider 2014/07/31 11:19:49: Why this change?
	Anders Johnsen 2014/07/31 12:23:03 (in reply to Florian Schneider, 2014/07/31 11:19:49): Ah, was for debugging. Reverted.
	441 _stringSink.write(str);

434 } else {	442 } else {

435 _stringSink.write(	443 _stringSink.write(

436 new String.fromCharCodes(codeUnits.sublist(from, to)));	444 new String.fromCharCodes(codeUnits.sublist(from, to)));

437 }	445 }

438 singleBytesCount = 0;

439 }	446 }

440	447

441 int i = startIndex;	448 int i = startIndex;

442 loop: while (true) {	449 loop: while (true) {

443 multibyte: if (expectedUnits > 0) {	450 multibyte: if (expectedUnits > 0) {

444 do {	451 do {

445 if (i == endIndex) {	452 if (i == endIndex) {

446 break loop;	453 break loop;

447 }	454 }

448 int unit = codeUnits[i];	455 int unit = codeUnits[i];

(...skipping 29 matching lines...) Expand all Loading...
478 }	485 }

479 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;	486 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

480 }	487 }

481 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {	488 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {

482 _stringSink.writeCharCode(value);	489 _stringSink.writeCharCode(value);

483 }	490 }

484 _isFirstCharacter = false;	491 _isFirstCharacter = false;

485 }	492 }

486	493

487 while (i < endIndex) {	494 while (i < endIndex) {

	495 int oneBytes = scanOneByteCharacters(codeUnits, i);

	496 if (oneBytes > 0) {

	497 _isFirstCharacter = false;

	498 addSingleBytes(i, i + oneBytes);

	499 i += oneBytes;

	500 if (i == endIndex) break;

	501 }

488 int unit = codeUnits[i++];	502 int unit = codeUnits[i++];

489 // TODO(floitsch): the way we test we could potentially allow	503 // TODO(floitsch): the way we test we could potentially allow

490 // units that are too large, if they happen to have the	504 // units that are too large, if they happen to have the

491 // right bit-pattern. (Same is true for the multibyte loop above).	505 // right bit-pattern. (Same is true for the multibyte loop above).

492 // TODO(floitsch): optimize this loop. See:	506 // TODO(floitsch): optimize this loop. See:

493 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80	507 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80

494 if (unit < 0) {	508 if (unit < 0) {

495 // TODO(floitsch): should this be unit <= 0 ?	509 // TODO(floitsch): should this be unit <= 0 ?

496 if (singleBytesCount > 0) {

497 int to = i - 1;

498 addSingleBytes(to - singleBytesCount, to);

499 }

500 if (!_allowMalformed) {	510 if (!_allowMalformed) {

501 throw new FormatException(	511 throw new FormatException(

502 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");	512 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");

503 }	513 }

504 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);	514 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);

505 } else if (unit <= _ONE_BYTE_LIMIT) {

506 _isFirstCharacter = false;

507 singleBytesCount++;

508 } else {	515 } else {

509 if (singleBytesCount > 0) {	516 assert(unit > _ONE_BYTE_LIMIT);

510 int to = i - 1;

511 addSingleBytes(to - singleBytesCount, to);

512 }

513 if ((unit & 0xE0) == 0xC0) {	517 if ((unit & 0xE0) == 0xC0) {

514 value = unit & 0x1F;	518 value = unit & 0x1F;

515 expectedUnits = extraUnits = 1;	519 expectedUnits = extraUnits = 1;

516 continue loop;	520 continue loop;

517 }	521 }

518 if ((unit & 0xF0) == 0xE0) {	522 if ((unit & 0xF0) == 0xE0) {

519 value = unit & 0x0F;	523 value = unit & 0x0F;

520 expectedUnits = extraUnits = 2;	524 expectedUnits = extraUnits = 2;

521 continue loop;	525 continue loop;

522 }	526 }

523 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.	527 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.

524 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {	528 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {

525 value = unit & 0x07;	529 value = unit & 0x07;

526 expectedUnits = extraUnits = 3;	530 expectedUnits = extraUnits = 3;

527 continue loop;	531 continue loop;

528 }	532 }

529 if (!_allowMalformed) {	533 if (!_allowMalformed) {

530 throw new FormatException(	534 throw new FormatException(

531 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");	535 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

532 }	536 }

533 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;	537 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

534 expectedUnits = extraUnits = 0;	538 expectedUnits = extraUnits = 0;

535 _isFirstCharacter = false;	539 _isFirstCharacter = false;

536 _stringSink.writeCharCode(value);	540 _stringSink.writeCharCode(value);

537 }	541 }

538 }	542 }

539 break loop;	543 break loop;

540 }	544 }

541 if (singleBytesCount > 0) {

542 addSingleBytes(i - singleBytesCount, endIndex);

543 }

544 if (expectedUnits > 0) {	545 if (expectedUnits > 0) {

545 _value = value;	546 _value = value;

546 _expectedUnits = expectedUnits;	547 _expectedUnits = expectedUnits;

547 _extraUnits = extraUnits;	548 _extraUnits = extraUnits;

548 }	549 }

549 }	550 }

550 }	551 }

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »