sdk/lib/convert/utf.dart - Issue 435553002: Improve utf8 decoding of single-char bytes, by isolating the loop.

Side by Side Diff: sdk/lib/convert/utf.dart

Issue 435553002: Improve utf8 decoding of single-char bytes, by isolating the loop. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Revert debug change. Created 6 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of dart.convert;	5 part of dart.convert;

6	6

7 /** The Unicode Replacement character `U+FFFD` (�). */	7 /** The Unicode Replacement character `U+FFFD` (�). */

8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;	8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD;

9	9

10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */	10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */

(...skipping 402 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
413 _value = 0;	413 _value = 0;

414 _expectedUnits = 0;	414 _expectedUnits = 0;

415 _extraUnits = 0;	415 _extraUnits = 0;

416 }	416 }

417 }	417 }

418	418

419 void convert(List<int> codeUnits, int startIndex, int endIndex) {	419 void convert(List<int> codeUnits, int startIndex, int endIndex) {

420 int value = _value;	420 int value = _value;

421 int expectedUnits = _expectedUnits;	421 int expectedUnits = _expectedUnits;

422 int extraUnits = _extraUnits;	422 int extraUnits = _extraUnits;

423 int singleBytesCount = 0;

424 _value = 0;	423 _value = 0;

425 _expectedUnits = 0;	424 _expectedUnits = 0;

426 _extraUnits = 0;	425 _extraUnits = 0;

427	426

	427 int scanOneByteCharacters(units, int from) {

	428 final to = endIndex;

	429 final mask = ~_ONE_BYTE_LIMIT;

	430 for (var i = from; i < to; i++) {

	431 if ((units[i] & mask) != 0) return i - from;

	432 }

	433 return to - from;

	434 }

	435

428 void addSingleBytes(int from, int to) {	436 void addSingleBytes(int from, int to) {

429 assert(singleBytesCount > 0);

430 assert(from >= startIndex && from <= endIndex);	437 assert(from >= startIndex && from <= endIndex);

431 assert(to >= startIndex && to <= endIndex);	438 assert(to >= startIndex && to <= endIndex);

432 if (from == 0 && to == codeUnits.length) {	439 if (from == 0 && to == codeUnits.length) {

433 _stringSink.write(new String.fromCharCodes(codeUnits));	440 _stringSink.write(new String.fromCharCodes(codeUnits));

434 } else {	441 } else {

435 _stringSink.write(	442 _stringSink.write(

436 new String.fromCharCodes(codeUnits.sublist(from, to)));	443 new String.fromCharCodes(codeUnits.sublist(from, to)));

437 }	444 }

438 singleBytesCount = 0;

439 }	445 }

440	446

441 int i = startIndex;	447 int i = startIndex;

442 loop: while (true) {	448 loop: while (true) {

443 multibyte: if (expectedUnits > 0) {	449 multibyte: if (expectedUnits > 0) {

444 do {	450 do {

445 if (i == endIndex) {	451 if (i == endIndex) {

446 break loop;	452 break loop;

447 }	453 }

448 int unit = codeUnits[i];	454 int unit = codeUnits[i];

(...skipping 29 matching lines...) Expand all Loading...
478 }	484 }

479 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;	485 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

480 }	486 }

481 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {	487 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) {

482 _stringSink.writeCharCode(value);	488 _stringSink.writeCharCode(value);

483 }	489 }

484 _isFirstCharacter = false;	490 _isFirstCharacter = false;

485 }	491 }

486	492

487 while (i < endIndex) {	493 while (i < endIndex) {

	494 int oneBytes = scanOneByteCharacters(codeUnits, i);

	495 if (oneBytes > 0) {

	496 _isFirstCharacter = false;

	497 addSingleBytes(i, i + oneBytes);

	498 i += oneBytes;

	499 if (i == endIndex) break;

	500 }

488 int unit = codeUnits[i++];	501 int unit = codeUnits[i++];

489 // TODO(floitsch): the way we test we could potentially allow	502 // TODO(floitsch): the way we test we could potentially allow

490 // units that are too large, if they happen to have the	503 // units that are too large, if they happen to have the

491 // right bit-pattern. (Same is true for the multibyte loop above).	504 // right bit-pattern. (Same is true for the multibyte loop above).

492 // TODO(floitsch): optimize this loop. See:	505 // TODO(floitsch): optimize this loop. See:

493 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80	506 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80

494 if (unit < 0) {	507 if (unit < 0) {

495 // TODO(floitsch): should this be unit <= 0 ?	508 // TODO(floitsch): should this be unit <= 0 ?

496 if (singleBytesCount > 0) {

497 int to = i - 1;

498 addSingleBytes(to - singleBytesCount, to);

499 }

500 if (!_allowMalformed) {	509 if (!_allowMalformed) {

501 throw new FormatException(	510 throw new FormatException(

502 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");	511 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}");

503 }	512 }

504 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);	513 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE);

505 } else if (unit <= _ONE_BYTE_LIMIT) {

506 _isFirstCharacter = false;

507 singleBytesCount++;

508 } else {	514 } else {

509 if (singleBytesCount > 0) {	515 assert(unit > _ONE_BYTE_LIMIT);

510 int to = i - 1;

511 addSingleBytes(to - singleBytesCount, to);

512 }

513 if ((unit & 0xE0) == 0xC0) {	516 if ((unit & 0xE0) == 0xC0) {

514 value = unit & 0x1F;	517 value = unit & 0x1F;

515 expectedUnits = extraUnits = 1;	518 expectedUnits = extraUnits = 1;

516 continue loop;	519 continue loop;

517 }	520 }

518 if ((unit & 0xF0) == 0xE0) {	521 if ((unit & 0xF0) == 0xE0) {

519 value = unit & 0x0F;	522 value = unit & 0x0F;

520 expectedUnits = extraUnits = 2;	523 expectedUnits = extraUnits = 2;

521 continue loop;	524 continue loop;

522 }	525 }

523 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.	526 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences.

524 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {	527 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) {

525 value = unit & 0x07;	528 value = unit & 0x07;

526 expectedUnits = extraUnits = 3;	529 expectedUnits = extraUnits = 3;

527 continue loop;	530 continue loop;

528 }	531 }

529 if (!_allowMalformed) {	532 if (!_allowMalformed) {

530 throw new FormatException(	533 throw new FormatException(

531 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");	534 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}");

532 }	535 }

533 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;	536 value = UNICODE_REPLACEMENT_CHARACTER_RUNE;

534 expectedUnits = extraUnits = 0;	537 expectedUnits = extraUnits = 0;

535 _isFirstCharacter = false;	538 _isFirstCharacter = false;

536 _stringSink.writeCharCode(value);	539 _stringSink.writeCharCode(value);

537 }	540 }

538 }	541 }

539 break loop;	542 break loop;

540 }	543 }

541 if (singleBytesCount > 0) {

542 addSingleBytes(i - singleBytesCount, endIndex);

543 }

544 if (expectedUnits > 0) {	544 if (expectedUnits > 0) {

545 _value = value;	545 _value = value;

546 _expectedUnits = expectedUnits;	546 _expectedUnits = expectedUnits;

547 _extraUnits = extraUnits;	547 _extraUnits = extraUnits;

548 }	548 }

549 }	549 }

550 }	550 }

OLD	NEW

« no previous file with comments | « no previous file | no next file » | no next file with comments »