| OLD | NEW |
| 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 part of dart.convert; | 5 part of dart.convert; |
| 6 | 6 |
| 7 /** The Unicode Replacement character `U+FFFD` (�). */ | 7 /** The Unicode Replacement character `U+FFFD` (�). */ |
| 8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; | 8 const int UNICODE_REPLACEMENT_CHARACTER_RUNE = 0xFFFD; |
| 9 | 9 |
| 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ | 10 /** The Unicode Byte Order Marker (BOM) character `U+FEFF`. */ |
| (...skipping 402 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 413 _value = 0; | 413 _value = 0; |
| 414 _expectedUnits = 0; | 414 _expectedUnits = 0; |
| 415 _extraUnits = 0; | 415 _extraUnits = 0; |
| 416 } | 416 } |
| 417 } | 417 } |
| 418 | 418 |
| 419 void convert(List<int> codeUnits, int startIndex, int endIndex) { | 419 void convert(List<int> codeUnits, int startIndex, int endIndex) { |
| 420 int value = _value; | 420 int value = _value; |
| 421 int expectedUnits = _expectedUnits; | 421 int expectedUnits = _expectedUnits; |
| 422 int extraUnits = _extraUnits; | 422 int extraUnits = _extraUnits; |
| 423 int singleBytesCount = 0; | |
| 424 _value = 0; | 423 _value = 0; |
| 425 _expectedUnits = 0; | 424 _expectedUnits = 0; |
| 426 _extraUnits = 0; | 425 _extraUnits = 0; |
| 427 | 426 |
| 427 int scanOneByteCharacters(units, int from) { |
| 428 final to = endIndex; |
| 429 final mask = ~_ONE_BYTE_LIMIT; |
| 430 for (var i = from; i < to; i++) { |
| 431 if ((units[i] & mask) != 0) return i - from; |
| 432 } |
| 433 return to - from; |
| 434 } |
| 435 |
| 428 void addSingleBytes(int from, int to) { | 436 void addSingleBytes(int from, int to) { |
| 429 assert(singleBytesCount > 0); | |
| 430 assert(from >= startIndex && from <= endIndex); | 437 assert(from >= startIndex && from <= endIndex); |
| 431 assert(to >= startIndex && to <= endIndex); | 438 assert(to >= startIndex && to <= endIndex); |
| 432 if (from == 0 && to == codeUnits.length) { | 439 if (from == 0 && to == codeUnits.length) { |
| 433 _stringSink.write(new String.fromCharCodes(codeUnits)); | 440 _stringSink.write(new String.fromCharCodes(codeUnits)); |
| 434 } else { | 441 } else { |
| 435 _stringSink.write( | 442 _stringSink.write( |
| 436 new String.fromCharCodes(codeUnits.sublist(from, to))); | 443 new String.fromCharCodes(codeUnits.sublist(from, to))); |
| 437 } | 444 } |
| 438 singleBytesCount = 0; | |
| 439 } | 445 } |
| 440 | 446 |
| 441 int i = startIndex; | 447 int i = startIndex; |
| 442 loop: while (true) { | 448 loop: while (true) { |
| 443 multibyte: if (expectedUnits > 0) { | 449 multibyte: if (expectedUnits > 0) { |
| 444 do { | 450 do { |
| 445 if (i == endIndex) { | 451 if (i == endIndex) { |
| 446 break loop; | 452 break loop; |
| 447 } | 453 } |
| 448 int unit = codeUnits[i]; | 454 int unit = codeUnits[i]; |
| (...skipping 29 matching lines...) Expand all Loading... |
| 478 } | 484 } |
| 479 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 485 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
| 480 } | 486 } |
| 481 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { | 487 if (!_isFirstCharacter || value != UNICODE_BOM_CHARACTER_RUNE) { |
| 482 _stringSink.writeCharCode(value); | 488 _stringSink.writeCharCode(value); |
| 483 } | 489 } |
| 484 _isFirstCharacter = false; | 490 _isFirstCharacter = false; |
| 485 } | 491 } |
| 486 | 492 |
| 487 while (i < endIndex) { | 493 while (i < endIndex) { |
| 494 int oneBytes = scanOneByteCharacters(codeUnits, i); |
| 495 if (oneBytes > 0) { |
| 496 _isFirstCharacter = false; |
| 497 addSingleBytes(i, i + oneBytes); |
| 498 i += oneBytes; |
| 499 if (i == endIndex) break; |
| 500 } |
| 488 int unit = codeUnits[i++]; | 501 int unit = codeUnits[i++]; |
| 489 // TODO(floitsch): the way we test we could potentially allow | 502 // TODO(floitsch): the way we test we could potentially allow |
| 490 // units that are too large, if they happen to have the | 503 // units that are too large, if they happen to have the |
| 491 // right bit-pattern. (Same is true for the multibyte loop above). | 504 // right bit-pattern. (Same is true for the multibyte loop above). |
| 492 // TODO(floitsch): optimize this loop. See: | 505 // TODO(floitsch): optimize this loop. See: |
| 493 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 | 506 // https://codereview.chromium.org/22929022/diff/1/sdk/lib/convert/utf.dart?column_width=80 |
| 494 if (unit < 0) { | 507 if (unit < 0) { |
| 495 // TODO(floitsch): should this be unit <= 0 ? | 508 // TODO(floitsch): should this be unit <= 0 ? |
| 496 if (singleBytesCount > 0) { | |
| 497 int to = i - 1; | |
| 498 addSingleBytes(to - singleBytesCount, to); | |
| 499 } | |
| 500 if (!_allowMalformed) { | 509 if (!_allowMalformed) { |
| 501 throw new FormatException( | 510 throw new FormatException( |
| 502 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); | 511 "Negative UTF-8 code unit: -0x${(-unit).toRadixString(16)}"); |
| 503 } | 512 } |
| 504 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); | 513 _stringSink.writeCharCode(UNICODE_REPLACEMENT_CHARACTER_RUNE); |
| 505 } else if (unit <= _ONE_BYTE_LIMIT) { | |
| 506 _isFirstCharacter = false; | |
| 507 singleBytesCount++; | |
| 508 } else { | 514 } else { |
| 509 if (singleBytesCount > 0) { | 515 assert(unit > _ONE_BYTE_LIMIT); |
| 510 int to = i - 1; | |
| 511 addSingleBytes(to - singleBytesCount, to); | |
| 512 } | |
| 513 if ((unit & 0xE0) == 0xC0) { | 516 if ((unit & 0xE0) == 0xC0) { |
| 514 value = unit & 0x1F; | 517 value = unit & 0x1F; |
| 515 expectedUnits = extraUnits = 1; | 518 expectedUnits = extraUnits = 1; |
| 516 continue loop; | 519 continue loop; |
| 517 } | 520 } |
| 518 if ((unit & 0xF0) == 0xE0) { | 521 if ((unit & 0xF0) == 0xE0) { |
| 519 value = unit & 0x0F; | 522 value = unit & 0x0F; |
| 520 expectedUnits = extraUnits = 2; | 523 expectedUnits = extraUnits = 2; |
| 521 continue loop; | 524 continue loop; |
| 522 } | 525 } |
| 523 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. | 526 // 0xF5, 0xF6 ... 0xFF never appear in valid UTF-8 sequences. |
| 524 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { | 527 if ((unit & 0xF8) == 0xF0 && unit < 0xF5) { |
| 525 value = unit & 0x07; | 528 value = unit & 0x07; |
| 526 expectedUnits = extraUnits = 3; | 529 expectedUnits = extraUnits = 3; |
| 527 continue loop; | 530 continue loop; |
| 528 } | 531 } |
| 529 if (!_allowMalformed) { | 532 if (!_allowMalformed) { |
| 530 throw new FormatException( | 533 throw new FormatException( |
| 531 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); | 534 "Bad UTF-8 encoding 0x${unit.toRadixString(16)}"); |
| 532 } | 535 } |
| 533 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; | 536 value = UNICODE_REPLACEMENT_CHARACTER_RUNE; |
| 534 expectedUnits = extraUnits = 0; | 537 expectedUnits = extraUnits = 0; |
| 535 _isFirstCharacter = false; | 538 _isFirstCharacter = false; |
| 536 _stringSink.writeCharCode(value); | 539 _stringSink.writeCharCode(value); |
| 537 } | 540 } |
| 538 } | 541 } |
| 539 break loop; | 542 break loop; |
| 540 } | 543 } |
| 541 if (singleBytesCount > 0) { | |
| 542 addSingleBytes(i - singleBytesCount, endIndex); | |
| 543 } | |
| 544 if (expectedUnits > 0) { | 544 if (expectedUnits > 0) { |
| 545 _value = value; | 545 _value = value; |
| 546 _expectedUnits = expectedUnits; | 546 _expectedUnits = expectedUnits; |
| 547 _extraUnits = extraUnits; | 547 _extraUnits = extraUnits; |
| 548 } | 548 } |
| 549 } | 549 } |
| 550 } | 550 } |
| OLD | NEW |