sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart - Issue 27510003: Scanner for UTF-8 byte arrays

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Re-add ArrayBasedScanner, minor fixes. Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« sdk/lib/_internal/compiler/implementation/scanner/token.dart ('K') | « sdk/lib/_internal/compiler/implementation/scanner/token.dart ('k') | sdk/lib/_internal/compiler/implementation/script.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 part of scanner;

	6

	7 /**

	8 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens

	9 * that point to substrings.

	10 */

	11 class Utf8BytesScanner extends ArrayBasedScanner {

	12 /** The file content. */

	13 List<int> bytes;

	14

	15 /**

	16 * Points to the offset of the byte last returned by [advance].

	17 *

	18 * After invoking [currentAsUnicode], the [byteOffset] points to the last

	19 * byte that is part of the (unicode or ASCII) character. That way, [advance]

	20 * can always increase the byte offset by 1.

	21 */

	22 int byteOffset = -1;

	23

	24 /**

	25 * The getter [scanOffset] is expected to return the index where the current

	26 * character starts. In case of a non-ascii character, after invoking

	27 * [currentAsUnicode], the byte offset points to the last byte.

	28 *

	29 * This field keeps track of the number of bytes for the current unicode

	30 * character. For example, if bytes 7,8,9 encode one unicode character, the

	31 * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]

	32 * will be 2, so that [scanOffset] returns 7.

	33 */

	34 int scanSlack = 0;

	35

	36 /**

	37 * Holds the [byteOffset] value for which the current [scanSlack] is valid.

	38 */

	39 int scanSlackOffset = -1;

	40

	/**
	 * The byte offset at which the current character starts.
	 *
	 * If [scanSlack] was recorded for the current [byteOffset] (that is,
	 * [byteOffset] equals [scanSlackOffset]), the slack is subtracted so that
	 * the result is the first byte of the character. Otherwise this is simply
	 * [byteOffset].
	 */
	int get scanOffset =>
	    (byteOffset == scanSlackOffset) ? byteOffset - scanSlack : byteOffset;

	52

	53 /**

	54 * The difference between the number of bytes and the number of corresponding

	55 * string characters, up to the current [byteOffset].

	56 */

	57 int utf8Slack = 0;

	58

	59 /**

	60 * Creates a new Utf8BytesScanner. The source file is expected to be a

	61 * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the

	62 * string text of the source file is decoded.

	63 *

	64 * [file.slowUtf8Bytes()] is expected to return a list of UTF-8 bytes

	65 * whose last element is 0 (a zero byte) to signal the end of the file. If

	66 * this is not the case, the entire array is copied before scanning.

	67 */

	68 Utf8BytesScanner(SourceFile file, {bool includeComments: false})

	69 : bytes = file.slowUtf8Bytes(),

	70 super(file, includeComments) {

	71 ensureZeroTermination();

	72 }

	73

	74 /**

	75 * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.

	76 *

	77 * The last element of the list is expected to be 0 (a zero byte) to signal

	78 * the end of the file. If this is not the case, the entire array is copied

	79 * before scanning.

	80 */

	81 Utf8BytesScanner.fromBytes(this.bytes, {bool includeComments: false})

	82 : super(null, includeComments) {

	83 ensureZeroTermination();

	84 }

	85

	/**
	 * Guarantees that [bytes] ends with a 0 byte, which signals the end of the
	 * file to the scanner.
	 *
	 * If [bytes] is empty or its last element is not 0, [bytes] is replaced by
	 * a copy that is one element longer and ends with 0. Otherwise [bytes] is
	 * left untouched.
	 */
	void ensureZeroTermination() {
	  if (bytes.isEmpty || bytes[bytes.length - 1] != 0) {
	    // TODO(lry), abort instead of copying the array, or warn?
	    var newBytes = new Uint8List(bytes.length + 1);
	    // A Uint8List has a fixed length, so addAll() would throw an
	    // UnsupportedError. Copy the existing content into place instead.
	    newBytes.setRange(0, bytes.length, bytes);
	    newBytes[bytes.length] = 0;
	    bytes = newBytes;
	  }
	}

	95

	/** Moves [byteOffset] forward by one and returns the byte found there. */
	int advance() {
	  byteOffset++;
	  return bytes[byteOffset];
	}

	97

	/** Returns the byte that follows [byteOffset] without moving the offset. */
	int peek() {
	  int nextOffset = byteOffset + 1;
	  return bytes[nextOffset];
	}

	99

	/**
	 * Decodes the multi-byte UTF-8 sequence that begins at [startOffset] with
	 * the byte [nextByte] and returns it as a single unicode code point.
	 *
	 * If [advance] is true, [byteOffset] is moved to the last byte of the
	 * sequence, and [utf8Slack], [scanSlack] and [scanSlackOffset] are updated
	 * for the decoded character. When the character needs two UTF-16 code
	 * units, [stringOffsetSlackOffset] is updated as well.
	 *
	 * Throws a string describing the byte sequence if the decoded text is not
	 * exactly one or two UTF-16 code units long.
	 */
	int nextCodePoint(int startOffset, int nextByte, bool advance) {
	  // The sequence has at least 2 bytes. Every additional 1 bit in the first
	  // byte, checked from 0x20 downwards, adds one more byte.
	  int numBytes = 2;
	  for (int mask = 0x20; (nextByte & mask) != 0; mask >>= 1) {
	    numBytes++;
	  }
	  int end = startOffset + numBytes;
	  if (advance) byteOffset = end - 1;

	  // TODO(lry), measurably slow, decode creates first a Utf8Decoder and a
	  // _Utf8Decoder instance. Also the sublist is eagerly allocated.
	  String decoded = UTF8.decode(bytes.sublist(startOffset, end));
	  int codeUnits = decoded.length;
	  if (codeUnits != 1 && codeUnits != 2) {
	    throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
	  }

	  if (advance) {
	    // The slack grows by the bytes that have no matching UTF-16 code unit.
	    utf8Slack += numBytes - codeUnits;
	    scanSlack = numBytes - 1;
	    scanSlackOffset = byteOffset;
	    if (codeUnits == 2) stringOffsetSlackOffset = byteOffset;
	  }

	  // In case of a surrogate pair, return a single code point.
	  return codeUnits == 1 ? decoded.codeUnitAt(0) : decoded.runes.single;
	}

	141

	142 int lastUnicodeOffset = -1;

	/**
	 * Returns the unicode code point for the current byte [next].
	 *
	 * Values below 128 are returned unchanged. [next] is also returned
	 * unchanged if this method already ran for the current [byteOffset].
	 * Otherwise the character is decoded with [nextCodePoint], which advances
	 * [byteOffset], and the new offset is remembered in [lastUnicodeOffset].
	 */
	int currentAsUnicode(int next) {
	  bool isAscii = next < 128;
	  // The second test detects that currentAsUnicode was already invoked.
	  if (isAscii || byteOffset == lastUnicodeOffset) return next;
	  int codePoint = nextCodePoint(byteOffset, next, true);
	  lastUnicodeOffset = byteOffset;
	  return codePoint;
	}

	151

	/**
	 * Adds to [utf8Slack] the difference between the number of bytes in the
	 * range from [startScanOffset] up to (not including) [byteOffset] and the
	 * length of the string those bytes decode to.
	 */
	void handleUnicode(int startScanOffset) {
	  int byteCount = byteOffset - startScanOffset;
	  // TODO(lry), this measurably slows down the scanner for files with unicode.
	  String decoded = UTF8.decode(bytes.sublist(startScanOffset, byteOffset));
	  utf8Slack += byteCount - decoded.length;
	}

	158
	kasperl 2013/10/17 08:50:39: That's a lot of whitespace. lukas 2013/10/17 17:49:34: Done.
	159

	160

	161 /**

	162 * This field remembers the byte offset of the last character decoded with

	163 * [nextCodePoint] that used two code units in UTF-16.

	164 *

	165 * [nextCodePoint] returns a single code point for each unicode character,

	166 * even if it needs two code units in UTF-16.

	167 *

	168 * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in

	169 * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the

	170 * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should

	171 * return the offset of the first code unit, which is one position further

	172 * left than [byteOffset] minus [utf8Slack].

	173 */

	174 int stringOffsetSlackOffset = -1;

	175

	/**
	 * The current offset expressed in string characters: [byteOffset] minus
	 * [utf8Slack].
	 *
	 * One more position is subtracted when [stringOffsetSlackOffset] equals
	 * the current [byteOffset].
	 */
	int get stringOffset {
	  int offset = byteOffset - utf8Slack;
	  return (stringOffsetSlackOffset == byteOffset) ? offset - 1 : offset;
	}

	183

	/** Returns the token that follows [tokens]. */
	Token firstToken() {
	  return tokens.next;
	}

	/** Returns [tail], the token that was appended last. */
	Token previousToken() {
	  return tail;
	}

	186

	187

	/**
	 * Appends a [StringToken] created from [bytes] to the token chain and makes
	 * it the new [tail].
	 *
	 * [start] and `byteOffset + extraOffset` are passed to
	 * [StringToken.fromUtf8Bytes] together with [info], [asciiOnly] and
	 * [tokenStart].
	 */
	void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
	    [int extraOffset = 0]) {
	  int end = byteOffset + extraOffset;
	  tail.next = new StringToken.fromUtf8Bytes(
	      info, bytes, start, end, asciiOnly, tokenStart);
	  tail = tail.next;
	}

	194 }

OLD	NEW