sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart - Issue 27510003: Scanner for UTF-8 byte arrays

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart

Issue 27510003: Scanner for UTF-8 byte arrays (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: fixes compiler tests Created 7 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

« sdk/lib/_internal/compiler/implementation/scanner/token.dart ('K') | « sdk/lib/_internal/compiler/implementation/scanner/token.dart ('k') | sdk/lib/_internal/compiler/implementation/script.dart » ('j') | sdk/lib/_internal/compiler/implementation/source_file.dart » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
	ngeoffray (2013/10/18 10:19:37): 2011 -> 2013. lukas (2013/10/24 16:48:36): Done.
	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 part of scanner;

	6

	7 /**

	8 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens

	9 * that point to substrings.

	10 */

	11 class Utf8BytesScanner extends ArrayBasedScanner {

	12 /** The file content. */

	13 List<int> bytes;

	14

	15 /**

	16 * Points to the offset of the byte last returned by [advance].
	ngeoffray (2013/10/18 10:19:37): byte last -> last byte? lukas (2013/10/24 16:48:36): Done.
	17 *

	18 * After invoking [currentAsUnicode], the [byteOffset] points to the last

	19 * byte that is part of the (unicode or ASCII) character. That way, [advance]

	20 * can always increase the byte offset by 1.

	21 */

	22 int byteOffset = -1;

	23

	24 /**

	25 * The getter [scanOffset] is expected to return the index where the current

	26 * character starts. In case of a non-ascii character, after invoking

	27 * [currentAsUnicode], the byte offset points to the last byte.

	28 *

	29 * This field holds the number of bytes that precede the last byte of the

	30 * current unicode character. For example, if bytes 7,8,9 encode one character, the

	31 * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]

	32 * will be 2, so that [scanOffset] returns 7.

	33 */

	34 int scanSlack = 0;

	35

	36 /**

	37 * Holds the [byteOffset] value for which the current [scanSlack] is valid.

	38 */

	39 int scanSlackOffset = -1;

	40

	41 /**

	42 * Returns the byte offset of the first byte that belongs to the current

	43 * character.

	44 */

	45 int get scanOffset {

	46 if (byteOffset == scanSlackOffset) {

	47 return byteOffset - scanSlack;

	48 } else {

	49 return byteOffset;

	50 }

	51 }

	52

	53 /**

	54 * The difference between the number of bytes and the number of corresponding

	55 * string characters, up to the current [byteOffset].

	56 */

	57 int utf8Slack = 0;

	58

	59 /**

	60 * Creates a new Utf8BytesScanner. The source file is expected to be a

	61 * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the

	62 * string text of the source file is decoded.

	63 *

	64 * The call [file.slowUtf8Bytes()] is expected to return a list of UTF-8 bytes

	65 * whose last element is 0 (a NUL byte) to signal the end of the file. If this

	66 * is not the case, the entire list is copied before scanning.

	67 */

	68 Utf8BytesScanner(SourceFile file, {bool includeComments: false})

	69 : bytes = file.slowUtf8Bytes(),

	70 super(file, includeComments) {

	71 ensureZeroTermination();

	72 }

	73

	74 /**

	75 * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.

	76 *

	77 * The last element of the list is expected to be 0 (a NUL byte) to signal the

	78 * end of the file. If this is not the case, the entire list is copied before

	79 * scanning.

	80 */

	81 Utf8BytesScanner.fromBytes(this.bytes, {bool includeComments: false})

	82 : super(null, includeComments) {

	83 ensureZeroTermination();

	84 }

	85

	86 void ensureZeroTermination() {

	87 if (bytes.isEmpty || bytes[bytes.length - 1] != 0) {

	88 // TODO(lry): abort instead of copying the array, or warn?

	89 var newBytes = new Uint8List(bytes.length + 1);

	90 for (int i = 0; i < bytes.length; i++) {

	91 newBytes[i] = bytes[i];

	92 }

	93 newBytes[bytes.length] = 0;

	94 bytes = newBytes;

	95 }

	96 }

	97

	98 int advance() => bytes[++byteOffset];

	99

	100 int peek() => bytes[byteOffset + 1];

	101

	102 /**

	103 * Returns the unicode code point starting at the byte offset [startOffset]

	104 * with the byte [nextByte]. If [advance] is true the current [byteOffset]

	105 * is advanced to the last byte of the code point.

	106 */

	107 int nextCodePoint(int startOffset, int nextByte, bool advance) {

	108 // The number of leading 1 bits in the first byte indicates the number of

	109 // bytes in the sequence, at least 2.

	110 int numBytes = 2;

	111 int bit = 0x20;

	112 while ((nextByte & bit) != 0) {

	113 numBytes++;

	114 bit >>= 1;

	115 }

	116 int end = startOffset + numBytes;

	117 if (advance) {

	118 byteOffset = end - 1;

	119 }

	120 // TODO(lry): measurably slow; decode first creates a Utf8Decoder and a

	121 // _Utf8Decoder instance. Also the sublist is eagerly allocated.

	122 String codePoint = UTF8.decode(bytes.sublist(startOffset, end));

	123 if (codePoint.length == 1) {

	124 if (advance) {

	125 utf8Slack += (numBytes - 1);

	126 scanSlack = numBytes - 1;

	127 scanSlackOffset = byteOffset;

	128 }

	129 return codePoint.codeUnitAt(0);

	130 } else if (codePoint.length == 2) {

	131 if (advance) {

	132 utf8Slack += (numBytes - 2);

	133 scanSlack = numBytes - 1;

	134 scanSlackOffset = byteOffset;

	135 stringOffsetSlackOffset = byteOffset;

	136 }

	137 // In case of a surrogate pair, return a single code point.

	138 return codePoint.runes.single;

	139 } else {

	140 throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
	ngeoffray (2013/10/18 10:19:37): Will users face this error? If yes, we should throw differently. lukas (2013/10/24 16:48:36): That could only happen in a file with wrong UTF-8 encoding, in which case UTF8.decode would probably fail earlier. I could also just remove the check.
	141 }

	142 }

	143

	144 int lastUnicodeOffset = -1;

	145 int currentAsUnicode(int next) {

	146 if (next < 128) return next;

	147 // Check if currentAsUnicode was already invoked.

	148 if (byteOffset == lastUnicodeOffset) return next;

	149 int res = nextCodePoint(byteOffset, next, true);

	150 lastUnicodeOffset = byteOffset;

	151 return res;

	152 }

	153

	154 void handleUnicode(int startScanOffset) {

	155 int end = byteOffset;

	156 // TODO(lry): this measurably slows down the scanner for files with unicode.

	157 String s = UTF8.decode(bytes.sublist(startScanOffset, end));

	158 utf8Slack += (end - startScanOffset) - s.length;

	159 }

	160

	161 /**

	162 * This field remembers the byte offset of the last character decoded with

	163 * [nextCodePoint] that used two code units in UTF-16.

	164 *

	165 * [nextCodePoint] returns a single code point for each unicode character,

	166 * even if it needs two code units in UTF-16.

	167 *

	168 * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in

	169 * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the

	170 * [byteOffset] points to the last of the 4 bytes. The [stringOffset] should

	171 * return the offset of the first code unit, which is one position further left

	172 * than [byteOffset] - [utf8Slack].

	173 */

	174 int stringOffsetSlackOffset = -1;

	175

	176 int get stringOffset {

	177 if (stringOffsetSlackOffset == byteOffset) {

	178 return byteOffset - utf8Slack - 1;

	179 } else {

	180 return byteOffset - utf8Slack;

	181 }

	182 }

	183

	184 Token firstToken() => tokens.next;

	185 Token previousToken() => tail;

	186

	187
	ngeoffray (2013/10/18 10:19:37): Extra line. lukas (2013/10/24 16:48:36): Done.
	188 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,

	189 [int extraOffset = 0]) {

	190 tail.next = new StringToken.fromUtf8Bytes(

	191 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);

	192 tail = tail.next;

	193 }

	194 }

OLD	NEW