Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(167)

Side by Side Diff: pkg/front_end/lib/src/fasta/scanner/utf8_bytes_scanner.dart

Issue 2664593002: Port parser and scanner fixes from rasta branch. (Closed)
Patch Set: Rebased on ef8ec26cf36d1f07b4fdf5d605003210826ae1c2. Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 library fasta.scanner.utf8_bytes_scanner; 5 library fasta.scanner.utf8_bytes_scanner;
6 6
7 import 'dart:convert' show 7 import 'dart:convert' show
8 UNICODE_BOM_CHARACTER_RUNE, 8 UNICODE_BOM_CHARACTER_RUNE,
9 UTF8; 9 UTF8;
10 10
11 import '../scanner.dart' show
12 unicodeReplacementCharacter;
13
11 import 'precedence.dart' show 14 import 'precedence.dart' show
12 PrecedenceInfo; 15 PrecedenceInfo;
13 16
14 import 'token.dart' show 17 import 'token.dart' show
15 StringToken, 18 StringToken,
16 Token; 19 Token;
17 20
18 import 'array_based_scanner.dart' show 21 import 'array_based_scanner.dart' show
19 ArrayBasedScanner; 22 ArrayBasedScanner;
20 23
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after
96 return offset + 3 < bytes.length && 99 return offset + 3 < bytes.length &&
97 bytes[offset] == BOM_UTF8[0] && 100 bytes[offset] == BOM_UTF8[0] &&
98 bytes[offset + 1] == BOM_UTF8[1] && 101 bytes[offset + 1] == BOM_UTF8[1] &&
99 bytes[offset + 2] == BOM_UTF8[2]; 102 bytes[offset + 2] == BOM_UTF8[2];
100 } 103 }
101 104
102 int advance() => bytes[++byteOffset]; 105 int advance() => bytes[++byteOffset];
103 106
104 int peek() => bytes[byteOffset + 1]; 107 int peek() => bytes[byteOffset + 1];
105 108
106 /** 109 /// Returns the unicode code point starting at the byte offset [startOffset]
107 * Returns the unicode code point starting at the byte offset [startOffset] 110 /// with the byte [nextByte].
108 * with the byte [nextByte]. If [advance] is true the current [byteOffset] 111 int nextCodePoint(int startOffset, int nextByte) {
109 * is advanced to the last byte of the code point. 112 int expectedHighBytes;
110 */ 113 if (nextByte < 0xC2) {
111 int nextCodePoint(int startOffset, int nextByte, bool advance) { 114 expectedHighBytes = 1; // Bad code unit.
112 // The number of 1s in the first byte indicate the number of bytes, at 115 } else if (nextByte < 0xE0) {
113 // least 2. 116 expectedHighBytes = 2;
114 int numBytes = 2; 117 } else if (nextByte < 0xF0) {
115 int bit = 0x20; 118 expectedHighBytes = 3;
116 while ((nextByte & bit) != 0) { 119 } else if (nextByte < 0xF5) {
120 expectedHighBytes = 4;
121 } else {
122 expectedHighBytes = 1; // Bad code unit.
123 }
124 int numBytes = 0;
125 for (int i = 0; i < expectedHighBytes; i++) {
126 if (bytes[byteOffset + i] < 0x80) {
127 break;
128 }
117 numBytes++; 129 numBytes++;
118 bit >>= 1;
119 } 130 }
120 int end = startOffset + numBytes; 131 int end = startOffset + numBytes;
121 if (advance) { 132 byteOffset = end - 1;
122 byteOffset = end - 1; 133 if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {
134 return unicodeReplacementCharacter;
123 } 135 }
124 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a 136 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
125 // _Utf8Decoder instance. Also the sublist is eagerly allocated. 137 // _Utf8Decoder instance. Also the sublist is eagerly allocated.
126 String codePoint = UTF8.decode(bytes.sublist(startOffset, end)); 138 String codePoint =
139 UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);
127 if (codePoint.length == 0) { 140 if (codePoint.length == 0) {
128 // The UTF-8 decoder discards leading BOM characters. 141 // The UTF-8 decoder discards leading BOM characters.
129 // TODO(floitsch): don't just assume that removed characters were the 142 // TODO(floitsch): don't just assume that removed characters were the
130 // BOM. 143 // BOM.
131 assert(containsBomAt(startOffset)); 144 assert(containsBomAt(startOffset));
132 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE); 145 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
133 } 146 }
134 if (codePoint.length == 1) { 147 if (codePoint.length == 1) {
135 if (advance) { 148 utf8Slack += (numBytes - 1);
136 utf8Slack += (numBytes - 1); 149 scanSlack = numBytes - 1;
137 scanSlack = numBytes - 1; 150 scanSlackOffset = byteOffset;
138 scanSlackOffset = byteOffset;
139 }
140 return codePoint.codeUnitAt(0); 151 return codePoint.codeUnitAt(0);
141 } else if (codePoint.length == 2) { 152 } else if (codePoint.length == 2) {
142 if (advance) { 153 utf8Slack += (numBytes - 2);
143 utf8Slack += (numBytes - 2); 154 scanSlack = numBytes - 1;
144 scanSlack = numBytes - 1; 155 scanSlackOffset = byteOffset;
145 scanSlackOffset = byteOffset; 156 stringOffsetSlackOffset = byteOffset;
146 stringOffsetSlackOffset = byteOffset;
147 }
148 // In case of a surrogate pair, return a single code point. 157 // In case of a surrogate pair, return a single code point.
149 return codePoint.runes.single; 158 return codePoint.runes.single;
150 } else { 159 } else {
151 throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}"; 160 return unicodeReplacementCharacter;
152 } 161 }
153 } 162 }
154 163
155 int lastUnicodeOffset = -1; 164 int lastUnicodeOffset = -1;
156 int currentAsUnicode(int next) { 165 int currentAsUnicode(int next) {
157 if (next < 128) return next; 166 if (next < 128) return next;
158 // Check if currentAsUnicode was already invoked. 167 // Check if currentAsUnicode was already invoked.
159 if (byteOffset == lastUnicodeOffset) return next; 168 if (byteOffset == lastUnicodeOffset) return next;
160 int res = nextCodePoint(byteOffset, next, true); 169 int res = nextCodePoint(byteOffset, next);
161 lastUnicodeOffset = byteOffset; 170 lastUnicodeOffset = byteOffset;
162 return res; 171 return res;
163 } 172 }
164 173
165 void handleUnicode(int startScanOffset) { 174 void handleUnicode(int startScanOffset) {
166 int end = byteOffset; 175 int end = byteOffset;
167 // TODO(lry): this measurably slows down the scanner for files with unicode. 176 // TODO(lry): this measurably slows down the scanner for files with unicode.
168 String s = UTF8.decode(bytes.sublist(startScanOffset, end)); 177 String s =
178 UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);
169 utf8Slack += (end - startScanOffset) - s.length; 179 utf8Slack += (end - startScanOffset) - s.length;
170 } 180 }
171 181
172 /** 182 /**
173 * This field remembers the byte offset of the last character decoded with 183 * This field remembers the byte offset of the last character decoded with
174 * [nextCodePoint] that used two code units in UTF-16. 184 * [nextCodePoint] that used two code units in UTF-16.
175 * 185 *
176 * [nextCodePoint] returns a single code point for each unicode character, 186 * [nextCodePoint] returns a single code point for each unicode character,
177 * even if it needs two code units in UTF-16. 187 * even if it needs two code units in UTF-16.
178 * 188 *
(...skipping 18 matching lines...) Expand all
197 207
198 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly, 208 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
199 [int extraOffset = 0]) { 209 [int extraOffset = 0]) {
200 tail.next = new StringToken.fromUtf8Bytes( 210 tail.next = new StringToken.fromUtf8Bytes(
201 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart); 211 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
202 tail = tail.next; 212 tail = tail.next;
203 } 213 }
204 214
205 bool atEndOfFile() => byteOffset >= bytes.length - 1; 215 bool atEndOfFile() => byteOffset >= bytes.length - 1;
206 } 216 }
OLD | NEW
« no previous file with comments | « pkg/front_end/lib/src/fasta/scanner/token.dart ('k') | pkg/front_end/lib/src/fasta/source/source_loader.dart » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698