Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(64)

Side by Side Diff: pkg/dart_scanner/lib/src/utf8_bytes_scanner.dart

Issue 2664593002: Port parser and scanner fixes from rasta branch. (Closed)
Patch Set: Update status files. Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a 2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file. 3 // BSD-style license that can be found in the LICENSE file.
4 4
5 library dart_scanner.utf8_bytes_scanner; 5 library dart_scanner.utf8_bytes_scanner;
6 6
7 import 'dart:convert' show 7 import 'dart:convert' show
8 UNICODE_BOM_CHARACTER_RUNE, 8 UNICODE_BOM_CHARACTER_RUNE,
9 UTF8; 9 UTF8;
10 10
11 import '../dart_scanner.dart' show
12 unicodeReplacementCharacter;
13
11 import 'precedence.dart' show 14 import 'precedence.dart' show
12 PrecedenceInfo; 15 PrecedenceInfo;
13 16
14 import 'token.dart' show 17 import 'token.dart' show
15 StringToken, 18 StringToken,
16 Token; 19 Token;
17 20
18 import 'array_based_scanner.dart' show 21 import 'array_based_scanner.dart' show
19 ArrayBasedScanner; 22 ArrayBasedScanner;
20 23
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after
98 bytes[offset + 1] == BOM_UTF8[1] && 101 bytes[offset + 1] == BOM_UTF8[1] &&
99 bytes[offset + 2] == BOM_UTF8[2]; 102 bytes[offset + 2] == BOM_UTF8[2];
100 } 103 }
101 104
102 int advance() => bytes[++byteOffset]; 105 int advance() => bytes[++byteOffset];
103 106
104 int peek() => bytes[byteOffset + 1]; 107 int peek() => bytes[byteOffset + 1];
105 108
106 /** 109 /**
107 * Returns the unicode code point starting at the byte offset [startOffset] 110 * Returns the unicode code point starting at the byte offset [startOffset]
108 * with the byte [nextByte]. If [advance] is true the current [byteOffset] 111 * with the byte [nextByte]. If [advance] is true the current [byteOffset]
Johnni Winther 2017/01/30 09:04:38 Remove doc about [advance].
ahe 2017/01/30 13:26:22 Done.
109 * is advanced to the last byte of the code point. 112 * is advanced to the last byte of the code point.
110 */ 113 */
111 int nextCodePoint(int startOffset, int nextByte, bool advance) { 114 int nextCodePoint(int startOffset, int nextByte) {
112 // The number of 1s in the first byte indicate the number of bytes, at 115 int expectedHighBytes;
113 // least 2. 116 if (nextByte < 0xC2) {
114 int numBytes = 2; 117 expectedHighBytes = 1; // Bad code unit.
115 int bit = 0x20; 118 } else if (nextByte < 0xE0) {
116 while ((nextByte & bit) != 0) { 119 expectedHighBytes = 2;
120 } else if (nextByte < 0xF0) {
121 expectedHighBytes = 3;
122 } else if (nextByte < 0xF5) {
123 expectedHighBytes = 4;
124 } else {
125 expectedHighBytes = 1; // Bad code unit.
126 }
127 int numBytes = 0;
128 for (int i = 0; i < expectedHighBytes; i++) {
129 if (bytes[byteOffset + i] < 0x80) {
130 break;
131 }
117 numBytes++; 132 numBytes++;
118 bit >>= 1;
119 } 133 }
120 int end = startOffset + numBytes; 134 int end = startOffset + numBytes;
121 if (advance) { 135 byteOffset = end - 1;
122 byteOffset = end - 1; 136 if (expectedHighBytes == 1 || numBytes != expectedHighBytes) {
137 return unicodeReplacementCharacter;
123 } 138 }
124 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a 139 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
125 // _Utf8Decoder instance. Also the sublist is eagerly allocated. 140 // _Utf8Decoder instance. Also the sublist is eagerly allocated.
126 String codePoint = UTF8.decode(bytes.sublist(startOffset, end)); 141 String codePoint =
142 UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true);
127 if (codePoint.length == 0) { 143 if (codePoint.length == 0) {
128 // The UTF-8 decoder discards leading BOM characters. 144 // The UTF-8 decoder discards leading BOM characters.
129 // TODO(floitsch): don't just assume that removed characters were the 145 // TODO(floitsch): don't just assume that removed characters were the
130 // BOM. 146 // BOM.
131 assert(containsBomAt(startOffset)); 147 assert(containsBomAt(startOffset));
132 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE); 148 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
133 } 149 }
134 if (codePoint.length == 1) { 150 if (codePoint.length == 1) {
135 if (advance) { 151 utf8Slack += (numBytes - 1);
136 utf8Slack += (numBytes - 1); 152 scanSlack = numBytes - 1;
137 scanSlack = numBytes - 1; 153 scanSlackOffset = byteOffset;
138 scanSlackOffset = byteOffset;
139 }
140 return codePoint.codeUnitAt(0); 154 return codePoint.codeUnitAt(0);
141 } else if (codePoint.length == 2) { 155 } else if (codePoint.length == 2) {
142 if (advance) { 156 utf8Slack += (numBytes - 2);
143 utf8Slack += (numBytes - 2); 157 scanSlack = numBytes - 1;
144 scanSlack = numBytes - 1; 158 scanSlackOffset = byteOffset;
145 scanSlackOffset = byteOffset; 159 stringOffsetSlackOffset = byteOffset;
146 stringOffsetSlackOffset = byteOffset;
147 }
148 // In case of a surrogate pair, return a single code point. 160 // In case of a surrogate pair, return a single code point.
149 return codePoint.runes.single; 161 return codePoint.runes.single;
150 } else { 162 } else {
151 throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}"; 163 return unicodeReplacementCharacter;
152 } 164 }
153 } 165 }
154 166
155 int lastUnicodeOffset = -1; 167 int lastUnicodeOffset = -1;
156 int currentAsUnicode(int next) { 168 int currentAsUnicode(int next) {
157 if (next < 128) return next; 169 if (next < 128) return next;
158 // Check if currentAsUnicode was already invoked. 170 // Check if currentAsUnicode was already invoked.
159 if (byteOffset == lastUnicodeOffset) return next; 171 if (byteOffset == lastUnicodeOffset) return next;
160 int res = nextCodePoint(byteOffset, next, true); 172 int res = nextCodePoint(byteOffset, next);
161 lastUnicodeOffset = byteOffset; 173 lastUnicodeOffset = byteOffset;
162 return res; 174 return res;
163 } 175 }
164 176
165 void handleUnicode(int startScanOffset) { 177 void handleUnicode(int startScanOffset) {
166 int end = byteOffset; 178 int end = byteOffset;
167 // TODO(lry): this measurably slows down the scanner for files with unicode. 179 // TODO(lry): this measurably slows down the scanner for files with unicode.
168 String s = UTF8.decode(bytes.sublist(startScanOffset, end)); 180 String s =
181 UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true);
169 utf8Slack += (end - startScanOffset) - s.length; 182 utf8Slack += (end - startScanOffset) - s.length;
170 } 183 }
171 184
172 /** 185 /**
173 * This field remembers the byte offset of the last character decoded with 186 * This field remembers the byte offset of the last character decoded with
174 * [nextCodePoint] that used two code units in UTF-16. 187 * [nextCodePoint] that used two code units in UTF-16.
175 * 188 *
176 * [nextCodePoint] returns a single code point for each unicode character, 189 * [nextCodePoint] returns a single code point for each unicode character,
177 * even if it needs two code units in UTF-16. 190 * even if it needs two code units in UTF-16.
178 * 191 *
(...skipping 18 matching lines...) Expand all
197 210
198 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly, 211 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
199 [int extraOffset = 0]) { 212 [int extraOffset = 0]) {
200 tail.next = new StringToken.fromUtf8Bytes( 213 tail.next = new StringToken.fromUtf8Bytes(
201 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart); 214 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
202 tail = tail.next; 215 tail = tail.next;
203 } 216 }
204 217
205 bool atEndOfFile() => byteOffset >= bytes.length - 1; 218 bool atEndOfFile() => byteOffset >= bytes.length - 1;
206 } 219 }
OLD | NEW

Powered by Google App Engine
This is Rietveld 408576698