| OLD | NEW |
| 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file | 1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file |
| 2 // for details. All rights reserved. Use of this source code is governed by a | 2 // for details. All rights reserved. Use of this source code is governed by a |
| 3 // BSD-style license that can be found in the LICENSE file. | 3 // BSD-style license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 library fasta.scanner.utf8_bytes_scanner; | 5 library fasta.scanner.utf8_bytes_scanner; |
| 6 | 6 |
| 7 import 'dart:convert' show | 7 import 'dart:convert' show |
| 8 UNICODE_BOM_CHARACTER_RUNE, | 8 UNICODE_BOM_CHARACTER_RUNE, |
| 9 UTF8; | 9 UTF8; |
| 10 | 10 |
| 11 import '../scanner.dart' show |
| 12 unicodeReplacementCharacter; |
| 13 |
| 11 import 'precedence.dart' show | 14 import 'precedence.dart' show |
| 12 PrecedenceInfo; | 15 PrecedenceInfo; |
| 13 | 16 |
| 14 import 'token.dart' show | 17 import 'token.dart' show |
| 15 StringToken, | 18 StringToken, |
| 16 Token; | 19 Token; |
| 17 | 20 |
| 18 import 'array_based_scanner.dart' show | 21 import 'array_based_scanner.dart' show |
| 19 ArrayBasedScanner; | 22 ArrayBasedScanner; |
| 20 | 23 |
| (...skipping 75 matching lines...) | (...skipping 75 matching lines...) |
| 96 return offset + 3 < bytes.length && | 99 return offset + 3 < bytes.length && |
| 97 bytes[offset] == BOM_UTF8[0] && | 100 bytes[offset] == BOM_UTF8[0] && |
| 98 bytes[offset + 1] == BOM_UTF8[1] && | 101 bytes[offset + 1] == BOM_UTF8[1] && |
| 99 bytes[offset + 2] == BOM_UTF8[2]; | 102 bytes[offset + 2] == BOM_UTF8[2]; |
| 100 } | 103 } |
| 101 | 104 |
| 102 int advance() => bytes[++byteOffset]; | 105 int advance() => bytes[++byteOffset]; |
| 103 | 106 |
| 104 int peek() => bytes[byteOffset + 1]; | 107 int peek() => bytes[byteOffset + 1]; |
| 105 | 108 |
| 106 /** | 109 /// Returns the unicode code point starting at the byte offset [startOffset] |
| 107 * Returns the unicode code point starting at the byte offset [startOffset] | 110 /// with the byte [nextByte]. |
| 108 * with the byte [nextByte]. If [advance] is true the current [byteOffset] | 111 int nextCodePoint(int startOffset, int nextByte) { |
| 109 * is advanced to the last byte of the code point. | 112 int expectedHighBytes; |
| 110 */ | 113 if (nextByte < 0xC2) { |
| 111 int nextCodePoint(int startOffset, int nextByte, bool advance) { | 114 expectedHighBytes = 1; // Bad code unit. |
| 112 // The number of 1s in the first byte indicate the number of bytes, at | 115 } else if (nextByte < 0xE0) { |
| 113 // least 2. | 116 expectedHighBytes = 2; |
| 114 int numBytes = 2; | 117 } else if (nextByte < 0xF0) { |
| 115 int bit = 0x20; | 118 expectedHighBytes = 3; |
| 116 while ((nextByte & bit) != 0) { | 119 } else if (nextByte < 0xF5) { |
| 120 expectedHighBytes = 4; |
| 121 } else { |
| 122 expectedHighBytes = 1; // Bad code unit. |
| 123 } |
| 124 int numBytes = 0; |
| 125 for (int i = 0; i < expectedHighBytes; i++) { |
| 126 if (bytes[byteOffset + i] < 0x80) { |
| 127 break; |
| 128 } |
| 117 numBytes++; | 129 numBytes++; |
| 118 bit >>= 1; | |
| 119 } | 130 } |
| 120 int end = startOffset + numBytes; | 131 int end = startOffset + numBytes; |
| 121 if (advance) { | 132 byteOffset = end - 1; |
| 122 byteOffset = end - 1; | 133 if (expectedHighBytes == 1 || numBytes != expectedHighBytes) { |
| 134 return unicodeReplacementCharacter; |
| 123 } | 135 } |
| 124 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a | 136 // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a |
| 125 // _Utf8Decoder instance. Also the sublist is eagerly allocated. | 137 // _Utf8Decoder instance. Also the sublist is eagerly allocated. |
| 126 String codePoint = UTF8.decode(bytes.sublist(startOffset, end)); | 138 String codePoint = |
| 139 UTF8.decode(bytes.sublist(startOffset, end), allowMalformed: true); |
| 127 if (codePoint.length == 0) { | 140 if (codePoint.length == 0) { |
| 128 // The UTF-8 decoder discards leading BOM characters. | 141 // The UTF-8 decoder discards leading BOM characters. |
| 129 // TODO(floitsch): don't just assume that removed characters were the | 142 // TODO(floitsch): don't just assume that removed characters were the |
| 130 // BOM. | 143 // BOM. |
| 131 assert(containsBomAt(startOffset)); | 144 assert(containsBomAt(startOffset)); |
| 132 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE); | 145 codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE); |
| 133 } | 146 } |
| 134 if (codePoint.length == 1) { | 147 if (codePoint.length == 1) { |
| 135 if (advance) { | 148 utf8Slack += (numBytes - 1); |
| 136 utf8Slack += (numBytes - 1); | 149 scanSlack = numBytes - 1; |
| 137 scanSlack = numBytes - 1; | 150 scanSlackOffset = byteOffset; |
| 138 scanSlackOffset = byteOffset; | |
| 139 } | |
| 140 return codePoint.codeUnitAt(0); | 151 return codePoint.codeUnitAt(0); |
| 141 } else if (codePoint.length == 2) { | 152 } else if (codePoint.length == 2) { |
| 142 if (advance) { | 153 utf8Slack += (numBytes - 2); |
| 143 utf8Slack += (numBytes - 2); | 154 scanSlack = numBytes - 1; |
| 144 scanSlack = numBytes - 1; | 155 scanSlackOffset = byteOffset; |
| 145 scanSlackOffset = byteOffset; | 156 stringOffsetSlackOffset = byteOffset; |
| 146 stringOffsetSlackOffset = byteOffset; | |
| 147 } | |
| 148 // In case of a surrogate pair, return a single code point. | 157 // In case of a surrogate pair, return a single code point. |
| 149 return codePoint.runes.single; | 158 return codePoint.runes.single; |
| 150 } else { | 159 } else { |
| 151 throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}"; | 160 return unicodeReplacementCharacter; |
| 152 } | 161 } |
| 153 } | 162 } |
| 154 | 163 |
| 155 int lastUnicodeOffset = -1; | 164 int lastUnicodeOffset = -1; |
| 156 int currentAsUnicode(int next) { | 165 int currentAsUnicode(int next) { |
| 157 if (next < 128) return next; | 166 if (next < 128) return next; |
| 158 // Check if currentAsUnicode was already invoked. | 167 // Check if currentAsUnicode was already invoked. |
| 159 if (byteOffset == lastUnicodeOffset) return next; | 168 if (byteOffset == lastUnicodeOffset) return next; |
| 160 int res = nextCodePoint(byteOffset, next, true); | 169 int res = nextCodePoint(byteOffset, next); |
| 161 lastUnicodeOffset = byteOffset; | 170 lastUnicodeOffset = byteOffset; |
| 162 return res; | 171 return res; |
| 163 } | 172 } |
| 164 | 173 |
| 165 void handleUnicode(int startScanOffset) { | 174 void handleUnicode(int startScanOffset) { |
| 166 int end = byteOffset; | 175 int end = byteOffset; |
| 167 // TODO(lry): this measurably slows down the scanner for files with unicode. | 176 // TODO(lry): this measurably slows down the scanner for files with unicode. |
| 168 String s = UTF8.decode(bytes.sublist(startScanOffset, end)); | 177 String s = |
| 178 UTF8.decode(bytes.sublist(startScanOffset, end), allowMalformed: true); |
| 169 utf8Slack += (end - startScanOffset) - s.length; | 179 utf8Slack += (end - startScanOffset) - s.length; |
| 170 } | 180 } |
| 171 | 181 |
| 172 /** | 182 /** |
| 173 * This field remembers the byte offset of the last character decoded with | 183 * This field remembers the byte offset of the last character decoded with |
| 174 * [nextCodePoint] that used two code units in UTF-16. | 184 * [nextCodePoint] that used two code units in UTF-16. |
| 175 * | 185 * |
| 176 * [nextCodePoint] returns a single code point for each unicode character, | 186 * [nextCodePoint] returns a single code point for each unicode character, |
| 177 * even if it needs two code units in UTF-16. | 187 * even if it needs two code units in UTF-16. |
| 178 * | 188 * |
| (...skipping 18 matching lines...) | (...skipping 18 matching lines...) |
| 197 | 207 |
| 198 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly, | 208 void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly, |
| 199 [int extraOffset = 0]) { | 209 [int extraOffset = 0]) { |
| 200 tail.next = new StringToken.fromUtf8Bytes( | 210 tail.next = new StringToken.fromUtf8Bytes( |
| 201 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart); | 211 info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart); |
| 202 tail = tail.next; | 212 tail = tail.next; |
| 203 } | 213 } |
| 204 | 214 |
| 205 bool atEndOfFile() => byteOffset >= bytes.length - 1; | 215 bool atEndOfFile() => byteOffset >= bytes.length - 1; |
| 206 } | 216 } |
| OLD | NEW |