pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart - Issue 2644843006: Use packages dart_parser, dart_scanner, and compiler_util.

Side by Side Diff: pkg/compiler/lib/src/scanner/utf8_bytes_scanner.dart

Issue 2644843006: Use packages dart_parser, dart_scanner, and compiler_util. (Closed)

Patch Set: Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« pkg/compiler/lib/src/resolution/enum_creator.dart ('K') | « pkg/compiler/lib/src/scanner/string_scanner.dart ('k') | pkg/compiler/lib/src/serialization/modelz.dart » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
	(Empty)
1 // Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.

4

5 library dart2js.scanner.utf8;

6

7 import 'dart:convert' show UNICODE_BOM_CHARACTER_RUNE, UTF8;

8

9 import '../io/source_file.dart' show SourceFile;

10 import '../tokens/precedence.dart' show PrecedenceInfo;

11 import '../tokens/token.dart' show StringToken, Token;

12 import 'array_based_scanner.dart' show ArrayBasedScanner;

13

/**
 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
 * that point to substrings.
 */
class Utf8BytesScanner extends ArrayBasedScanner {
  /**
   * The file content.
   *
   * The content is zero-terminated.
   */
  List<int> bytes;

  /**
   * Points to the offset of the last byte returned by [advance].
   *
   * After invoking [currentAsUnicode], the [byteOffset] points to the last
   * byte that is part of the (unicode or ASCII) character. That way, [advance]
   * can always increase the byte offset by 1.
   */
  int byteOffset = -1;

  /**
   * The getter [scanOffset] is expected to return the index where the current
   * character starts. In case of a non-ascii character, after invoking
   * [currentAsUnicode], the byte offset points to the last byte.
   *
   * This field keeps track of the number of bytes for the current unicode
   * character. For example, if bytes 7,8,9 encode one unicode character, the
   * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
   * will be 2, so that [scanOffset] returns 7.
   */
  int scanSlack = 0;

  /**
   * Holds the [byteOffset] value for which the current [scanSlack] is valid.
   */
  int scanSlackOffset = -1;

  /**
   * Returns the byte offset of the first byte that belongs to the current
   * character.
   *
   * The [scanSlack] correction is only applied while [byteOffset] still
   * equals [scanSlackOffset]; once the scanner has moved on, the slack is
   * stale and [byteOffset] is returned unchanged.
   */
  int get scanOffset {
    if (byteOffset == scanSlackOffset) {
      return byteOffset - scanSlack;
    }
    return byteOffset;
  }

  /**
   * The difference between the number of bytes and the number of corresponding
   * string characters, up to the current [byteOffset].
   */
  int utf8Slack = 0;

  /**
   * Creates a new Utf8BytesScanner. The source file is expected to be a
   * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the
   * string text of the source file is decoded.
   *
   * The bytes are obtained from [file.slowUtf8ZeroTerminatedBytes()], which
   * is expected to return an array whose last element is '0' to signal the
   * end of the file. This constructor only asserts that terminator; it does
   * not copy or pad the array itself.
   *
   * A leading UTF-8 byte order mark is skipped.
   */
  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
      : bytes = file.slowUtf8ZeroTerminatedBytes(),
        super(file, includeComments) {
    assert(bytes.last == 0);
    // Skip a leading BOM.
    if (_containsBomAt(0)) byteOffset += 3;
  }

  /**
   * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
   *
   * The last element of [zeroTerminatedBytes] is expected to be '0' to signal
   * the end of the file. The list is used as is: this constructor asserts the
   * terminator but does not copy or pad the list.
   *
   * A leading UTF-8 byte order mark is skipped, exactly as the constructor
   * taking a [SourceFile] does, so that both constructors scan the same bytes
   * identically.
   */
  Utf8BytesScanner.fromBytes(List<int> zeroTerminatedBytes,
      {bool includeComments: false})
      : this.bytes = zeroTerminatedBytes,
        super(null, includeComments) {
    assert(bytes.last == 0);
    // Skip a leading BOM.
    if (_containsBomAt(0)) byteOffset += 3;
  }

  /**
   * Returns whether the three bytes starting at [offset] are the UTF-8
   * encoding of the byte order mark (0xEF 0xBB 0xBF).
   *
   * The bounds check requires one more byte after the mark; that byte always
   * exists for a mark inside the content because [bytes] is zero-terminated.
   */
  bool _containsBomAt(int offset) {
    const BOM_UTF8 = const [0xEF, 0xBB, 0xBF];

    return offset + 3 < bytes.length &&
        bytes[offset] == BOM_UTF8[0] &&
        bytes[offset + 1] == BOM_UTF8[1] &&
        bytes[offset + 2] == BOM_UTF8[2];
  }

  /**
   * Moves [byteOffset] forward by one and returns the byte at the new offset.
   */
  int advance() => bytes[++byteOffset];

  /**
   * Returns the byte following [byteOffset] without moving [byteOffset].
   */
  int peek() => bytes[byteOffset + 1];

  /**
   * Returns the unicode code point starting at the byte offset [startOffset]
   * with the byte [nextByte]. If [advance] is true the current [byteOffset]
   * is advanced to the last byte of the code point, and [utf8Slack],
   * [scanSlack], [scanSlackOffset] (and, for code points that need two UTF-16
   * code units, [stringOffsetSlackOffset]) are updated accordingly.
   *
   * Throws a string describing the bytes if they decode to more than two
   * UTF-16 code units. Malformed input may also make the decoder or the
   * sublist operation throw.
   */
  int nextCodePoint(int startOffset, int nextByte, bool advance) {
    // The number of 1s in the first byte indicate the number of bytes, at
    // least 2.
    int numBytes = 2;
    int bit = 0x20;
    while ((nextByte & bit) != 0) {
      numBytes++;
      bit >>= 1;
    }
    int end = startOffset + numBytes;
    if (advance) {
      // [byteOffset] must rest on the last byte of the character so that the
      // next call to [advance] lands on the first byte of the next one.
      byteOffset = end - 1;
    }
    // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
    String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
    if (codePoint.isEmpty) {
      // The UTF-8 decoder discards leading BOM characters.
      // TODO(floitsch): don't just assume that removed characters were the
      // BOM.
      assert(_containsBomAt(startOffset));
      codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
    }

    int numCodeUnits = codePoint.length;
    if (numCodeUnits != 1 && numCodeUnits != 2) {
      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
    }

    if (advance) {
      // The slack grows by the bytes that have no UTF-16 code unit of their
      // own: numBytes - 1 for a single code unit, numBytes - 2 for a pair.
      utf8Slack += numBytes - numCodeUnits;
      scanSlack = numBytes - 1;
      scanSlackOffset = byteOffset;
      if (numCodeUnits == 2) {
        stringOffsetSlackOffset = byteOffset;
      }
    }

    if (numCodeUnits == 1) {
      return codePoint.codeUnitAt(0);
    }
    // In case of a surrogate pair, return a single code point.
    return codePoint.runes.single;
  }

  /**
   * The [byteOffset] at which [currentAsUnicode] last decoded a code point,
   * used to detect repeated invocations for the same character.
   */
  int lastUnicodeOffset = -1;

  /**
   * Returns the unicode code point for the current character, whose first
   * byte (or already decoded value) is [next].
   *
   * Values below 128 are ASCII and are returned unchanged. If a code point
   * was already decoded at the current [byteOffset], [next] is returned as
   * is. Otherwise the code point is decoded with [nextCodePoint], which
   * advances [byteOffset] to the last byte of the character.
   */
  int currentAsUnicode(int next) {
    if (next < 128) return next;
    // Check if currentAsUnicode was already invoked.
    if (byteOffset == lastUnicodeOffset) return next;
    int res = nextCodePoint(byteOffset, next, true);
    lastUnicodeOffset = byteOffset;
    return res;
  }

  /**
   * Accounts for the non-ASCII bytes between [startScanOffset] (inclusive)
   * and the current [byteOffset] (exclusive) by adding the difference between
   * the number of bytes and the number of decoded UTF-16 code units to
   * [utf8Slack].
   */
  void handleUnicode(int startScanOffset) {
    int end = byteOffset;
    // TODO(lry): this measurably slows down the scanner for files with unicode.
    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
    utf8Slack += (end - startScanOffset) - s.length;
  }

  /**
   * This field remembers the byte offset of the last character decoded with
   * [nextCodePoint] that used two code units in UTF-16.
   *
   * [nextCodePoint] returns a single code point for each unicode character,
   * even if it needs two code units in UTF-16.
   *
   * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
   * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
   * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should
   * return the offset of the first one, which is one position more left than
   * the [utf8Slack].
   */
  int stringOffsetSlackOffset = -1;

  /**
   * Returns the offset in UTF-16 code units that corresponds to the current
   * [byteOffset].
   *
   * This is [byteOffset] minus [utf8Slack], with one extra position
   * subtracted while [byteOffset] still equals [stringOffsetSlackOffset],
   * i.e. while the current character occupies two UTF-16 code units.
   */
  int get stringOffset {
    if (stringOffsetSlackOffset == byteOffset) {
      return byteOffset - utf8Slack - 1;
    }
    return byteOffset - utf8Slack;
  }

  /**
   * Returns `tokens.next`.
   *
   * NOTE(review): `tokens` is not declared in this class; presumably it is
   * the head of the token list inherited from [ArrayBasedScanner] - confirm.
   */
  Token firstToken() => tokens.next;

  /**
   * Returns `tail`, the token most recently appended by
   * [appendSubstringToken].
   *
   * NOTE(review): `tail` is not declared in this class; presumably inherited
   * from [ArrayBasedScanner] - confirm.
   */
  Token previousToken() => tail;

  /**
   * Appends a [StringToken] with precedence [info] that is backed by [bytes]
   * from [start] up to [byteOffset] plus [extraOffset].
   *
   * [asciiOnly] is forwarded to [StringToken.fromUtf8Bytes] together with
   * `tokenStart`, which is not declared in this class.
   */
  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
      [int extraOffset = 0]) {
    tail.next = new StringToken.fromUtf8Bytes(
        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
    tail = tail.next;
  }

  /**
   * Returns whether [byteOffset] has reached (or passed) the last element of
   * [bytes], which is the zero terminator.
   */
  bool atEndOfFile() => byteOffset >= bytes.length - 1;
}

OLD	NEW