sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart - Issue 694353007: Move dart2js from sdk/lib/_internal/compiler to pkg/compiler

Side by Side Diff: sdk/lib/_internal/compiler/implementation/scanner/utf8_bytes_scanner.dart

Issue 694353007: Move dart2js from sdk/lib/_internal/compiler to pkg/compiler (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
	(Empty)
// Copyright (c) 2013, the Dart project authors. Please see the AUTHORS file
// for details. All rights reserved. Use of this source code is governed by a
// BSD-style license that can be found in the LICENSE file.

4

5 part of scanner;

6

/**
 * Scanner that reads from a UTF-8 encoded list of bytes and creates tokens
 * that point to substrings.
 */
class Utf8BytesScanner extends ArrayBasedScanner {
  /** The file content. */
  List<int> bytes;

  /**
   * Points to the offset of the last byte returned by [advance].
   *
   * After invoking [currentAsUnicode], the [byteOffset] points to the last
   * byte that is part of the (unicode or ASCII) character. That way, [advance]
   * can always increase the byte offset by 1.
   */
  int byteOffset = -1;

  /**
   * The getter [scanOffset] is expected to return the index where the current
   * character starts. In case of a non-ascii character, after invoking
   * [currentAsUnicode], the byte offset points to the last byte.
   *
   * This field keeps track of the number of bytes for the current unicode
   * character. For example, if bytes 7,8,9 encode one unicode character, the
   * [byteOffset] is 9 (after invoking [currentAsUnicode]). The [scanSlack]
   * will be 2, so that [scanOffset] returns 7.
   */
  int scanSlack = 0;

  /**
   * Holds the [byteOffset] value for which the current [scanSlack] is valid.
   */
  int scanSlackOffset = -1;

  /**
   * Returns the byte offset of the first byte that belongs to the current
   * character.
   *
   * The [scanSlack] correction is only applied while [byteOffset] still equals
   * [scanSlackOffset]; at any other offset the slack is stale and
   * [byteOffset] is returned unchanged.
   */
  int get scanOffset {
    if (byteOffset == scanSlackOffset) {
      return byteOffset - scanSlack;
    } else {
      return byteOffset;
    }
  }

  /**
   * The difference between the number of bytes and the number of corresponding
   * string characters, up to the current [byteOffset].
   */
  int utf8Slack = 0;

  /**
   * Creates a new Utf8BytesScanner. The source file is expected to be a
   * [Utf8BytesSourceFile] that holds a list of UTF-8 bytes. Otherwise the
   * string text of the source file is decoded.
   *
   * The list of UTF-8 bytes [file.slowUtf8Bytes()] is expected to return an
   * array whose last element is '0' to signal the end of the file. If this
   * is not the case, the entire array is copied before scanning.
   *
   * A leading UTF-8 BOM is skipped by moving [byteOffset] past its 3 bytes.
   */
  Utf8BytesScanner(SourceFile file, {bool includeComments: false})
      : bytes = file.slowUtf8Bytes(),
        super(file, includeComments) {
    ensureZeroTermination();
    // Skip a leading BOM.
    if (_containsBomAt(0)) byteOffset += 3;
  }

  /**
   * Creates a new Utf8BytesScanner from a list of UTF-8 bytes.
   *
   * The last element of the list is expected to be '0' to signal the end of
   * the file. If this is not the case, the entire array is copied before
   * scanning.
   */
  // NOTE(review): unlike the default constructor, a leading BOM is not skipped
  // here - confirm whether that difference is intentional.
  Utf8BytesScanner.fromBytes(this.bytes, {bool includeComments: false})
      : super(null, includeComments) {
    ensureZeroTermination();
  }

  /**
   * Guarantees that [bytes] ends with a 0 byte.
   *
   * When [bytes] is empty or its last element is not 0, [bytes] is replaced by
   * a new [Uint8List] that is one element longer, holds a copy of the old
   * content and ends with 0. Otherwise [bytes] is left untouched.
   */
  void ensureZeroTermination() {
    if (bytes.isNotEmpty && bytes[bytes.length - 1] == 0) return;
    // TODO(lry): abort instead of copying the array, or warn?
    var newBytes = new Uint8List(bytes.length + 1);
    newBytes.setRange(0, bytes.length, bytes);
    newBytes[bytes.length] = 0;
    bytes = newBytes;
  }

  /**
   * Returns true if the three bytes starting at [offset] are the UTF-8 BOM
   * (0xEF, 0xBB, 0xBF).
   *
   * The bounds check requires at least one further byte after the BOM
   * (`offset + 3 < bytes.length`).
   */
  bool _containsBomAt(int offset) {
    const BOM_UTF8 = const [0xEF, 0xBB, 0xBF];

    return offset + 3 < bytes.length &&
        bytes[offset] == BOM_UTF8[0] &&
        bytes[offset + 1] == BOM_UTF8[1] &&
        bytes[offset + 2] == BOM_UTF8[2];
  }

  /** Moves [byteOffset] forward by one and returns the byte at that offset. */
  int advance() => bytes[++byteOffset];

  /** Returns the byte following [byteOffset] without moving [byteOffset]. */
  int peek() => bytes[byteOffset + 1];

  /**
   * Returns the unicode code point starting at the byte offset [startOffset]
   * with the byte [nextByte]. If [advance] is true the current [byteOffset]
   * is advanced to the last byte of the code point, and [utf8Slack],
   * [scanSlack] and [scanSlackOffset] are updated accordingly.
   *
   * Throws a [String] describing the offending bytes if the sequence
   * announced by [nextByte] would extend past the end of [bytes], or if the
   * decoded text is longer than two UTF-16 code units. Malformed sequences
   * that fit in the buffer are reported by `UTF8.decode` itself.
   */
  int nextCodePoint(int startOffset, int nextByte, bool advance) {
    // The number of consecutive 1s in the first byte (counted from bit 0x20
    // downwards) indicates the number of bytes, at least 2.
    int numBytes = 2;
    int bit = 0x20;
    while ((nextByte & bit) != 0) {
      numBytes++;
      bit >>= 1;
    }
    int end = startOffset + numBytes;
    if (end > bytes.length) {
      // The lead byte announces more bytes than the buffer holds (truncated
      // or invalid input). Fail with the scanner's own diagnostic before any
      // state is touched, instead of a RangeError from `sublist` with
      // [byteOffset] already moved past the end of [bytes].
      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset)}";
    }
    if (advance) {
      byteOffset = end - 1;
    }
    // TODO(lry): measurably slow, decode creates first a Utf8Decoder and a
    // _Utf8Decoder instance. Also the sublist is eagerly allocated.
    String codePoint = UTF8.decode(bytes.sublist(startOffset, end));
    if (codePoint.length == 0) {
      // The UTF-8 decoder discards leading BOM characters.
      // TODO(floitsch): don't just assume that removed characters were the
      // BOM.
      assert(_containsBomAt(startOffset));
      codePoint = new String.fromCharCode(UNICODE_BOM_CHARACTER_RUNE);
    }
    if (codePoint.length == 1) {
      if (advance) {
        // numBytes bytes collapsed into a single UTF-16 code unit.
        utf8Slack += (numBytes - 1);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
      }
      return codePoint.codeUnitAt(0);
    } else if (codePoint.length == 2) {
      if (advance) {
        // numBytes bytes collapsed into two UTF-16 code units.
        utf8Slack += (numBytes - 2);
        scanSlack = numBytes - 1;
        scanSlackOffset = byteOffset;
        stringOffsetSlackOffset = byteOffset;
      }
      // In case of a surrogate pair, return a single code point.
      return codePoint.runes.single;
    } else {
      throw "Invalid UTF-8 byte sequence: ${bytes.sublist(startOffset, end)}";
    }
  }

  /**
   * The [byteOffset] recorded by the most recent decoding call of
   * [currentAsUnicode]; used to detect a repeated invocation at the same
   * position.
   */
  int lastUnicodeOffset = -1;

  /**
   * Converts the byte [next] at the current [byteOffset] into a code point.
   *
   * Returns [next] unchanged if it is ASCII (below 128) or if
   * [currentAsUnicode] was already invoked for the current [byteOffset].
   * Otherwise the code point is decoded with [nextCodePoint], which advances
   * [byteOffset] to the last byte of the character.
   */
  int currentAsUnicode(int next) {
    if (next < 128) return next;
    // Check if currentAsUnicode was already invoked.
    if (byteOffset == lastUnicodeOffset) return next;
    int res = nextCodePoint(byteOffset, next, true);
    lastUnicodeOffset = byteOffset;
    return res;
  }

  /**
   * Decodes the bytes from [startScanOffset] up to (excluding) the current
   * [byteOffset] and adds the difference between the number of bytes and the
   * length of the decoded string to [utf8Slack].
   */
  void handleUnicode(int startScanOffset) {
    int end = byteOffset;
    // TODO(lry): this measurably slows down the scanner for files with unicode.
    String s = UTF8.decode(bytes.sublist(startScanOffset, end));
    utf8Slack += (end - startScanOffset) - s.length;
  }

  /**
   * This field remembers the byte offset of the last character decoded with
   * [nextCodePoint] that used two code units in UTF-16.
   *
   * [nextCodePoint] returns a single code point for each unicode character,
   * even if it needs two code units in UTF-16.
   *
   * For example, '\u{1d11e}' uses 4 bytes in UTF-8, and two code units in
   * UTF-16. The [utf8Slack] is therefore 2. After invoking [nextCodePoint], the
   * [byteOffset] points to the last (of 4) bytes. The [stringOffset] should
   * return the offset of the first one, which is one position more left than
   * the [utf8Slack].
   */
  int stringOffsetSlackOffset = -1;

  /**
   * Returns [byteOffset] corrected by [utf8Slack], minus one more position
   * while [byteOffset] still equals [stringOffsetSlackOffset] (see the
   * documentation of that field).
   */
  int get stringOffset {
    if (stringOffsetSlackOffset == byteOffset) {
      return byteOffset - utf8Slack - 1;
    } else {
      return byteOffset - utf8Slack;
    }
  }

  /** Returns the token that follows the `tokens` head. */
  Token firstToken() => tokens.next;

  /** Returns the current `tail` token. */
  Token previousToken() => tail;

  /**
   * Creates a [StringToken] over [bytes] from [start] to
   * `byteOffset + extraOffset`, links it after the current `tail` and makes it
   * the new `tail`.
   *
   * [info], [asciiOnly] and `tokenStart` are passed through to
   * [StringToken.fromUtf8Bytes] unchanged.
   */
  void appendSubstringToken(PrecedenceInfo info, int start, bool asciiOnly,
      [int extraOffset = 0]) {
    tail.next = new StringToken.fromUtf8Bytes(
        info, bytes, start, byteOffset + extraOffset, asciiOnly, tokenStart);
    tail = tail.next;
  }

  /**
   * Returns true once [byteOffset] has reached the last element of [bytes]
   * (or moved beyond it).
   */
  bool atEndOfFile() => byteOffset >= bytes.length - 1;
}

OLD	NEW