OLD | NEW |
---|---|
(Empty) | |
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file | |
2 // for details. All rights reserved. Use of this source code is governed by a | |
3 // BSD-style license that can be found in the LICENSE file. | |
4 | |
5 library yaml.scanner; | |
6 | |
7 import 'package:collection/collection.dart'; | |
8 import 'package:string_scanner/string_scanner.dart'; | |
9 import 'package:source_span/source_span.dart'; | |
10 | |
11 import 'style.dart'; | |
12 import 'token.dart'; | |
13 import 'utils.dart'; | |
14 import 'yaml_exception.dart'; | |
15 | |
16 /// A scanner that reads a string of Unicode characters and emits [Token]s. | |
17 /// | |
18 /// This is based on the libyaml scanner, available at | |
19 /// https://github.com/yaml/libyaml/blob/master/src/scanner.c. The license for | |
20 /// that is available in ../../libyaml-license.txt. | |
21 class Scanner { | |
22 static const TAB = 0x9; | |
23 static const LF = 0xA; | |
24 static const CR = 0xD; | |
25 static const SP = 0x20; | |
26 static const TILDE = 0x7E; | |
Bob Nystrom
2014/10/31 20:03:29
Move to after GRAVE_ACCENT?
nweiz
2014/11/04 22:19:37
Done.
| |
27 static const NEL = 0x85; | |
Bob Nystrom
2014/10/31 20:03:29
This one's a bit uncommon. How about moving either
nweiz
2014/11/04 22:19:38
Done.
| |
28 static const DOLLAR = 0x24; | |
29 static const LEFT_PAREN = 0x28; | |
30 static const RIGHT_PAREN = 0x29; | |
31 static const PLUS = 0x2B; | |
32 static const COMMA = 0x2C; | |
33 static const HYPHEN = 0x2D; | |
34 static const PERIOD = 0x2E; | |
35 static const QUESTION = 0x3F; | |
36 static const COLON = 0x3A; | |
37 static const SEMICOLON = 0x3B; | |
38 static const EQUALS = 0x3D; | |
39 static const LEFT_SQUARE = 0x5B; | |
40 static const RIGHT_SQUARE = 0x5D; | |
41 static const LEFT_CURLY = 0x7B; | |
42 static const RIGHT_CURLY = 0x7D; | |
43 static const HASH = 0x23; | |
44 static const AMPERSAND = 0x26; | |
45 static const ASTERISK = 0x2A; | |
46 static const EXCLAMATION = 0x21; | |
47 static const VERTICAL_BAR = 0x7C; | |
48 static const LEFT_ANGLE = 0x3C; | |
49 static const RIGHT_ANGLE = 0x3E; | |
50 static const SINGLE_QUOTE = 0x27; | |
51 static const DOUBLE_QUOTE = 0x22; | |
52 static const PERCENT = 0x25; | |
53 static const AT = 0x40; | |
54 static const GRAVE_ACCENT = 0x60; | |
55 | |
56 static const NULL = 0x0; | |
57 static const BELL = 0x7; | |
58 static const BACKSPACE = 0x8; | |
59 static const VERTICAL_TAB = 0xB; | |
60 static const FORM_FEED = 0xC; | |
61 static const ESCAPE = 0x1B; | |
62 static const SLASH = 0x2F; | |
63 static const BACKSLASH = 0x5C; | |
64 static const UNDERSCORE = 0x5F; | |
65 static const NBSP = 0xA0; | |
66 static const LINE_SEPARATOR = 0x2028; | |
67 static const PARAGRAPH_SEPARATOR = 0x2029; | |
68 static const BOM = 0xFEFF; | |
69 | |
70 static const NUMBER_0 = 0x30; | |
71 static const NUMBER_9 = 0x39; | |
72 | |
73 static const LETTER_A = 0x61; | |
74 static const LETTER_B = 0x62; | |
75 static const LETTER_E = 0x65; | |
76 static const LETTER_F = 0x66; | |
77 static const LETTER_N = 0x6E; | |
78 static const LETTER_R = 0x72; | |
79 static const LETTER_T = 0x74; | |
80 static const LETTER_U = 0x75; | |
81 static const LETTER_V = 0x76; | |
82 static const LETTER_X = 0x78; | |
83 static const LETTER_Z = 0x7A; | |
84 | |
85 static const LETTER_CAP_A = 0x41; | |
86 static const LETTER_CAP_F = 0x46; | |
87 static const LETTER_CAP_L = 0x4C; | |
88 static const LETTER_CAP_N = 0x4E; | |
89 static const LETTER_CAP_P = 0x50; | |
90 static const LETTER_CAP_U = 0x55; | |
91 static const LETTER_CAP_X = 0x58; | |
92 static const LETTER_CAP_Z = 0x5A; | |
93 | |
94 /// The underlying [SpanScanner] used to read characters from the source text. | |
95 /// | |
96 /// This is also used to track line and column information and to generate | |
97 /// [SourceSpan]s. | |
98 final SpanScanner _scanner; | |
99 | |
100 /// Whether this scanner has produced a [TokenType.STREAM_START] token | |
101 /// indicating the beginning of the YAML stream. | |
102 var _streamStartProduced = false; | |
103 | |
104 /// Whether this scanner has produced a [TokenType.STREAM_END] token | |
105 /// indicating the end of the YAML stream. | |
106 var _streamEndProduced = false; | |
107 | |
108 /// How many levels deep the scanner is in flow nesting. | |
109 var _flowLevel = 0; | |
Bob Nystrom
2014/10/31 20:03:28
Can this be inferred from _simpleKeys.length?
nweiz
2014/11/04 22:19:37
Yes, good idea.
| |
110 | |
111 /// The queue of tokens yet to be emitted. | |
112 /// | |
113 /// These are queued up in advance so that [TokenType.KEY] tokens can be | |
114 /// inserted once the scanner determines that a series of tokens represents a | |
115 /// mapping key. | |
116 final _tokens = new QueueList<Token>(); | |
117 | |
118 /// The number of tokens that have been emitted. | |
119 /// | |
120 /// This doesn't count tokens still queued in [_tokens]. | |
121 var _tokensParsed = 0; | |
Bob Nystrom
2014/10/31 20:03:28
"Parsed" -> "Scanned"?
nweiz
2014/11/04 22:19:37
Done.
| |
122 | |
123 /// Whether the next token in [_tokens] is ready to be returned. | |
124 /// | |
125 /// It might not be ready if there may still be a [TokenType.KEY] inserted | |
126 /// before it. | |
127 var _tokenAvailable = false; | |
128 | |
129 /// The stack of indent levels for the current nested block contexts. | |
130 final _indents = new List<int>(); | |
Bob Nystrom
2014/10/31 20:03:29
<int>[]
nweiz
2014/11/04 22:19:37
Done.
| |
131 | |
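132 /// The current indent level, or -1 when no indentation level has been established (the initial state, and the state [_resetIndent] restores). | |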
133 var _indent = -1; | |
Bob Nystrom
2014/10/31 20:03:27
Document what -1 means (or make a constant).
Does
nweiz
2014/11/04 22:19:38
Done.
| |
134 | |
135 /// Whether a simple key is allowed in this context. | |
136 /// | |
137 /// A simple key refers to any mapping key that doesn't have an explicit "?". | |
138 var _simpleKeyAllowed = true; | |
139 | |
140 /// The stack of potential simple keys for each level of flow nesting. | |
141 /// | |
142 /// Entries in this list may be `null`, indicating that there is no valid | |
143 /// simple key for the associated level of nesting. | |
144 /// | |
145 /// When a ":" is parsed and there's a simple key available, a [TokenType.KEY] | |
146 /// token is inserted in [_tokens] before that key's token. This allows the | |
147 /// parser to tell that the key is intended to be a mapping key. | |
148 final _simpleKeys = <_SimpleKey>[null]; | |
Bob Nystrom
2014/10/31 20:03:28
Why isn't this initially empty?
nweiz
2014/11/04 22:19:37
Because there is an initial flow level that could
| |
149 | |
150 /// Whether the scanner's currently positioned in a block-level structure (as | |
151 /// opposed to flow-level). | |
bool get _inBlockContext {
  // A flow nesting depth of zero means the scanner is at block level.
  return _flowLevel == 0;
}
153 | |
154 /// Whether the current character is a line break or the end of the source. | |
bool get _isBreakOrEnd {
  // End of input counts as a terminator just like a line break does.
  if (_scanner.isDone) return true;
  return _isBreak;
}
156 | |
157 /// Whether the current character is a line break. | |
bool get _isBreak {
  // Offset 0 is the character at the scanner's current position.
  return _isBreakAt(0);
}
159 | |
160 /// Whether the current character is whitespace or the end of the source. | |
bool get _isBlankOrEnd {
  // Offset 0 is the character at the scanner's current position.
  return _isBlankOrEndAt(0);
}
162 | |
163 /// Whether the current character is whitespace. | |
bool get _isBlank {
  // Offset 0 is the character at the scanner's current position.
  return _isBlankAt(0);
}
165 | |
166 /// Whether the current character is a valid tag name character. | |
167 /// | |
168 /// See http://yaml.org/spec/1.2/spec.html#ns-tag-name. | |
bool get _isTagChar {
  var char = _scanner.peekChar();
  if (char == null) return false;

  // The punctuation that may appear in a tag name is a fixed set, so it is
  // dispatched through a switch rather than a long chain of comparisons.
  switch (char) {
    case HYPHEN:
    case SEMICOLON:
    case SLASH:
    case COLON:
    case AT:
    case AMPERSAND:
    case EQUALS:
    case PLUS:
    case DOLLAR:
    case PERIOD:
    case TILDE:
    case QUESTION:
    case ASTERISK:
    case SINGLE_QUOTE:
    case LEFT_PAREN:
    case RIGHT_PAREN:
    case PERCENT:
      return true;
    default:
      // Everything else must be an ASCII digit or an ASCII letter.
      return (char >= NUMBER_0 && char <= NUMBER_9) ||
          (char >= LETTER_A && char <= LETTER_Z) ||
          (char >= LETTER_CAP_A && char <= LETTER_CAP_Z);
  }
}
182 | |
183 /// Whether the current character is a valid anchor name character. | |
184 /// | |
185 /// See http://yaml.org/spec/1.2/spec.html#ns-anchor-name. | |
bool get _isAnchorChar {
  if (!_isNonSpace) return false;

  // Any non-space character is acceptable except the flow indicators.
  switch (_scanner.peekChar()) {
    case COMMA:
    case LEFT_SQUARE:
    case RIGHT_SQUARE:
    case LEFT_CURLY:
    case RIGHT_CURLY:
      return false;
    default:
      return true;
  }
}
193 | |
194 /// Whether the character at the current position is a decimal digit. | |
bool get _isDigit {
  var code = _scanner.peekChar();
  // A null peek means there is no character left to classify.
  if (code == null) return false;
  return NUMBER_0 <= code && code <= NUMBER_9;
}
199 | |
200 /// Whether the character at the current position is a hexadecimal | |
201 /// digit. | |
bool get _isHex {
  var code = _scanner.peekChar();
  // A null peek means there is no character left to classify.
  if (code == null) return false;

  if (NUMBER_0 <= code && code <= NUMBER_9) return true;
  if (LETTER_A <= code && code <= LETTER_F) return true;
  return LETTER_CAP_A <= code && code <= LETTER_CAP_F;
}
209 | |
210 /// Whether the character at the current position is a plain character. | |
211 /// | |
212 /// See http://yaml.org/spec/1.2/spec.html#ns-plain-char(c). | |
bool get _isPlainChar {
  // Offset 0 is the character at the scanner's current position.
  return _isPlainCharAt(0);
}
214 | |
215 /// Whether the character at the current position is a printable character | |
216 /// other than a line break or byte-order mark. | |
217 /// | |
218 /// See http://yaml.org/spec/1.2/spec.html#nb-char. | |
bool get _isNonBreak {
  var code = _scanner.peekChar();
  if (code == null) return false;

  // Line breaks and the byte-order mark are explicitly excluded.
  if (code == LF || code == CR || code == BOM) return false;

  // Tab and NEL lie outside the ranges below but are still accepted.
  if (code == TAB || code == NEL) return true;

  return (code >= 0x00020 && code <= 0x00007E) ||
      (code >= 0x000A0 && code <= 0x00D7FF) ||
      (code >= 0x0E000 && code <= 0x00FFFD) ||
      (code >= 0x10000 && code <= 0x10FFFF);
}
237 | |
238 /// Whether the character at the current position is a printable character | |
239 /// other than whitespace. | |
240 /// | |
241 /// See http://yaml.org/spec/1.2/spec.html#ns-char. | |
bool get _isNonSpace {
  var char = _scanner.peekChar();
  if (char == null) return false;

  switch (char) {
    // SP and BOM fall inside the printable ranges below, so they have to be
    // rejected explicitly; LF and CR are listed alongside them for clarity.
    case LF:
    case CR:
    case BOM:
    case SP:
      return false;
    // NEL lies outside the ranges below but is still accepted.
    case NEL:
      return true;
    default:
      return (char >= 0x00020 && char <= 0x00007E) ||
          (char >= 0x000A0 && char <= 0x00D7FF) ||
          (char >= 0x0E000 && char <= 0x00FFFD) ||
          (char >= 0x10000 && char <= 0x10FFFF);
  }
}
252 | |
253 /// Creates a scanner that scans [source]. | |
254 /// | |
255 /// [sourceUrl] can be a String or a [Uri]. | |
256 Scanner(String source, {sourceUrl}) | |
// The wrapped SpanScanner is the only state built from the arguments; the
// other fields of this class start from their own initializers.
257 : _scanner = new SpanScanner(source, sourceUrl: sourceUrl); | |
258 | |
259 /// Consumes and returns the next token. | |
Token scan() {
  if (_streamEndProduced) throw new StateError("Out of tokens.");
  if (!_tokenAvailable) _fetchMoreTokens();

  var next = _tokens.removeFirst();
  _tokensParsed++;

  // The new head of the queue has not been vetted yet, so force a re-check
  // on the next call.
  _tokenAvailable = false;

  // Once the end-of-stream token has been handed out, further calls throw.
  var reachedEnd = next is Token && next.type == TokenType.STREAM_END;
  _streamEndProduced = reachedEnd;
  return next;
}
271 | |
272 /// Returns the next token without consuming it. | |
Token peek() {
  // After the end-of-stream token has been consumed there is nothing to peek.
  if (_streamEndProduced) return null;

  if (_tokenAvailable) return _tokens.first;

  _fetchMoreTokens();
  return _tokens.first;
}
278 | |
279 /// Ensures that [_tokens] contains at least one token which can be returned. | |
280 void _fetchMoreTokens() { | |
// Keep fetching until the token at the head of the queue is safe to hand out.
281 while (true) { | |
282 if (_tokens.isNotEmpty) { | |
// Drop potential simple keys that can no longer become keys before checking
// whether any of them still blocks the head token.
283 _staleSimpleKeys(); | |
// The head of the queue has token number [_tokensParsed]. While a potential
// simple key still refers to that number, a KEY token may yet be inserted in
// front of it, so it cannot be returned; otherwise the queue is ready.
284 if (!_simpleKeys.any((key) => | |
Bob Nystrom
2014/10/31 20:03:28
Document this.
nweiz
2014/11/04 22:19:38
Done.
| |
285 key != null && key.tokenNumber == _tokensParsed)) { | |
286 break; | |
287 } | |
288 } | |
289 | |
// Either the queue is empty or its head is still blocked: scan one more token.
290 _fetchNextToken(); | |
291 } | |
292 _tokenAvailable = true; | |
293 } | |
294 | |
295 /// The dispatcher for token fetchers. | |
void _fetchNextToken() {
  // The very first token of every stream is STREAM_START.
  if (!_streamStartProduced) {
    _fetchStreamStart();
    return;
  }

  // Skip whitespace and comments, then bring the simple-key and indentation
  // bookkeeping up to date for the new position.
  _scanToNextToken();
  _staleSimpleKeys();
  _unrollIndent(_scanner.column);

  if (_scanner.isDone) {
    _fetchStreamEnd();
    return;
  }

  // Directives and document indicators are only recognized at column 0.
  if (_scanner.column == 0) {
    if (_scanner.peekChar() == PERCENT) {
      _fetchDirective();
      return;
    }

    // "---" and "..." only count when followed by whitespace or the end of
    // the source.
    if (_isBlankOrEndAt(3)) {
      if (_scanner.matches('---')) {
        _fetchDocumentIndicator(TokenType.DOCUMENT_START);
        return;
      }

      if (_scanner.matches('...')) {
        _fetchDocumentIndicator(TokenType.DOCUMENT_END);
        return;
      }
    }
  }

  // Every arm returns, so nothing can run after this switch.
  switch (_scanner.peekChar()) {
    case LEFT_SQUARE:
      _fetchFlowCollectionStart(TokenType.FLOW_SEQUENCE_START);
      return;
    case LEFT_CURLY:
      _fetchFlowCollectionStart(TokenType.FLOW_MAPPING_START);
      return;
    case RIGHT_SQUARE:
      _fetchFlowCollectionEnd(TokenType.FLOW_SEQUENCE_END);
      return;
    case RIGHT_CURLY:
      _fetchFlowCollectionEnd(TokenType.FLOW_MAPPING_END);
      return;
    case COMMA:
      _fetchFlowEntry();
      return;
    case ASTERISK:
      _fetchAnchor(anchor: false);
      return;
    case AMPERSAND:
      _fetchAnchor(anchor: true);
      return;
    case EXCLAMATION:
      _fetchTag();
      return;
    case SINGLE_QUOTE:
      _fetchFlowScalar(singleQuote: true);
      return;
    case DOUBLE_QUOTE:
      _fetchFlowScalar(singleQuote: false);
      return;
    case VERTICAL_BAR:
      // Block scalars are rejected outside the block context.
      if (!_inBlockContext) _invalidScalarCharacter();
      _fetchBlockScalar(literal: true);
      return;
    case RIGHT_ANGLE:
      // Block scalars are rejected outside the block context.
      if (!_inBlockContext) _invalidScalarCharacter();
      _fetchBlockScalar(literal: false);
      return;
    case PERCENT:
    case AT:
    case GRAVE_ACCENT:
      _invalidScalarCharacter();
      return;

    // These characters may sometimes begin plain scalars.
    case HYPHEN:
      if (_isPlainCharAt(1)) {
        _fetchPlainScalar();
      } else {
        _fetchBlockEntry();
      }
      return;
    case QUESTION:
      if (_isPlainCharAt(1)) {
        _fetchPlainScalar();
      } else {
        _fetchKey();
      }
      return;
    case COLON:
      if (!_inBlockContext && _tokens.isNotEmpty) {
        // If a colon follows a "JSON-like" value (an explicit map or list, or
        // a quoted string) it isn't required to have whitespace after it
        // since it unambiguously describes a map.
        var token = _tokens.last;
        if (token.type == TokenType.FLOW_SEQUENCE_END ||
            token.type == TokenType.FLOW_MAPPING_END ||
            (token.type == TokenType.SCALAR && token.style.isQuoted)) {
          _fetchValue();
          return;
        }
      }

      if (_isPlainCharAt(1)) {
        _fetchPlainScalar();
      } else {
        _fetchValue();
      }
      return;
    default:
      if (!_isNonBreak) _invalidScalarCharacter();

      _fetchPlainScalar();
      return;
  }
}
415 | |
416 /// Throws an error about a disallowed character. | |
void _invalidScalarCharacter() {
  // The reported span covers just the single offending character.
  _scanner.error("Unexpected character.", length: 1);
}
419 | |
420 /// Checks the list of potential simple keys and removes the positions that | |
421 /// cannot contain simple keys anymore. | |
void _staleSimpleKeys() {
  // libyaml requires that all simple keys be a single line and no longer
  // than 1024 characters. However, in section 7.4.2 of the spec
  // (http://yaml.org/spec/1.2/spec.html#id2790832), these restrictions are
  // only applied when the curly braces are omitted. It's difficult to
  // retain enough context to know which keys need to have the restriction
  // placed on them, so for now we go the other direction and allow
  // everything but multiline simple keys in a block context.
  //
  // The context cannot change while this method runs, so it is tested once
  // up front rather than once per key.
  if (!_inBlockContext) return;

  for (var i = 0; i < _simpleKeys.length; i++) {
    var key = _simpleKeys[i];
    if (key == null) continue;

    // A key that started on the current line may still be completed.
    if (key.location.line == _scanner.line) continue;

    // The scanner has moved to another line without seeing the ':' that a
    // required key needs.
    if (key.required) {
      throw new YamlException("Expected ':'.", _scanner.emptySpan);
    }

    _simpleKeys[i] = null;
  }
}
445 | |
446 /// Checks if a simple key may start at the current position and saves it if | |
447 /// so. | |
void _saveSimpleKey() {
  // In the block context, a token sitting exactly at the current indentation
  // level has to be a simple key.
  var mustBeKey = _inBlockContext && _indent == _scanner.column;

  // A required key only occurs where simple keys are allowed; this assert
  // double-checks that expectation.
  assert(_simpleKeyAllowed || !mustBeKey);

  if (!_simpleKeyAllowed) return;

  // Replace whatever candidate was recorded for this flow level with one
  // pointing at the token that is about to be queued.
  _removeSimpleKey();
  var upcomingTokenNumber = _tokensParsed + _tokens.length;
  _simpleKeys[_simpleKeys.length - 1] = new _SimpleKey(
      upcomingTokenNumber,
      _scanner.location,
      required: mustBeKey);
}
467 | |
468 /// Removes a potential simple key at the current flow level. | |
void _removeSimpleKey() {
  var current = _simpleKeys.last;

  // Discarding a required key means its ':' never showed up.
  var missingColon = current != null && current.required;
  if (missingColon) {
    throw new YamlException("Could not find expected ':' for simple key.",
        current.location.pointSpan());
  }

  _simpleKeys[_simpleKeys.length - 1] = null;
}
478 | |
479 /// Increases the flow level and resizes the simple key list. | |
void _increaseFlowLevel() {
  // Each flow level gets its own slot for a potential simple key, initially
  // empty.
  _simpleKeys.add(null);
  _flowLevel += 1;
}
484 | |
485 /// Decreases the flow level. | |
void _decreaseFlowLevel() {
  // There is nothing to pop when no flow collection is open.
  if (!_inBlockContext) {
    _simpleKeys.removeLast();
    _flowLevel -= 1;
  }
}
491 | |
492 /// Pushes the current indentation level to the stack and sets the new level if | |
Bob Nystrom
2014/10/31 20:03:28
Long line.
nweiz
2014/11/04 22:19:36
Done.
| |
493 /// [column] is greater than [_indent]. | |
494 /// | |
495 /// If it is, appends or inserts the specified token into [_tokens]. If | |
Bob Nystrom
2014/10/31 20:03:27
"it is"?
nweiz
2014/11/04 22:19:37
Done.
| |
496 /// [tokenNumber] is provided, the token will be inserted at that token number; | |
497 /// otherwise, the token will be added at the end. | |
void _rollIndent(int column, TokenType type, SourceLocation location,
    {int tokenNumber}) {
  if (!_inBlockContext) return;

  // Only a column deeper than the current indent (or any column when the
  // indent is still -1) opens a new level.
  var opensNewLevel = _indent == -1 || _indent < column;
  if (!opensNewLevel) return;

  // Remember the enclosing level so [_unrollIndent] can restore it later.
  _indents.add(_indent);
  _indent = column;

  var startToken = new Token(type, location.pointSpan());
  if (tokenNumber != null) {
    // Token numbers are absolute; subtracting the emitted count gives the
    // index within the queue.
    _tokens.insert(tokenNumber - _tokensParsed, startToken);
  } else {
    _tokens.add(startToken);
  }
}
516 | |
517 /// Pops indentation levels from [_indents] until the current level becomes | |
518 /// less than or equal to [column]. | |
519 /// | |
520 /// For each indentation level, appends a [TokenType.BLOCK_END] token. | |
void _unrollIndent(int column) {
  if (_inBlockContext) {
    // Close one block per indentation level deeper than [column].
    while (column < _indent) {
      var blockEnd = new Token(TokenType.BLOCK_END, _scanner.emptySpan);
      _tokens.add(blockEnd);
      _indent = _indents.removeLast();
    }
  }
}
529 | |
530 /// Pops indentation levels from [_indents] until the current level resets to | |
531 /// -1. | |
532 /// | |
533 /// For each indentation level, appends a [TokenType.BLOCK_END] token. | |
void _resetIndent() {
  // Unrolling to -1 pops every level, since -1 is the initial indent value.
  _unrollIndent(-1);
}
535 | |
536 /// Produces a [TokenType.STREAM_START] token. | |
void _fetchStreamStart() {
  // Much of libyaml's initialization logic here is done in variable
  // initializers instead, so only the flag and the token remain.
  _streamStartProduced = true;

  var streamStart = new Token(TokenType.STREAM_START, _scanner.emptySpan);
  _tokens.add(streamStart);
}
543 | |
544 /// Produces a [TokenType.STREAM_END] token. | |
void _fetchStreamEnd() {
  // Close every open block and drop any pending simple key before the final
  // token is queued.
  _resetIndent();
  _removeSimpleKey();
  _simpleKeyAllowed = false;

  var streamEnd = new Token(TokenType.STREAM_END, _scanner.emptySpan);
  _tokens.add(streamEnd);
}
551 | |
552 /// Produces a [TokenType.VERSION_DIRECTIVE] or [TokenType.TAG_DIRECTIVE] | |
553 /// token. | |
void _fetchDirective() {
  // Close every open block and drop any pending simple key first.
  _resetIndent();
  _removeSimpleKey();
  _simpleKeyAllowed = false;

  // [_scanDirective] returns null for directives it doesn't recognize; those
  // produce no token.
  var scanned = _scanDirective();
  if (scanned == null) return;
  _tokens.add(scanned);
}
561 | |
562 /// Produces a [TokenType.DOCUMENT_START] or [TokenType.DOCUMENT_END] token. | |
563 void _fetchDocumentIndicator(TokenType type) { | |
// Close every open block, drop any pending simple key, and forbid a simple
// key from starting right after the indicator.
564 _resetIndent(); | |
565 _removeSimpleKey(); | |
566 _simpleKeyAllowed = false; | |
Bob Nystrom
2014/10/31 20:03:28
Hoist these three lines into a _resetState() metho
nweiz
2014/11/04 22:19:36
I'd rather have the visual similarity with the met
| |
567 | |
568 // Consume the indicator token. | |
// The state is captured first so the token's span covers all three characters.
569 var start = _scanner.state; | |
570 _scanner.readChar(); | |
571 _scanner.readChar(); | |
572 _scanner.readChar(); | |
573 | |
574 _tokens.add(new Token(type, _scanner.spanFrom(start))); | |
575 } | |
576 | |
577 /// Produces a [TokenType.FLOW_SEQUENCE_START] or | |
578 /// [TokenType.FLOW_MAPPING_START] token. | |
void _fetchFlowCollectionStart(TokenType type) {
  // The collection itself may turn out to be a simple key, so record it
  // before entering the new flow level.
  _saveSimpleKey();
  _increaseFlowLevel();

  // A simple key may begin immediately after the opening bracket.
  _simpleKeyAllowed = true;

  // Emit a one-character token for the bracket.
  var start = _scanner.state;
  _scanner.readChar();
  _tokens.add(new Token(type, _scanner.spanFrom(start)));
}
585 | |
586 /// Produces a [TokenType.FLOW_SEQUENCE_END] or [TokenType.FLOW_MAPPING_END] | |
587 /// token. | |
void _fetchFlowCollectionEnd(TokenType type) {
  // Drop the candidate key of the level being closed, then leave that level.
  _removeSimpleKey();
  _decreaseFlowLevel();

  // No simple key may begin immediately after the closing bracket.
  _simpleKeyAllowed = false;

  // Emit a one-character token for the bracket.
  var start = _scanner.state;
  _scanner.readChar();
  _tokens.add(new Token(type, _scanner.spanFrom(start)));
}
594 | |
595 /// Produces a [TokenType.FLOW_ENTRY] token. | |
void _fetchFlowEntry() {
  // A "," ends the current entry: drop its candidate key and allow a new one
  // to start.
  _removeSimpleKey();
  _simpleKeyAllowed = true;

  // Emit a one-character token for the comma.
  var start = _scanner.state;
  _scanner.readChar();
  _tokens.add(new Token(TokenType.FLOW_ENTRY, _scanner.spanFrom(start)));
}
601 | |
602 /// Produces a [TokenType.BLOCK_ENTRY] token. | |
603 void _fetchBlockEntry() { | |
604 if (_inBlockContext) { | |
// The same flag that gates simple keys also gates where a "-" entry may start.
605 if (!_simpleKeyAllowed) { | |
606 throw new YamlException( | |
607 "Block sequence entries are not allowed in this context.", | |
Bob Nystrom
2014/10/31 20:03:29
Would be good to describe the context instead of j
nweiz
2014/11/04 22:19:37
That's pretty tough... we'd have to track the reas
| |
608 _scanner.emptySpan); | |
609 } | |
610 | |
// Queues BLOCK_SEQUENCE_START only if this column is deeper than the current
// indent; otherwise [_rollIndent] does nothing.
611 _rollIndent( | |
612 _scanner.column, | |
613 TokenType.BLOCK_SEQUENCE_START, | |
614 _scanner.emptySpan.start); | |
615 } else { | |
616 // It is an error for the '-' indicator to occur in the flow context, but | |
617 // we let the Parser detect and report it because it's able to point to | |
618 // the context. | |
619 } | |
620 | |
// The indicator ends any pending simple key and lets a new one start after it.
621 _removeSimpleKey(); | |
622 _simpleKeyAllowed = true; | |
623 _addCharToken(TokenType.BLOCK_ENTRY); | |
624 } | |
625 | |
626 /// Produces the [TokenType.KEY] token. | |
627 void _fetchKey() { | |
628 if (_inBlockContext) { | |
// The same flag that gates simple keys also gates where an explicit "?" key
// may start.
629 if (!_simpleKeyAllowed) { | |
630 throw new YamlException("Mapping keys are not allowed in this context.", | |
Bob Nystrom
2014/10/31 20:03:28
Ditto.
| |
631 _scanner.emptySpan); | |
632 } | |
633 | |
// Queues BLOCK_MAPPING_START only if this column is deeper than the current
// indent; otherwise [_rollIndent] does nothing.
634 _rollIndent( | |
635 _scanner.column, | |
636 TokenType.BLOCK_MAPPING_START, | |
637 _scanner.emptySpan.start); | |
638 } | |
639 | |
640 // Simple keys are allowed after `?` in a block context. | |
641 _simpleKeyAllowed = _inBlockContext; | |
642 _addCharToken(TokenType.KEY); | |
643 } | |
644 | |
645 /// Produces the [TokenType.VALUE] token. | |
void _fetchValue() {
  var pendingKey = _simpleKeys.last;
  if (pendingKey == null) {
    if (_inBlockContext) {
      // If we're here, we've found the ':' indicator following a complex key.
      if (!_simpleKeyAllowed) {
        throw new YamlException(
            "Mapping values are not allowed in this context.",
            _scanner.emptySpan);
      }

      _rollIndent(
          _scanner.column,
          TokenType.BLOCK_MAPPING_START,
          _scanner.location);
      _simpleKeyAllowed = true;
    } else if (_simpleKeyAllowed) {
      // If we're here, we've found the ':' indicator with an empty key. This
      // behavior differs from libyaml, which disallows empty implicit keys.
      //
      // NOTE(review): [_addCharToken] consumes a character, so the KEY token
      // here advances past the ':' and the VALUE token below then consumes
      // the character after it. Confirm that this is intended.
      _simpleKeyAllowed = false;
      _addCharToken(TokenType.KEY);
    }
  } else {
    // Add a [TokenType.KEY] directive before the first token of the simple
    // key so the parser knows that it's part of a key/value pair.
    var keyToken = new Token(TokenType.KEY, pendingKey.location.pointSpan());
    _tokens.insert(pendingKey.tokenNumber - _tokensParsed, keyToken);

    // In the block context, we may need to add the
    // [TokenType.BLOCK_MAPPING_START] token.
    _rollIndent(
        pendingKey.location.column,
        TokenType.BLOCK_MAPPING_START,
        pendingKey.location,
        tokenNumber: pendingKey.tokenNumber);

    // The key has been used up.
    _simpleKeys[_simpleKeys.length - 1] = null;

    // A simple key cannot follow another simple key.
    _simpleKeyAllowed = false;
  }

  _addCharToken(TokenType.VALUE);
}
690 | |
691 /// Adds a token with [type] to [_tokens]. | |
692 /// | |
693 /// The span of the new token is the current character. | |
void _addCharToken(TokenType type) {
  // Capture the position before reading so the span covers exactly the
  // character that is consumed.
  var before = _scanner.state;
  _scanner.readChar();

  var span = _scanner.spanFrom(before);
  _tokens.add(new Token(type, span));
}
699 | |
700 /// Produces a [TokenType.ALIAS] or [TokenType.ANCHOR] token. | |
void _fetchAnchor({bool anchor: true}) {
  // The token may begin a simple key, but no further key may start right
  // after it.
  _saveSimpleKey();
  _simpleKeyAllowed = false;

  var scanned = _scanAnchor(anchor: anchor);
  _tokens.add(scanned);
}
706 | |
707 /// Produces a [TokenType.TAG] token. | |
void _fetchTag() {
  // The token may begin a simple key, but no further key may start right
  // after it.
  _saveSimpleKey();
  _simpleKeyAllowed = false;

  var scanned = _scanTag();
  _tokens.add(scanned);
}
713 | |
714 /// Produces a [TokenType.SCALAR] token with style [ScalarStyle.LITERAL] or | |
715 /// [ScalarStyle.FOLDED]. | |
void _fetchBlockScalar({bool literal: false}) {
  // A block scalar is never recorded as a simple key; once it has been
  // scanned a new key is allowed again.
  _removeSimpleKey();
  _simpleKeyAllowed = true;

  var scanned = _scanBlockScalar(literal: literal);
  _tokens.add(scanned);
}
721 | |
722 /// Produces a [TokenType.SCALAR] token with style [ScalarStyle.SINGLE_QUOTED] | |
723 /// or [ScalarStyle.DOUBLE_QUOTED]. | |
void _fetchFlowScalar({bool singleQuote: false}) {
  // The token may begin a simple key, but no further key may start right
  // after it.
  _saveSimpleKey();
  _simpleKeyAllowed = false;

  var scanned = _scanFlowScalar(singleQuote: singleQuote);
  _tokens.add(scanned);
}
729 | |
730 /// Produces a [TokenType.SCALAR] token with style [ScalarStyle.PLAIN]. | |
void _fetchPlainScalar() {
  // The token may begin a simple key, but no further key may start right
  // after it.
  _saveSimpleKey();
  _simpleKeyAllowed = false;

  var scanned = _scanPlainScalar();
  _tokens.add(scanned);
}
736 | |
737 /// Eats whitespace and comments until the next token is found. | |
void _scanToNextToken() {
  var sawLineBreak = false;
  for (;;) {
    // Allow the BOM to start a line.
    if (_scanner.column == 0) _scanner.scan("\uFEFF");

    // Eat whitespace.
    //
    // libyaml disallows tabs after "-", "?", or ":", but the spec allows
    // them. See section 6.2: http://yaml.org/spec/1.2/spec.html#id2778241.
    //
    // Tabs are skipped everywhere except in the block context once a line
    // break has been crossed. Neither condition changes while whitespace is
    // being consumed, so the rule is computed once per pass.
    var tabsSkippable = !_inBlockContext || !sawLineBreak;
    var next = _scanner.peekChar();
    while (next == SP || (tabsSkippable && next == TAB)) {
      _scanner.readChar();
      next = _scanner.peekChar();
    }

    // A tab that survived the loop above is being used as indentation.
    if (next == TAB) {
      _scanner.error("Tab characters are not allowed as indentation.",
          length: 1);
    }

    // Eat a comment until a line break.
    if (next == HASH) {
      while (!_isBreakOrEnd) {
        _scanner.readChar();
      }
    }

    // Anything other than a line break here is the start of a token.
    if (!_isBreak) break;

    _skipLine();

    // In the block context, a new line may start a simple key.
    if (_inBlockContext) _simpleKeyAllowed = true;
    sawLineBreak = true;
  }
}
779 | |
780 /// Scans a [TokenType.VERSION_DIRECTIVE] or [TokenType.TAG_DIRECTIVE] token. | |
781 /// | |
782 /// %YAML 1.2 # a comment \n | |
783 /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
784 /// %TAG !yaml! tag:yaml.org,2002: \n | |
785 /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
786 Token _scanDirective() { | |
// Captured before the '%' is consumed so spans built from it cover the whole
// directive.
787 var start = _scanner.state; | |
788 | |
789 // Eat '%'. | |
790 _scanner.readChar(); | |
791 | |
792 var token; | |
793 var name = _scanDirectiveName(); | |
// Only the "YAML" and "TAG" directive names are recognized; anything else is
// skipped with a warning.
794 if (name == "YAML") { | |
795 token = _scanVersionDirectiveValue(start); | |
796 } else if (name == "TAG") { | |
797 token = _scanTagDirectiveValue(start); | |
798 } else { | |
799 warn("Warning: unknown directive.", _scanner.spanFrom(start)); | |
Bob Nystrom
2014/10/31 20:03:27
I don't think the parser should output directly to
nweiz
2014/11/04 22:19:37
Done. I wish there were a more standard way to do
| |
800 | |
801 // libyaml doesn't support unknown directives, but the spec says to ignore | |
802 // them and warn: http://yaml.org/spec/1.2/spec.html#id2781147. | |
803 while (!_isBreakOrEnd) { | |
804 _scanner.readChar(); | |
805 } | |
806 | |
// No token is produced for an unknown directive. The line break, if any, is
// left unconsumed on this path.
807 return null; | |
808 } | |
809 | |
810 // Eat the rest of the line, including any comments. | |
811 while (_isBlank) { | |
812 _scanner.readChar(); | |
813 } | |
Bob Nystrom
2014/10/31 20:03:28
Make a _skipBlanks() method for this since you do
nweiz
2014/11/04 22:19:37
Done.
| |
814 | |
815 if (_scanner.peekChar() == HASH) { | |
816 while (!_isBreakOrEnd) { | |
817 _scanner.readChar(); | |
818 } | |
819 } | |
Bob Nystrom
2014/10/31 20:03:28
Probably this too.
nweiz
2014/11/04 22:19:36
Done.
| |
820 | |
// Anything left on the line at this point is neither whitespace nor a comment.
821 if (!_isBreakOrEnd) { | |
822 throw new YamlException( | |
823 "Expected comment or line break after directive.", | |
824 _scanner.spanFrom(start)); | |
825 } | |
826 | |
827 if (_isBreak) _skipLine(); | |
Bob Nystrom
2014/10/31 20:03:27
Do you need to check _isBreak here? Doesn't _skipL
nweiz
2014/11/04 22:19:36
Done.
| |
828 return token; | |
829 } | |
830 | |
831 /// Scans a directive name. | |
832 /// | |
833 /// %YAML 1.2 # a comment \n | |
834 /// ^^^^ | |
835 /// %TAG !yaml! tag:yaml.org,2002: \n | |
836 /// ^^^ | |
String _scanDirectiveName() {
  // libyaml only allows word characters in directive names, but the spec
  // disagrees: http://yaml.org/spec/1.2/spec.html#ns-directive-name.
  //
  // The name is a contiguous run of the source, so it is sliced out of the
  // source string instead of being rebuilt one code unit at a time.
  var start = _scanner.position;
  while (_isNonSpace) {
    _scanner.readChar();
  }
  var name = _scanner.string.substring(start, _scanner.position);

  if (name.isEmpty) {
    throw new YamlException("Expected directive name.", _scanner.emptySpan);
  }

  // The name has to be followed by whitespace or the end of the source.
  if (!_isBlankOrEnd) {
    throw new YamlException(
        "Unexpected character in directive name.", _scanner.emptySpan);
  }

  return name;
}
855 | |
/// Scans the value of a version directive.
///
///      %YAML   1.2     # a comment \n
///           ^^^^^^
///
/// [start] is the scanner state at the beginning of the directive; the
/// returned token's span runs from there to the end of the version.
Token _scanVersionDirectiveValue(LineScannerState start) {
  // Skip the blanks that separate the directive name from the version.
  while (_isBlank) _scanner.readChar();

  // The version has the form `<major>.<minor>`.
  var majorVersion = _scanVersionDirectiveNumber();
  _scanner.expect('.');
  var minorVersion = _scanVersionDirectiveNumber();

  var span = _scanner.spanFrom(start);
  return new VersionDirectiveToken(span, majorVersion, minorVersion);
}
871 | |
/// Scans the version number of a version directive.
///
///      %YAML   1.2     # a comment \n
///              ^
///      %YAML   1.2     # a comment \n
///                ^
///
/// Returns the parsed number. Throws a [YamlException] if there is no digit
/// at the current position.
int _scanVersionDirectiveNumber() {
  var start = _scanner.position;
  while (_isDigit) {
    _scanner.readChar();
  }

  // Take the digits as a single slice of the source instead of accumulating
  // them character by character. [StringScanner.substring] ends at the
  // current position by default.
  var number = _scanner.substring(start);
  if (number.isEmpty) {
    throw new YamlException("Expected version number.", _scanner.emptySpan);
  }

  return int.parse(number);
}
891 | |
/// Scans the value of a tag directive.
///
///      %TAG    !yaml!  tag:yaml.org,2002:  \n
///          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
///
/// [start] is the scanner state at the beginning of the directive; the
/// returned token's span runs from there to the end of the prefix.
Token _scanTagDirectiveValue(LineScannerState start) {
  // Skip the blanks between the directive name and the handle.
  while (_isBlank) _scanner.readChar();

  var tagHandle = _scanTagHandle(directive: true);

  // The handle and the prefix must be separated by at least one blank.
  if (!_isBlank) {
    throw new YamlException("Expected whitespace.", _scanner.emptySpan);
  }
  while (_isBlank) _scanner.readChar();

  var tagPrefix = _scanTagUri();
  if (_isBlankOrEnd) {
    return new TagDirectiveToken(
        _scanner.spanFrom(start), tagHandle, tagPrefix);
  }

  throw new YamlException("Expected whitespace.", _scanner.emptySpan);
}
917 | |
/// Scans an anchor or alias token.
///
/// Both forms consist of a single indicator character followed by a name.
/// Returns an [AnchorToken] when [anchor] is true and an [AliasToken]
/// otherwise.
///
/// Throws a [YamlException] if the name is empty or is followed by a
/// character that may not come after an anchor.
Token _scanAnchor({bool anchor: true}) {
  var start = _scanner.state;

  // Eat the indicator character.
  _scanner.readChar();

  // libyaml only allows word characters in anchor names, but the spec
  // disagrees: http://yaml.org/spec/1.2/spec.html#ns-anchor-char.
  var nameStart = _scanner.position;
  while (_isAnchorChar) {
    _scanner.readChar();
  }

  // Slice the name out of the source rather than copying it one character at
  // a time. [StringScanner.substring] ends at the current position by
  // default.
  var name = _scanner.substring(nameStart);

  // The name must be non-empty and be followed either by a blank, the end of
  // the input, or one of the characters explicitly allowed below.
  var next = _scanner.peekChar();
  if (name.isEmpty ||
      (!_isBlankOrEnd && next != QUESTION && next != COLON &&
          next != COMMA && next != RIGHT_SQUARE && next != RIGHT_CURLY &&
          next != PERCENT && next != AT && next != GRAVE_ACCENT)) {
    throw new YamlException("Expected alphanumeric character.",
        _scanner.emptySpan);
  }

  if (anchor) {
    return new AnchorToken(_scanner.spanFrom(start), name);
  } else {
    return new AliasToken(_scanner.spanFrom(start), name);
  }
}
947 | |
/// Scans a [TokenType.TAG] token.
///
/// Three shapes are recognized: the canonical `!<uri>` form (empty handle),
/// the `!handle!suffix` form, and the `!suffix` form (handle `!`). A bare `!`
/// yields a null handle with the suffix `!`.
Token _scanTag() {
  var start = _scanner.state;

  // Canonical form. Per the review discussion, peekChar returns null for an
  // out-of-range offset, so a lone '!' at the end of the input simply falls
  // through to the shorthand handling below.
  if (_scanner.peekChar(1) == LEFT_ANGLE) {
    // Eat '!<'.
    _scanner.readChar();
    _scanner.readChar();

    var verbatim = _scanTagUri();
    _scanner.expect('>');

    // libyaml insists on whitespace after a tag, but example 7.2 indicates
    // that it's not required: http://yaml.org/spec/1.2/spec.html#id2786720.
    return new TagToken(_scanner.spanFrom(start), '', verbatim);
  }

  // The tag has either the '!suffix' or the '!handle!suffix' form, so first
  // try to scan a handle.
  var scanned = _scanTagHandle();
  var isExplicitHandle =
      scanned.length > 1 && scanned.startsWith('!') && scanned.endsWith('!');

  if (isExplicitHandle) {
    var explicitSuffix = _scanTagUri(flowSeparators: false);
    return new TagToken(_scanner.spanFrom(start), scanned, explicitSuffix);
  }

  // There was no explicit handle: what was scanned is really the beginning
  // of the suffix.
  var suffix = _scanTagUri(head: scanned, flowSeparators: false);
  if (suffix.isEmpty) {
    // This is the special '!' tag.
    return new TagToken(_scanner.spanFrom(start), null, '!');
  }

  return new TagToken(_scanner.spanFrom(start), '!', suffix);
}
991 | |
/// Scans a tag handle.
///
/// The result always begins with `!`. It ends with a second `!` only when
/// one follows the scanned tag characters. When [directive] is true, i.e.
/// the handle belongs to a %TAG directive, that closing `!` is mandatory
/// unless the handle is exactly `!`.
String _scanTagHandle({bool directive: false}) {
  _scanner.expect('!');

  var handle = new StringBuffer('!');

  // libyaml only allows word characters in tags, but the spec disagrees:
  // http://yaml.org/spec/1.2/spec.html#ns-tag-char.
  while (_isTagChar) handle.writeCharCode(_scanner.readChar());

  if (_scanner.peekChar() == EXCLAMATION) {
    handle.writeCharCode(_scanner.readChar());
  } else if (directive && handle.toString() != '!') {
    // It's either the '!' tag or not really a tag handle. If it's a %TAG
    // directive, it's an error. If it's a tag token, it must be part of a
    // URI.
    _scanner.expect('!');
  }

  return handle.toString();
}
1015 | |
/// Scans a tag URI.
///
/// [head] is the initial portion of the tag that's already been scanned; its
/// leading '!' is not included in the result. [flowSeparators] indicates
/// whether the tag URI can contain flow separators.
///
/// Returns the percent-decoded URI text.
String _scanTagUri({String head, bool flowSeparators: true}) {
  var uri = new StringBuffer();

  // Copy the head if needed, dropping its leading '!' character.
  if (head != null && head.length > 1) uri.write(head.substring(1));

  // The set of characters that may appear in URI is as follows:
  //
  //      '0'-'9', 'A'-'Z', 'a'-'z', '_', '-', ';', '/', '?', ':', '@', '&',
  //      '=', '+', '$', ',', '.', '!', '~', '*', '\'', '(', ')', '[', ']',
  //      '%'.
  //
  // In a shorthand tag annotation, the flow separators ',', '[', and ']' are
  // disallowed.
  while (true) {
    var next = _scanner.peekChar();
    var isFlowSeparator =
        next == COMMA || next == LEFT_SQUARE || next == RIGHT_SQUARE;
    if (!_isTagChar && !(flowSeparators && isFlowSeparator)) break;

    uri.writeCharCode(_scanner.readChar());
  }

  // libyaml manually decodes the URL, but we don't have to do that.
  return Uri.decodeFull(uri.toString());
}
1048 | |
/// Scans a block scalar.
///
/// The returned [ScalarToken] has style [ScalarStyle.LITERAL] when [literal]
/// is true and [ScalarStyle.FOLDED] otherwise; line folding is only applied
/// in the folded case.
///
/// Throws a [YamlException] if the header uses `0` as its indentation
/// indicator, or if the header is followed by anything other than a comment
/// or a line break.
Token _scanBlockScalar({bool literal: false}) {
  var start = _scanner.state;

  // Eat the indicator '|' or '>'.
  _scanner.readChar();

  // Check for a chomping indicator.
  var chomping = _Chomping.CLIP;
  var increment = 0;
  var char = _scanner.peekChar();
  if (char == PLUS || char == HYPHEN) {
    chomping = char == PLUS ? _Chomping.KEEP : _Chomping.STRIP;
    _scanner.readChar();

    // Check for an indentation indicator.
    if (_isDigit) {
      // Check that the indentation is greater than 0. The peeked value is a
      // character code, so it has to be compared against the code for '0'
      // rather than the integer zero.
      if (_scanner.peekChar() == NUMBER_0) {
        throw new YamlException(
            "0 may not be used as an indentation indicator.",
            _scanner.spanFrom(start));
      }

      increment = _scanner.readChar() - NUMBER_0;
    }
  } else if (_isDigit) {
    // Do the same as above, but in the opposite order.
    if (_scanner.peekChar() == NUMBER_0) {
      throw new YamlException(
          "0 may not be used as an indentation indicator.",
          _scanner.spanFrom(start));
    }

    increment = _scanner.readChar() - NUMBER_0;

    char = _scanner.peekChar();
    if (char == PLUS || char == HYPHEN) {
      chomping = char == PLUS ? _Chomping.KEEP : _Chomping.STRIP;
      _scanner.readChar();
    }
  }

  // Eat whitespace and comments to the end of the line.
  while (_isBlank) {
    _scanner.readChar();
  }

  if (_scanner.peekChar() == HASH) {
    while (!_isBreakOrEnd) {
      _scanner.readChar();
    }
  }

  // Check if we're at the end of the line.
  if (!_isBreakOrEnd) {
    throw new YamlException("Expected comment or line break.",
        _scanner.emptySpan);
  }

  if (_isBreak) _skipLine();

  // If the header carried an explicit indentation indicator, the content
  // indentation is that increment added to the current indentation level
  // (or the increment alone when the current level is negative). Otherwise
  // indent stays 0, which makes [_scanBlockScalarBreaks] detect it from the
  // content.
  var indent = 0;
  if (increment != 0) {
    indent = _indent >= 0 ? _indent + increment : increment;
  }

  // Scan the leading line breaks to determine the indentation level if
  // needed.
  var pair = _scanBlockScalarBreaks(indent);
  indent = pair.first;
  var trailingBreaks = pair.last;

  // Scan the block scalar contents.
  var buffer = new StringBuffer();
  var leadingBreak = '';
  var leadingBlank = false;
  var trailingBlank = false;
  while (_scanner.column == indent && !_scanner.isDone) {
    // Check for a document indicator. libyaml doesn't do this, but the spec
    // mandates it. See example 9.5:
    // http://yaml.org/spec/1.2/spec.html#id2801606.
    if (_scanner.column == 0 && _isBlankOrEndAt(3) &&
        (_scanner.matches('---') || _scanner.matches('...'))) {
      break;
    }

    // We are at the beginning of a non-empty line.

    // Is there trailing whitespace?
    trailingBlank = _isBlank;

    // Check if we need to fold the leading line break.
    if (!literal && leadingBreak.isNotEmpty && !leadingBlank &&
        !trailingBlank) {
      // Do we need to join the lines with a space?
      if (trailingBreaks.isEmpty) buffer.writeCharCode(SP);
    } else {
      buffer.write(leadingBreak);
    }

    // Either way, the leading break has now been dealt with.
    leadingBreak = '';

    // Append the remaining line breaks.
    buffer.write(trailingBreaks);

    // Is there leading whitespace?
    leadingBlank = _isBlank;

    while (!_isBreakOrEnd) {
      buffer.writeCharCode(_scanner.readChar());
    }

    // libyaml always reads a line here, but this breaks on block scalars at
    // the end of the document that end without newlines. See example 8.1:
    // http://yaml.org/spec/1.2/spec.html#id2793888.
    if (!_scanner.isDone) leadingBreak = _readLine();

    // Eat the following indentation and spaces.
    var nextPair = _scanBlockScalarBreaks(indent);
    indent = nextPair.first;
    trailingBreaks = nextPair.last;
  }

  // Chomp the tail.
  if (chomping != _Chomping.STRIP) buffer.write(leadingBreak);
  if (chomping == _Chomping.KEEP) buffer.write(trailingBreaks);

  return new ScalarToken(_scanner.spanFrom(start), buffer.toString(),
      literal ? ScalarStyle.LITERAL : ScalarStyle.FOLDED);
}
1184 | |
/// Scans indentation spaces and line breaks for a block scalar.
///
/// An [indent] of 0 means the indentation level is not known yet and is
/// determined here. Returns the new indentation level and the text of the
/// line breaks.
Pair<int, String> _scanBlockScalarBreaks(int indent) {
  var widestColumn = 0;
  var lineBreaks = new StringBuffer();

  var atBreak;
  do {
    // Eat indentation spaces: all of them while the level is still unknown,
    // otherwise only up to the known level.
    while ((indent == 0 || _scanner.column < indent) &&
        _scanner.peekChar() == SP) {
      _scanner.readChar();
    }

    if (_scanner.column > widestColumn) widestColumn = _scanner.column;

    // libyaml throws an error here if a tab character is detected, but the
    // spec treats tabs like any other non-space character. See example 8.2:
    // http://yaml.org/spec/1.2/spec.html#id2794311.

    atBreak = _isBreak;
    if (atBreak) lineBreaks.write(_readLine());
  } while (atBreak);

  if (indent == 0) {
    // Use the widest indentation seen, but never less than one past the
    // current indentation level.
    //
    // libyaml forces indent to be at least 1 here, but that doesn't seem to
    // be supported by the spec.
    var minimum = _indent + 1;
    indent = widestColumn < minimum ? minimum : widestColumn;
  }

  return new Pair(indent, lineBreaks.toString());
}
1219 | |
/// Scans a quoted scalar.
///
/// [singleQuote] selects which quote character delimits the scalar. Escape
/// sequences are only interpreted for double-quoted scalars; in single-quoted
/// ones the only special sequence is a doubled quote, which stands for one
/// literal quote.
Token _scanFlowScalar({bool singleQuote: false}) {
  var start = _scanner.state;
  var result = new StringBuffer();
  var quote = singleQuote ? SINGLE_QUOTE : DOUBLE_QUOTE;

  // Eat the left quote.
  _scanner.readChar();

  while (true) {
    // Check that there are no document indicators at the beginning of the
    // line.
    if (_scanner.column == 0 && _isBlankOrEndAt(3) &&
        (_scanner.scan("---") || _scanner.scan("..."))) {
      _scanner.error("Unexpected document indicator.");
    }

    if (_scanner.isDone) {
      throw new YamlException("Unexpected end of file.", _scanner.emptySpan);
    }

    // Consume a run of non-blank characters.
    var leadingBlanks = false;
    while (!_isBlankOrEnd) {
      var char = _scanner.peekChar();

      if (singleQuote && char == SINGLE_QUOTE &&
          _scanner.peekChar(1) == SINGLE_QUOTE) {
        // An escaped single quote.
        _scanner.readChar();
        _scanner.readChar();
        result.writeCharCode(SINGLE_QUOTE);
        continue;
      }

      // The closing quote.
      if (char == quote) break;

      if (singleQuote || char != BACKSLASH) {
        // An ordinary character.
        result.writeCharCode(_scanner.readChar());
        continue;
      }

      // From here on we're looking at a backslash in a double-quoted scalar.
      if (_isBreakAt(1)) {
        // An escaped newline.
        _scanner.readChar();
        _skipLine();
        leadingBlanks = true;
        break;
      }

      // An escape sequence. Single-character escapes set [decoded]; numeric
      // escapes set [codeLength] to their number of hex digits instead.
      var escapeStart = _scanner.state;
      var escapee = _scanner.peekChar(1);
      var decoded;
      var codeLength;
      switch (escapee) {
        case NUMBER_0:
          decoded = NULL;
          break;
        case LETTER_A:
          decoded = BELL;
          break;
        case LETTER_B:
          decoded = BACKSPACE;
          break;
        case LETTER_T:
        case TAB:
          decoded = TAB;
          break;
        case LETTER_N:
          decoded = LF;
          break;
        case LETTER_V:
          decoded = VERTICAL_TAB;
          break;
        case LETTER_F:
          decoded = FORM_FEED;
          break;
        case LETTER_R:
          decoded = CR;
          break;
        case LETTER_E:
          decoded = ESCAPE;
          break;
        case SP:
        case DOUBLE_QUOTE:
        case SLASH:
        case BACKSLASH:
          // These characters stand for themselves.
          //
          // libyaml doesn't support an escaped forward slash, but it was
          // added in YAML 1.2. See section 5.7:
          // http://yaml.org/spec/1.2/spec.html#id2776092
          decoded = escapee;
          break;
        case LETTER_CAP_N:
          decoded = NEL;
          break;
        case UNDERSCORE:
          decoded = NBSP;
          break;
        case LETTER_CAP_L:
          decoded = LINE_SEPARATOR;
          break;
        case LETTER_CAP_P:
          decoded = PARAGRAPH_SEPARATOR;
          break;
        case LETTER_X:
          codeLength = 2;
          break;
        case LETTER_U:
          codeLength = 4;
          break;
        case LETTER_CAP_U:
          codeLength = 8;
          break;
        default:
          throw new YamlException("Unknown escape character.",
              _scanner.spanFrom(escapeStart));
      }

      // Eat the backslash and the escape character.
      _scanner.readChar();
      _scanner.readChar();

      if (codeLength == null) {
        result.writeCharCode(decoded);
        continue;
      }

      // Accumulate the hex digits of a numeric escape.
      var value = 0;
      for (var i = 0; i < codeLength; i++) {
        if (!_isHex) {
          // Consume the offending character so that it's included in the
          // reported span.
          _scanner.readChar();
          throw new YamlException(
              "Expected $codeLength-digit hexidecimal number.",
              _scanner.spanFrom(escapeStart));
        }

        value = (value << 4) + _asHex(_scanner.readChar());
      }

      // Check the value and write the character.
      if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) {
        throw new YamlException(
            "Invalid Unicode character escape code.",
            _scanner.spanFrom(escapeStart));
      }

      result.writeCharCode(value);
    }

    // Check if we're at the end of a scalar.
    if (_scanner.peekChar() == quote) break;

    // Consume the blanks and line breaks that follow the run.
    var whitespace = new StringBuffer();
    var leadingBreak = '';
    var trailingBreaks = new StringBuffer();
    while (_isBlank || _isBreak) {
      if (_isBlank) {
        // Consume a space or a tab. Blanks are only kept while no line break
        // has been seen.
        var blank = _scanner.readChar();
        if (!leadingBlanks) whitespace.writeCharCode(blank);
      } else if (leadingBlanks) {
        trailingBreaks.write(_readLine());
      } else {
        // This is the first line break; blanks before it are dropped.
        whitespace.clear();
        leadingBreak = _readLine();
        leadingBlanks = true;
      }
    }

    // Join the whitespace or fold line breaks.
    if (!leadingBlanks) {
      result.write(whitespace);
      whitespace.clear();
    } else if (leadingBreak.isNotEmpty && trailingBreaks.isEmpty) {
      result.writeCharCode(SP);
    } else {
      result.write(trailingBreaks);
    }
  }

  // Eat the right quote.
  _scanner.readChar();

  return new ScalarToken(_scanner.spanFrom(start), result.toString(),
      singleQuote ? ScalarStyle.SINGLE_QUOTED : ScalarStyle.DOUBLE_QUOTED);
}
1404 | |
/// Scans a plain scalar.
///
/// Returns a [ScalarToken] with style [ScalarStyle.PLAIN]. If the scan
/// consumed at least one line break, a simple key is allowed afterwards.
Token _scanPlainScalar() {
  var start = _scanner.state;
  var text = new StringBuffer();
  var firstBreak = '';
  var laterBreaks = '';
  var pendingBlanks = new StringBuffer();
  var minColumn = _indent + 1;

  while (true) {
    // Check for a document indicator.
    if (_scanner.column == 0 && _isBlankOrEndAt(3) &&
        (_scanner.matches('---') || _scanner.matches('...'))) {
      break;
    }

    // Check for a comment.
    if (_scanner.peekChar() == HASH) break;

    if (_isPlainChar) {
      // Join the whitespace or fold line breaks.
      if (firstBreak.isEmpty) {
        text.write(pendingBlanks);
        pendingBlanks.clear();
      } else {
        if (laterBreaks.isEmpty) {
          text.writeCharCode(SP);
        } else {
          text.write(laterBreaks);
        }
        firstBreak = '';
        laterBreaks = '';
      }
    }

    // libyaml's notion of valid identifiers differs substantially from YAML
    // 1.2's. We use [_isPlainChar] instead of libyaml's character here.
    while (_isPlainChar) {
      text.writeCharCode(_scanner.readChar());
    }

    // Is it the end?
    if (!(_isBlank || _isBreak)) break;

    while (_isBlank || _isBreak) {
      if (_isBlank) {
        // Check for a tab character messing up the indentation.
        if (firstBreak.isNotEmpty && _scanner.column < minColumn &&
            _scanner.peekChar() == TAB) {
          _scanner.error("Expected a space but found a tab.", length: 1);
        }

        // Blanks are only remembered while no line break has been seen.
        var blank = _scanner.readChar();
        if (firstBreak.isEmpty) pendingBlanks.writeCharCode(blank);
      } else if (firstBreak.isEmpty) {
        // This is the first line break.
        firstBreak = _readLine();
        pendingBlanks.clear();
      } else {
        laterBreaks = _readLine();
      }
    }

    // Check the indentation level.
    if (_inBlockContext && _scanner.column < minColumn) break;
  }

  // Allow a simple key after a plain scalar with leading blanks.
  if (firstBreak.isNotEmpty) _simpleKeyAllowed = true;

  return new ScalarToken(_scanner.spanFrom(start), text.toString(),
      ScalarStyle.PLAIN);
}
1483 | |
/// Moves past the current line break, if there is one.
///
/// A CR immediately followed by an LF is consumed as a single break. Does
/// nothing when the scanner isn't positioned at a CR or LF.
void _skipLine() {
  var first = _scanner.peekChar();
  if (first == LF) {
    _scanner.readChar();
    return;
  }
  if (first != CR) return;

  _scanner.readChar();
  if (_scanner.peekChar() == LF) _scanner.readChar();
}
1491 | |
/// Moves past the current line break and returns a newline.
///
/// CR LF, CR and LF are all normalized to a single "\n". Throws a
/// [YamlException] if the scanner isn't positioned at a CR or LF.
String _readLine() {
  var first = _scanner.peekChar();

  if (first == CR) {
    _scanner.readChar();
    // Treat CR LF as a single break.
    if (_scanner.peekChar() == LF) _scanner.readChar();
  } else if (first == LF) {
    _scanner.readChar();
  } else {
    // libyaml supports NEL, PS, and LS characters as line separators, but
    // this is explicitly forbidden in section 5.4 of the YAML spec.
    throw new YamlException("Expected newline.", _scanner.emptySpan);
  }

  return "\n";
}
1507 | |
/// Returns whether the character at [offset] is whitespace, i.e. a space or
/// a tab.
bool _isBlankAt(int offset) {
  switch (_scanner.peekChar(offset)) {
    case SP:
    case TAB:
      return true;
    default:
      return false;
  }
}
1513 | |
/// Returns whether the character at [offset] is a line break, i.e. a CR or
/// an LF.
bool _isBreakAt(int offset) {
  // Libyaml considers NEL, LS, and PS to be line breaks as well, but that's
  // contrary to the spec.
  switch (_scanner.peekChar(offset)) {
    case CR:
    case LF:
      return true;
    default:
      return false;
  }
}
1521 | |
/// Returns whether the character at [offset] is whitespace, a line break, or
/// past the end of the source.
bool _isBlankOrEndAt(int offset) {
  var char = _scanner.peekChar(offset);

  // A null character means there's nothing left to peek at.
  if (char == null) return true;

  var isBlank = char == SP || char == TAB;
  var isBreak = char == CR || char == LF;
  return isBlank || isBreak;
}
1529 | |
/// Returns whether the character at [offset] is a plain character.
///
/// A colon counts only when the character after it is plain-safe, and a hash
/// counts only when it isn't preceded by a space or a tab. Every other
/// character is plain exactly when it is plain-safe.
///
/// See http://yaml.org/spec/1.2/spec.html#ns-plain-char(c).
bool _isPlainCharAt(int offset) {
  var char = _scanner.peekChar(offset);

  if (char == COLON) return _isPlainSafeAt(offset + 1);
  if (char != HASH) return _isPlainSafeAt(offset);

  var before = _scanner.peekChar(offset - 1);
  return !(before == SP || before == TAB);
}
1544 | |
/// Returns whether the character at [offset] is a plain-safe character.
///
/// Returns false when there is no character at [offset].
///
/// See http://yaml.org/spec/1.2/spec.html#ns-plain-safe(c).
bool _isPlainSafeAt(int offset) {
  var char = _scanner.peekChar(offset);
  if (char == null) return false;

  // These characters are delimiters in a flow context and thus are only
  // safe in a block context.
  if (char == COMMA ||
      char == LEFT_SQUARE ||
      char == RIGHT_SQUARE ||
      char == LEFT_CURLY ||
      char == RIGHT_CURLY) {
    return _inBlockContext;
  }

  // Whitespace, line breaks and the byte-order mark are never plain-safe.
  if (char == SP || char == TAB || char == LF || char == CR || char == BOM) {
    return false;
  }

  // NEL lies outside the ranges below but is accepted nonetheless.
  if (char == NEL) return true;

  if (char >= 0x00020 && char <= 0x00007E) return true;
  if (char >= 0x000A0 && char <= 0x00D7FF) return true;
  if (char >= 0x0E000 && char <= 0x00FFFD) return true;
  return char >= 0x10000 && char <= 0x10FFFF;
}
1575 | |
/// Returns the hexadecimal value of [char].
///
/// [char] is expected to be the character code of a hexadecimal digit in
/// either case; no validation is performed here.
int _asHex(int char) {
  var isDecimalDigit = char <= NUMBER_9;
  var isUpperCase = char <= LETTER_CAP_F;

  return isDecimalDigit
      ? char - NUMBER_0
      : (isUpperCase ? 10 + char - LETTER_CAP_A : 10 + char - LETTER_A);
}
1582 } | |
1583 | |
/// A record of the location of a potential simple key.
class _SimpleKey {
  /// The index of the token that begins the simple key.
  ///
  /// This is the index relative to all tokens emitted, rather than relative to
  /// [_tokens].
  final int tokenNumber;

  /// The source location of the beginning of the simple key.
  ///
  /// This is used for error reporting and for determining when a simple key is
  /// no longer on the current line.
  final SourceLocation location;

  /// Whether this key must exist for the document to be scanned.
  final bool required;

  /// Creates a record for the simple key that begins at token [tokenNumber],
  /// located at [location] in the source.
  _SimpleKey(this.tokenNumber, this.location, {bool this.required});
}
1604 | |
/// An enum of chomping indicators that describe how to handle trailing
/// whitespace for a block scalar.
///
/// See http://yaml.org/spec/1.2/spec.html#id2794534.
class _Chomping {
  /// The human-readable name of this indicator, as returned by [toString].
  final String name;

  const _Chomping(this.name);

  /// All trailing whitespace is discarded.
  static const STRIP = const _Chomping("STRIP");

  /// A single trailing newline is retained.
  static const CLIP = const _Chomping("CLIP");

  /// All trailing whitespace is preserved.
  static const KEEP = const _Chomping("KEEP");

  String toString() {
    return name;
  }
}
OLD | NEW |