Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(425)

Side by Side Diff: pkg/yaml/lib/src/scanner.dart

Issue 689513002: Rewrite the pkg/yaml parser. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart
Patch Set: Update string_scanner dependency. Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file
2 // for details. All rights reserved. Use of this source code is governed by a
3 // BSD-style license that can be found in the LICENSE file.
4
5 library yaml.scanner;
6
7 import 'package:collection/collection.dart';
8 import 'package:string_scanner/string_scanner.dart';
9 import 'package:source_span/source_span.dart';
10
11 import 'style.dart';
12 import 'token.dart';
13 import 'utils.dart';
14 import 'yaml_exception.dart';
15
16 /// A scanner that reads a string of Unicode characters and emits [Token]s.
17 ///
18 /// This is based on the libyaml scanner, available at
19 /// https://github.com/yaml/libyaml/blob/master/src/scanner.c. The license for
20 /// that is available in ../../libyaml-license.txt.
21 class Scanner {
22 static const TAB = 0x9;
23 static const LF = 0xA;
24 static const CR = 0xD;
25 static const SP = 0x20;
26 static const TILDE = 0x7E;
Bob Nystrom 2014/10/31 20:03:29 Move to after GRAVE_ACCENT?
nweiz 2014/11/04 22:19:37 Done.
27 static const NEL = 0x85;
Bob Nystrom 2014/10/31 20:03:29 This one's a bit uncommon. How about moving either
nweiz 2014/11/04 22:19:38 Done.
28 static const DOLLAR = 0x24;
29 static const LEFT_PAREN = 0x28;
30 static const RIGHT_PAREN = 0x29;
31 static const PLUS = 0x2B;
32 static const COMMA = 0x2C;
33 static const HYPHEN = 0x2D;
34 static const PERIOD = 0x2E;
35 static const QUESTION = 0x3F;
36 static const COLON = 0x3A;
37 static const SEMICOLON = 0x3B;
38 static const EQUALS = 0x3D;
39 static const LEFT_SQUARE = 0x5B;
40 static const RIGHT_SQUARE = 0x5D;
41 static const LEFT_CURLY = 0x7B;
42 static const RIGHT_CURLY = 0x7D;
43 static const HASH = 0x23;
44 static const AMPERSAND = 0x26;
45 static const ASTERISK = 0x2A;
46 static const EXCLAMATION = 0x21;
47 static const VERTICAL_BAR = 0x7C;
48 static const LEFT_ANGLE = 0x3C;
49 static const RIGHT_ANGLE = 0x3E;
50 static const SINGLE_QUOTE = 0x27;
51 static const DOUBLE_QUOTE = 0x22;
52 static const PERCENT = 0x25;
53 static const AT = 0x40;
54 static const GRAVE_ACCENT = 0x60;
55
56 static const NULL = 0x0;
57 static const BELL = 0x7;
58 static const BACKSPACE = 0x8;
59 static const VERTICAL_TAB = 0xB;
60 static const FORM_FEED = 0xC;
61 static const ESCAPE = 0x1B;
62 static const SLASH = 0x2F;
63 static const BACKSLASH = 0x5C;
64 static const UNDERSCORE = 0x5F;
65 static const NBSP = 0xA0;
66 static const LINE_SEPARATOR = 0x2028;
67 static const PARAGRAPH_SEPARATOR = 0x2029;
68 static const BOM = 0xFEFF;
69
70 static const NUMBER_0 = 0x30;
71 static const NUMBER_9 = 0x39;
72
73 static const LETTER_A = 0x61;
74 static const LETTER_B = 0x62;
75 static const LETTER_E = 0x65;
76 static const LETTER_F = 0x66;
77 static const LETTER_N = 0x6E;
78 static const LETTER_R = 0x72;
79 static const LETTER_T = 0x74;
80 static const LETTER_U = 0x75;
81 static const LETTER_V = 0x76;
82 static const LETTER_X = 0x78;
83 static const LETTER_Z = 0x7A;
84
85 static const LETTER_CAP_A = 0x41;
86 static const LETTER_CAP_F = 0x46;
87 static const LETTER_CAP_L = 0x4C;
88 static const LETTER_CAP_N = 0x4E;
89 static const LETTER_CAP_P = 0x50;
90 static const LETTER_CAP_U = 0x55;
91 static const LETTER_CAP_X = 0x58;
92 static const LETTER_CAP_Z = 0x5A;
93
94 /// The underlying [SpanScanner] used to read characters from the source text.
95 ///
96 /// This is also used to track line and column information and to generate
97 /// [SourceSpan]s.
98 final SpanScanner _scanner;
99
100 /// Whether this scanner has produced a [TokenType.STREAM_START] token
101 /// indicating the beginning of the YAML stream.
102 var _streamStartProduced = false;
103
104 /// Whether this scanner has produced a [TokenType.STREAM_END] token
105 /// indicating the end of the YAML stream.
106 var _streamEndProduced = false;
107
108 /// How many levels deep the scanner is in flow nesting.
109 var _flowLevel = 0;
Bob Nystrom 2014/10/31 20:03:28 Can this be inferred from _simpleKeys.length?
nweiz 2014/11/04 22:19:37 Yes, good idea.
110
111 /// The queue of tokens yet to be emitted.
112 ///
113 /// These are queued up in advance so that [TokenType.KEY] tokens can be
114 /// inserted once the scanner determines that a series of tokens represents a
115 /// mapping key.
116 final _tokens = new QueueList<Token>();
117
118 /// The number of tokens that have been emitted.
119 ///
120 /// This doesn't count tokens in [_tokens].
121 var _tokensParsed = 0;
Bob Nystrom 2014/10/31 20:03:28 "Parsed" -> "Scanned"?
nweiz 2014/11/04 22:19:37 Done.
122
123 /// Whether the next token in [_tokens] is ready to be returned.
124 ///
125 /// It might not be ready if there may still be a [TokenType.KEY] inserted
126 /// before it.
127 var _tokenAvailable = false;
128
129 /// The stack of indent levels for the current nested block contexts.
130 final _indents = new List<int>();
Bob Nystrom 2014/10/31 20:03:29 <int>[]
nweiz 2014/11/04 22:19:37 Done.
131
132 /// The current indent level.
133 var _indent = -1;
Bob Nystrom 2014/10/31 20:03:27 Document what -1 means (or make a constant). Does
nweiz 2014/11/04 22:19:38 Done.
134
135 /// Whether a simple key is allowed in this context.
136 ///
137 /// A simple key refers to any mapping key that doesn't have an explicit "?".
138 var _simpleKeyAllowed = true;
139
140 /// The stack of potential simple keys for each level of flow nesting.
141 ///
142 /// Entries in this list may be `null`, indicating that there is no valid
143 /// simple key for the associated level of nesting.
144 ///
145 /// When a ":" is parsed and there's a simple key available, a [TokenType.KEY]
146 /// token is inserted in [_tokens] before that key's token. This allows the
147 /// parser to tell that the key is intended to be a mapping key.
148 final _simpleKeys = <_SimpleKey>[null];
Bob Nystrom 2014/10/31 20:03:28 Why isn't this initially empty?
nweiz 2014/11/04 22:19:37 Because there is an initial flow level that could
149
150 /// Whether the scanner's currently positioned in a block-level structure (as
151 /// opposed to flow-level).
152 bool get _inBlockContext => _flowLevel == 0;
153
154 /// Whether the current character is a line break or the end of the source.
155 bool get _isBreakOrEnd => _scanner.isDone || _isBreak;
156
157 /// Whether the current character is a line break.
158 bool get _isBreak => _isBreakAt(0);
159
160 /// Whether the current character is whitespace or the end of the source.
161 bool get _isBlankOrEnd => _isBlankOrEndAt(0);
162
163 /// Whether the current character is whitespace.
164 bool get _isBlank => _isBlankAt(0);
165
166 /// Whether the current character is a valid tag name character.
167 ///
168 /// See http://yaml.org/spec/1.2/spec.html#ns-tag-name.
169 bool get _isTagChar {
170 var char = _scanner.peekChar();
171 if (char == null) return false;
172 return (char >= NUMBER_0 && char <= NUMBER_9) ||
173 (char >= LETTER_A && char <= LETTER_Z) ||
174 (char >= LETTER_CAP_A && char <= LETTER_CAP_Z) ||
175 char == HYPHEN || char == SEMICOLON || char == SLASH ||
176 char == COLON || char == AT || char == AMPERSAND ||
177 char == EQUALS || char == PLUS || char == DOLLAR ||
178 char == PERIOD || char == TILDE || char == QUESTION ||
179 char == ASTERISK || char == SINGLE_QUOTE || char == LEFT_PAREN ||
180 char == RIGHT_PAREN || char == PERCENT;
Bob Nystrom 2014/10/31 20:03:28 It may be quicker to look this up in a map or even
nweiz 2014/11/04 22:19:37 Done.
181 }
182
183 /// Whether the current character is a valid anchor name character.
184 ///
185 /// See http://yaml.org/spec/1.2/spec.html#ns-anchor-name.
186 bool get _isAnchorChar {
187 if (!_isNonSpace) return false;
188
189 var char = _scanner.peekChar();
190 return char != COMMA && char != LEFT_SQUARE && char != RIGHT_SQUARE &&
191 char != LEFT_CURLY && char != RIGHT_CURLY;
192 }
193
194 /// Whether the character at the current position is a decimal digit.
195 bool get _isDigit {
196 var char = _scanner.peekChar();
197 return char != null && (char >= NUMBER_0 && char <= NUMBER_9);
198 }
199
200 /// Whether the character at the current position is a hexadecimal
201 /// digit.
202 bool get _isHex {
203 var char = _scanner.peekChar();
204 return char != null &&
205 ((char >= NUMBER_0 && char <= NUMBER_9) ||
206 (char >= LETTER_A && char <= LETTER_F) ||
207 (char >= LETTER_CAP_A && char <= LETTER_CAP_F));
208 }
209
210 /// Whether the character at the current position is a plain character.
211 ///
212 /// See http://yaml.org/spec/1.2/spec.html#ns-plain-char(c).
213 bool get _isPlainChar => _isPlainCharAt(0);
214
215 /// Whether the character at the current position is a printable character
216 /// other than a line break or byte-order mark.
217 ///
218 /// See http://yaml.org/spec/1.2/spec.html#nb-char.
219 bool get _isNonBreak {
220 var char = _scanner.peekChar();
221 switch (char) {
222 case LF:
223 case CR:
224 case BOM:
225 return false;
226 case TAB:
227 case NEL:
228 return true;
229 default:
230 return char != null &&
231 ((char >= 0x00020 && char <= 0x00007E) ||
Bob Nystrom 2014/10/31 20:03:28 Nit: +2 more before "(".
nweiz 2014/11/04 22:19:37 Done.
232 (char >= 0x000A0 && char <= 0x00D7FF) ||
233 (char >= 0x0E000 && char <= 0x00FFFD) ||
234 (char >= 0x10000 && char <= 0x10FFFF));
235 }
236 }
237
238 /// Whether the character at the current position is a printable character
239 /// other than whitespace.
240 ///
241 /// See http://yaml.org/spec/1.2/spec.html#ns-char.
242 bool get _isNonSpace {
243 var char = _scanner.peekChar();
244 return char != null && char != LF && char != CR && char != BOM && char != SP &&
Bob Nystrom 2014/10/31 20:03:27 Long line.
nweiz 2014/11/04 22:19:37 Done.
245 char != SP &&
246 (char == NEL ||
247 (char >= 0x00020 && char <= 0x00007E) ||
248 (char >= 0x000A0 && char <= 0x00D7FF) ||
249 (char >= 0x0E000 && char <= 0x00FFFD) ||
250 (char >= 0x10000 && char <= 0x10FFFF));
Bob Nystrom 2014/10/31 20:03:28 This expression is pretty huge. How about using a
nweiz 2014/11/04 22:19:36 Done.
251 }
252
253 /// Creates a scanner that scans [source].
254 ///
255 /// [sourceUrl] can be a String or a [Uri].
256 Scanner(String source, {sourceUrl})
257 : _scanner = new SpanScanner(source, sourceUrl: sourceUrl);
258
259 /// Consumes and returns the next token.
260 Token scan() {
261 if (_streamEndProduced) throw new StateError("Out of tokens.");
262 if (!_tokenAvailable) _fetchMoreTokens();
263
264 var token = _tokens.removeFirst();
265 _tokenAvailable = false;
266 _tokensParsed++;
267 _streamEndProduced = token is Token &&
268 token.type == TokenType.STREAM_END;
269 return token;
270 }
271
272 /// Returns the next token without consuming it.
273 Token peek() {
274 if (_streamEndProduced) return null;
275 if (!_tokenAvailable) _fetchMoreTokens();
276 return _tokens.first;
277 }
278
279 /// Ensures that [_tokens] contains at least one token which can be returned.
280 void _fetchMoreTokens() {
281 while (true) {
282 if (_tokens.isNotEmpty) {
283 _staleSimpleKeys();
284 if (!_simpleKeys.any((key) =>
Bob Nystrom 2014/10/31 20:03:28 Document this.
nweiz 2014/11/04 22:19:38 Done.
285 key != null && key.tokenNumber == _tokensParsed)) {
286 break;
287 }
288 }
289
290 _fetchNextToken();
291 }
292 _tokenAvailable = true;
293 }
294
295 /// The dispatcher for token fetchers.
296 void _fetchNextToken() {
297 if (!_streamStartProduced) {
298 _fetchStreamStart();
299 return;
300 }
301
302 _scanToNextToken();
303 _staleSimpleKeys();
304 _unrollIndent(_scanner.column);
305
306 if (_scanner.isDone) {
307 _fetchStreamEnd();
308 return;
309 }
310
311 if (_scanner.column == 0) {
312 if (_scanner.peekChar() == PERCENT) {
313 _fetchDirective();
314 return;
315 } else if (_isBlankOrEndAt(3)) {
Bob Nystrom 2014/10/31 20:03:28 Ditch the else.
nweiz 2014/11/04 22:19:37 Done.
316 if (_scanner.matches('---')) {
317 _fetchDocumentIndicator(TokenType.DOCUMENT_START);
318 return;
319 } else if (_scanner.matches('...')) {
Bob Nystrom 2014/10/31 20:03:28 Here too.
nweiz 2014/11/04 22:19:36 Done.
320 _fetchDocumentIndicator(TokenType.DOCUMENT_END);
321 return;
322 }
323 }
324 }
325
326 switch (_scanner.peekChar()) {
327 case LEFT_SQUARE:
328 _fetchFlowCollectionStart(TokenType.FLOW_SEQUENCE_START);
329 return;
Bob Nystrom 2014/10/31 20:03:29 Is there a reason to prefer return over break thro
nweiz 2014/11/04 22:19:37 It allows the reader to avoid checking the end of
330 case LEFT_CURLY:
331 _fetchFlowCollectionStart(TokenType.FLOW_MAPPING_START);
332 return;
333 case RIGHT_SQUARE:
334 _fetchFlowCollectionEnd(TokenType.FLOW_SEQUENCE_END);
335 return;
336 case RIGHT_CURLY:
337 _fetchFlowCollectionEnd(TokenType.FLOW_MAPPING_END);
338 return;
339 case COMMA:
340 _fetchFlowEntry();
341 return;
342 case ASTERISK:
343 _fetchAnchor(anchor: false);
344 return;
345 case AMPERSAND:
346 _fetchAnchor(anchor: true);
347 return;
348 case EXCLAMATION:
349 _fetchTag();
350 return;
351 case SINGLE_QUOTE:
352 _fetchFlowScalar(singleQuote: true);
353 return;
354 case DOUBLE_QUOTE:
355 _fetchFlowScalar(singleQuote: false);
356 return;
357 case VERTICAL_BAR:
358 if (!_inBlockContext) _invalidScalarCharacter();
359 _fetchBlockScalar(literal: true);
360 return;
361 case RIGHT_ANGLE:
362 if (!_inBlockContext) _invalidScalarCharacter();
363 _fetchBlockScalar(literal: false);
364 return;
365 case PERCENT:
366 case AT:
367 case GRAVE_ACCENT:
368 _invalidScalarCharacter();
369 return;
370
371 // These characters may sometimes begin plain scalars.
372 case HYPHEN:
373 if (_isPlainCharAt(1)) {
374 _fetchPlainScalar();
375 } else {
376 _fetchBlockEntry();
377 }
378 return;
379 case QUESTION:
380 if (_isPlainCharAt(1)) {
381 _fetchPlainScalar();
382 } else {
383 _fetchKey();
384 }
385 return;
386 case COLON:
387 if (!_inBlockContext && _tokens.isNotEmpty) {
388 // If a colon follows a "JSON-like" value (an explicit map or list, or
389 // a quoted string) it isn't required to have whitespace after it
390 // since it unambiguously describes a map.
391 var token = _tokens.last;
392 if (token.type == TokenType.FLOW_SEQUENCE_END ||
393 token.type == TokenType.FLOW_MAPPING_END ||
394 (token.type == TokenType.SCALAR && token.style.isQuoted)) {
395 _fetchValue();
396 return;
397 }
398 }
399
400 if (_isPlainCharAt(1)) {
401 _fetchPlainScalar();
402 } else {
403 _fetchValue();
404 }
405 return;
406 default:
407 if (!_isNonBreak) _invalidScalarCharacter();
408
409 _fetchPlainScalar();
410 return;
411 }
412
413 throw 'Inaccessible';
414 }
415
416 /// Throws an error about a disallowed character.
417 void _invalidScalarCharacter() =>
418 _scanner.error("Unexpected character.", length: 1);
419
420 /// Checks the list of potential simple keys and removes the positions that
421 /// cannot contain simple keys anymore.
422 void _staleSimpleKeys() {
423 for (var i = 0; i < _simpleKeys.length; i++) {
424 var key = _simpleKeys[i];
425 if (key == null) continue;
426
427 // libyaml requires that all simple keys be a single line and no longer
428 // than 1024 characters. However, in section 7.4.2 of the spec
429 // (http://yaml.org/spec/1.2/spec.html#id2790832), these restriction is
Bob Nystrom 2014/10/31 20:03:27 "restrictions are"
nweiz 2014/11/04 22:19:38 Done.
430 // only applied when the curly braces are omitted. It's difficult to
431 // retain enough context to know which keys need to have the restriction
432 // placed on them, so for now we go the other direction and allow
433 // everything but multiline simple keys in a block context.
434 if (!_inBlockContext) continue;
435
436 if (key.location.line == _scanner.line) continue;
437
438 if (key.required) {
439 throw new YamlException("Expected ':'.", _scanner.emptySpan);
440 }
441
442 _simpleKeys[i] = null;
443 }
444 }
445
446 /// Checks if a simple key may start at the current position and saves it if
447 /// so.
448 void _saveSimpleKey() {
449 // A simple key is required at the current position if the scanner is in the
450 // block context and the current column coincides with the indentation
451 // level.
452 var required = _inBlockContext && _indent == _scanner.column;
453
454 // A simple key is required only when it is the first token in the current
455 // line. Therefore it is always allowed. But we add a check anyway.
456 assert(_simpleKeyAllowed || !required);
457
458 if (!_simpleKeyAllowed) return;
459
460 // If the current position may start a simple key, save it.
461 _removeSimpleKey();
462 _simpleKeys[_simpleKeys.length - 1] = new _SimpleKey(
463 _tokensParsed + _tokens.length,
464 _scanner.location,
465 required: required);
466 }
467
468 /// Removes a potential simple key at the current flow level.
469 void _removeSimpleKey() {
470 var key = _simpleKeys.last;
471 if (key != null && key.required) {
472 throw new YamlException("Could not find expected ':' for simple key.",
473 key.location.pointSpan());
474 }
475
476 _simpleKeys[_simpleKeys.length - 1] = null;
477 }
478
479 /// Increases the flow level and resizes the simple key list.
480 void _increaseFlowLevel() {
481 _simpleKeys.add(null);
482 _flowLevel++;
483 }
484
485 /// Decreases the flow level.
486 void _decreaseFlowLevel() {
487 if (_inBlockContext) return;
488 _simpleKeys.removeLast();
489 _flowLevel--;
490 }
491
492 /// Pushes the current indentation level to the stack and sets the new level if
Bob Nystrom 2014/10/31 20:03:28 Long line.
nweiz 2014/11/04 22:19:36 Done.
493 /// [column] is greater than [_indent].
494 ///
495 /// In it is, appends or inserts the specified token into [_tokens]. If
Bob Nystrom 2014/10/31 20:03:27 "it is"?
nweiz 2014/11/04 22:19:37 Done.
496 /// [tokenNumber] is provided, the corresponding token will be replaced;
497 /// otherwise, the token will be added at the end.
498 void _rollIndent(int column, TokenType type, SourceLocation location,
499 {int tokenNumber}) {
500 if (!_inBlockContext) return;
501 if (_indent != -1 && _indent >= column) return;
502
503 // Push the current indentation level to the stack and set the new
504 // indentation level.
505 _indents.add(_indent);
506 _indent = column;
507
508 // Create a token and insert it into the queue.
509 var token = new Token(type, location.pointSpan());
510 if (tokenNumber == null) {
511 _tokens.add(token);
512 } else {
513 _tokens.insert(tokenNumber - _tokensParsed, token);
514 }
515 }
516
517 /// Pops indentation levels from [_indents] until the current level becomes
518 /// less than or equal to [column].
519 ///
520 /// For each indentation level, appends a [TokenType.BLOCK_END] token.
521 void _unrollIndent(int column) {
522 if (!_inBlockContext) return;
523
524 while (_indent > column) {
525 _tokens.add(new Token(TokenType.BLOCK_END, _scanner.emptySpan));
526 _indent = _indents.removeLast();
527 }
528 }
529
530 /// Pops indentation levels from [_indents] until the current level resets to
531 /// -1.
532 ///
533 /// For each indentation level, appends a [TokenType.BLOCK_END] token.
534 void _resetIndent() => _unrollIndent(-1);
535
536 /// Produces a [TokenType.STREAM_START] token.
537 void _fetchStreamStart() {
538 // Much of libyaml's initialization logic here is done in variable
539 // initializers instead.
540 _streamStartProduced = true;
541 _tokens.add(new Token(TokenType.STREAM_START, _scanner.emptySpan));
542 }
543
544 /// Produces a [TokenType.STREAM_END] token.
545 void _fetchStreamEnd() {
546 _resetIndent();
547 _removeSimpleKey();
548 _simpleKeyAllowed = false;
549 _tokens.add(new Token(TokenType.STREAM_END, _scanner.emptySpan));
550 }
551
552 /// Produces a [TokenType.VERSION_DIRECTIVE] or [TokenType.TAG_DIRECTIVE]
553 /// token.
554 void _fetchDirective() {
555 _resetIndent();
556 _removeSimpleKey();
557 _simpleKeyAllowed = false;
558 var directive = _scanDirective();
559 if (directive != null) _tokens.add(directive);
560 }
561
562 /// Produces a [TokenType.DOCUMENT_START] or [TokenType.DOCUMENT_END] token.
563 void _fetchDocumentIndicator(TokenType type) {
564 _resetIndent();
565 _removeSimpleKey();
566 _simpleKeyAllowed = false;
Bob Nystrom 2014/10/31 20:03:28 Hoist these three lines into a _resetState() metho
nweiz 2014/11/04 22:19:36 I'd rather have the visual similarity with the met
567
568 // Consume the indicator token.
569 var start = _scanner.state;
570 _scanner.readChar();
571 _scanner.readChar();
572 _scanner.readChar();
573
574 _tokens.add(new Token(type, _scanner.spanFrom(start)));
575 }
576
577 /// Produces a [TokenType.FLOW_SEQUENCE_START] or
578 /// [TokenType.FLOW_MAPPING_START] token.
579 void _fetchFlowCollectionStart(TokenType type) {
580 _saveSimpleKey();
581 _increaseFlowLevel();
582 _simpleKeyAllowed = true;
583 _addCharToken(type);
584 }
585
586 /// Produces a [TokenType.FLOW_SEQUENCE_END] or [TokenType.FLOW_MAPPING_END]
587 /// token.
588 void _fetchFlowCollectionEnd(TokenType type) {
589 _removeSimpleKey();
590 _decreaseFlowLevel();
591 _simpleKeyAllowed = false;
592 _addCharToken(type);
593 }
594
595 /// Produces a [TokenType.FLOW_ENTRY] token.
596 void _fetchFlowEntry() {
597 _removeSimpleKey();
598 _simpleKeyAllowed = true;
599 _addCharToken(TokenType.FLOW_ENTRY);
600 }
601
602 /// Produces a [TokenType.BLOCK_ENTRY] token.
603 void _fetchBlockEntry() {
604 if (_inBlockContext) {
605 if (!_simpleKeyAllowed) {
606 throw new YamlException(
607 "Block sequence entries are not allowed in this context.",
Bob Nystrom 2014/10/31 20:03:29 Would be good to describe the context instead of j
nweiz 2014/11/04 22:19:37 That's pretty tough... we'd have to track the reas
608 _scanner.emptySpan);
609 }
610
611 _rollIndent(
612 _scanner.column,
613 TokenType.BLOCK_SEQUENCE_START,
614 _scanner.emptySpan.start);
615 } else {
616 // It is an error for the '-' indicator to occur in the flow context, but
617 // we let the Parser detect and report it because it's able to point to
618 // the context.
619 }
620
621 _removeSimpleKey();
622 _simpleKeyAllowed = true;
623 _addCharToken(TokenType.BLOCK_ENTRY);
624 }
625
626 /// Produces the [TokenType.KEY] token.
627 void _fetchKey() {
628 if (_inBlockContext) {
629 if (!_simpleKeyAllowed) {
630 throw new YamlException("Mapping keys are not allowed in this context.",
Bob Nystrom 2014/10/31 20:03:28 Ditto.
631 _scanner.emptySpan);
632 }
633
634 _rollIndent(
635 _scanner.column,
636 TokenType.BLOCK_MAPPING_START,
637 _scanner.emptySpan.start);
638 }
639
640 // Simple keys are allowed after `?` in a block context.
641 _simpleKeyAllowed = _inBlockContext;
642 _addCharToken(TokenType.KEY);
643 }
644
645 /// Produces the [TokenType.VALUE] token.
646 void _fetchValue() {
647 var simpleKey = _simpleKeys.last;
648 if (simpleKey != null) {
649 // Add a [TokenType.KEY] token before the first token of the simple
650 // key so the parser knows that it's part of a key/value pair.
651 _tokens.insert(simpleKey.tokenNumber - _tokensParsed,
652 new Token(TokenType.KEY, simpleKey.location.pointSpan()));
653
654 // In the block context, we may need to add the
655 // [TokenType.BLOCK_MAPPING_START] token.
656 _rollIndent(
657 simpleKey.location.column,
658 TokenType.BLOCK_MAPPING_START,
659 simpleKey.location,
660 tokenNumber: simpleKey.tokenNumber);
661
662 // Remove the simple key.
663 _simpleKeys[_simpleKeys.length - 1] = null;
664
665 // A simple key cannot follow another simple key.
666 _simpleKeyAllowed = false;
667 } else if (_inBlockContext) {
668 // If we're here, we've found the ':' indicator following a complex key.
669
670 if (!_simpleKeyAllowed) {
671 throw new YamlException(
672 "Mapping values are not allowed in this context.",
673 _scanner.emptySpan);
674 }
675
676 _rollIndent(
677 _scanner.column,
678 TokenType.BLOCK_MAPPING_START,
679 _scanner.location);
680 _simpleKeyAllowed = true;
681 } else if (_simpleKeyAllowed) {
682 // If we're here, we've found the ':' indicator with an empty key. This
683 // behavior differs from libyaml, which disallows empty implicit keys.
684 _simpleKeyAllowed = false;
685 _addCharToken(TokenType.KEY);
686 }
687
688 _addCharToken(TokenType.VALUE);
689 }
690
691 /// Adds a token with [type] to [_tokens].
692 ///
693 /// The span of the new token is the current character.
694 void _addCharToken(TokenType type) {
695 var start = _scanner.state;
696 _scanner.readChar();
697 _tokens.add(new Token(type, _scanner.spanFrom(start)));
698 }
699
700 /// Produces a [TokenType.ALIAS] or [TokenType.ANCHOR] token.
701 void _fetchAnchor({bool anchor: true}) {
702 _saveSimpleKey();
703 _simpleKeyAllowed = false;
704 _tokens.add(_scanAnchor(anchor: anchor));
705 }
706
707 /// Produces a [TokenType.TAG] token.
708 void _fetchTag() {
709 _saveSimpleKey();
710 _simpleKeyAllowed = false;
711 _tokens.add(_scanTag());
712 }
713
714 /// Produces a [TokenType.SCALAR] token with style [ScalarStyle.LITERAL] or
715 /// [ScalarStyle.FOLDED].
716 void _fetchBlockScalar({bool literal: false}) {
717 _removeSimpleKey();
718 _simpleKeyAllowed = true;
719 _tokens.add(_scanBlockScalar(literal: literal));
720 }
721
722 /// Produces a [TokenType.SCALAR] token with style [ScalarStyle.SINGLE_QUOTED]
723 /// or [ScalarStyle.DOUBLE_QUOTED].
724 void _fetchFlowScalar({bool singleQuote: false}) {
725 _saveSimpleKey();
726 _simpleKeyAllowed = false;
727 _tokens.add(_scanFlowScalar(singleQuote: singleQuote));
728 }
729
730 /// Produces a [TokenType.SCALAR] token with style [ScalarStyle.PLAIN].
731 void _fetchPlainScalar() {
732 _saveSimpleKey();
733 _simpleKeyAllowed = false;
734 _tokens.add(_scanPlainScalar());
735 }
736
737 /// Eats whitespace and comments until the next token is found.
738 void _scanToNextToken() {
739 var afterLineBreak = false;
740 while (true) {
741 // Allow the BOM to start a line.
742 if (_scanner.column == 0) _scanner.scan("\uFEFF");
743
744 // Eat whitespace.
745 //
746 // libyaml disallows tabs after "-", "?", or ":", but the spec allows
747 // them. See section 6.2: http://yaml.org/spec/1.2/spec.html#id2778241.
748 while (_scanner.peekChar() == SP ||
749 ((!_inBlockContext || !afterLineBreak) &&
750 _scanner.peekChar() == TAB)) {
751 _scanner.readChar();
752 }
753
754 if (_scanner.peekChar() == TAB) {
755 _scanner.error("Tab characters are not allowed as indentation.",
756 length: 1);
757 }
758
759 // Eat a comment until a line break.
760 if (_scanner.peekChar() == HASH) {
761 while (!_isBreakOrEnd) {
762 _scanner.readChar();
763 }
764 }
765
766 // If we're at a line break, eat it.
767 if (_isBreak) {
768 _skipLine();
769
770 // In the block context, a new line may start a simple key.
771 if (_inBlockContext) _simpleKeyAllowed = true;
772 afterLineBreak = true;
773 } else {
774 // Otherwise we've found a token.
775 break;
776 }
777 }
778 }
779
780 /// Scans a [TokenType.VERSION_DIRECTIVE] or [TokenType.TAG_DIRECTIVE] token.
781 ///
782 /// %YAML 1.2 # a comment \n
783 /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
784 /// %TAG !yaml! tag:yaml.org,2002: \n
785 /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
786 Token _scanDirective() {
787 var start = _scanner.state;
788
789 // Eat '%'.
790 _scanner.readChar();
791
792 var token;
793 var name = _scanDirectiveName();
794 if (name == "YAML") {
795 token = _scanVersionDirectiveValue(start);
796 } else if (name == "TAG") {
797 token = _scanTagDirectiveValue(start);
798 } else {
799 warn("Warning: unknown directive.", _scanner.spanFrom(start));
Bob Nystrom 2014/10/31 20:03:27 I don't think the parser should output directly to
nweiz 2014/11/04 22:19:37 Done. I wish there were a more standard way to do
800
801 // libyaml doesn't support unknown directives, but the spec says to ignore
802 // them and warn: http://yaml.org/spec/1.2/spec.html#id2781147.
803 while (!_isBreakOrEnd) {
804 _scanner.readChar();
805 }
806
807 return null;
808 }
809
810 // Eat the rest of the line, including any comments.
811 while (_isBlank) {
812 _scanner.readChar();
813 }
Bob Nystrom 2014/10/31 20:03:28 Make a _skipBlanks() method for this since you do
nweiz 2014/11/04 22:19:37 Done.
814
815 if (_scanner.peekChar() == HASH) {
816 while (!_isBreakOrEnd) {
817 _scanner.readChar();
818 }
819 }
Bob Nystrom 2014/10/31 20:03:28 Probably this too.
nweiz 2014/11/04 22:19:36 Done.
820
821 if (!_isBreakOrEnd) {
822 throw new YamlException(
823 "Expected comment or line break after directive.",
824 _scanner.spanFrom(start));
825 }
826
827 if (_isBreak) _skipLine();
Bob Nystrom 2014/10/31 20:03:27 Do you need to check _isBreak here? Doesn't _skipL
nweiz 2014/11/04 22:19:36 Done.
828 return token;
829 }
830
831 /// Scans a directive name.
832 ///
833 /// %YAML 1.2 # a comment \n
834 /// ^^^^
835 /// %TAG !yaml! tag:yaml.org,2002: \n
836 /// ^^^
837 String _scanDirectiveName() {
838 var buffer = new StringBuffer();
839 // libyaml only allows word characters in directive names, but the spec
840 // disagrees: http://yaml.org/spec/1.2/spec.html#ns-directive-name.
841 while (_isNonSpace) {
842 buffer.writeCharCode(_scanner.readChar());
Bob Nystrom 2014/10/31 20:03:29 This seems inefficient. Can you just get a substri
nweiz 2014/11/04 22:19:36 Done.
843 }
844
845 var name = buffer.toString();
846 if (name.isEmpty) {
847 throw new YamlException("Expected directive name.", _scanner.emptySpan);
848 } else if (!_isBlankOrEnd) {
Bob Nystrom 2014/10/31 20:03:28 What about: %YAML#Comment. I'd expect this to be
nweiz 2014/11/04 22:19:36 I don't think that's a likely enough error to warr
849 throw new YamlException(
850 "Unexpected character in directive name.", _scanner.emptySpan);
851 }
852
853 return name;
854 }
855
856 /// Scans the value of a version directive.
857 ///
858 /// %YAML 1.2 # a comment \n
859 /// ^^^^^^
860 Token _scanVersionDirectiveValue(LineScannerState start) {
861 while (_isBlank) {
862 _scanner.readChar();
863 }
864
865 var major = _scanVersionDirectiveNumber();
866 _scanner.expect('.');
867 var minor = _scanVersionDirectiveNumber();
868
869 return new VersionDirectiveToken(_scanner.spanFrom(start), major, minor);
870 }
871
872 /// Scans the version number of a version directive.
873 ///
874 /// %YAML 1.2 # a comment \n
875 /// ^
876 /// %YAML 1.2 # a comment \n
877 /// ^
878 int _scanVersionDirectiveNumber() {
879 var buffer = new StringBuffer();
880 while (_isDigit) {
881 buffer.writeCharCode(_scanner.readChar());
882 }
883
884 var number = buffer.toString();
885 if (number.isEmpty) {
886 throw new YamlException("Expected version number.", _scanner.emptySpan);
887 }
888
889 return int.parse(number);
890 }
891
892 /// Scans the value of a tag directive.
893 ///
894 /// %TAG !yaml! tag:yaml.org,2002: \n
895 /// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
896 Token _scanTagDirectiveValue(LineScannerState start) {
897 while (_isBlank) {
898 _scanner.readChar();
899 }
900
901 var handle = _scanTagHandle(directive: true);
902 if (!_isBlank) {
903 throw new YamlException("Expected whitespace.", _scanner.emptySpan);
904 }
905
906 while (_isBlank) {
907 _scanner.readChar();
908 }
909
910 var prefix = _scanTagUri();
911 if (!_isBlankOrEnd) {
912 throw new YamlException("Expected whitespace.", _scanner.emptySpan);
913 }
914
915 return new TagDirectiveToken(_scanner.spanFrom(start), handle, prefix);
916 }
917
918 /// Scans a [TokenType.ANCHOR] or [TokenType.ALIAS] token.
919 Token _scanAnchor({bool anchor: true}) {
920 var start = _scanner.state;
921
922 // Eat the indicator character.
923 _scanner.readChar();
924
925 var buffer = new StringBuffer();
926 // libyaml only allows word characters in anchor names, but the spec
927 // disagrees: http://yaml.org/spec/1.2/spec.html#ns-anchor-char.
928 while (_isAnchorChar) {
929 buffer.writeCharCode(_scanner.readChar());
930 }
931
932 var next = _scanner.peekChar();
933 if (buffer.length == 0 ||
934 (!_isBlankOrEnd && next != QUESTION && next != COLON &&
935 next != COMMA && next != RIGHT_SQUARE && next != RIGHT_CURLY &&
936 next != PERCENT && next != AT && next != GRAVE_ACCENT)) {
Bob Nystrom 2014/10/31 20:03:29 What are these specific character tests for?
nweiz 2014/11/04 22:19:37 They check whether the anchor is followed by some
937 throw new YamlException("Expected alphanumeric character.",
938 _scanner.emptySpan);
939 }
940
941 if (anchor) {
942 return new AnchorToken(_scanner.spanFrom(start), buffer.toString());
943 } else {
944 return new AliasToken(_scanner.spanFrom(start), buffer.toString());
945 }
946 }
947
948 /// Scans a [TokenType.TAG] token in the '!<uri>', '!suffix', '!handle!suffix', or bare '!' form.
949 Token _scanTag() {
950 var handle;
951 var suffix;
952 var start = _scanner.state;
953
954 // Check if the tag is in the canonical form.
955 if (_scanner.peekChar(1) == LEFT_ANGLE) {
Bob Nystrom 2014/10/31 20:03:29 Does this fail on "!" (a bang by itself)?
nweiz 2014/11/04 22:19:36 No; [peekChar] returns null for out-of-range indic
956 // Eat '!<'.
957 _scanner.readChar();
958 _scanner.readChar();
959
960 handle = '';
961 suffix = _scanTagUri();
962
963 _scanner.expect('>');
964 } else {
965 // The tag has either the '!suffix' or the '!handle!suffix' form.
966
967 // First, try to scan a handle.
968 handle = _scanTagHandle();
969
970 if (handle.length > 1 && handle.startsWith('!') && handle.endsWith('!')) { // A complete handle ('!!' or '!name!') was scanned; the suffix follows it.
971 suffix = _scanTagUri(flowSeparators: false);
972 } else {
973 suffix = _scanTagUri(head: handle, flowSeparators: false); // What was scanned is really the start of the suffix, so pass it on as [head].
974
975 // There was no explicit handle.
976 if (suffix.isEmpty) {
977 // This is the special '!' tag.
978 handle = null;
979 suffix = '!';
980 } else {
981 handle = '!';
982 }
983 }
984 }
985
986 // libyaml insists on whitespace after a tag, but example 7.2 indicates
987 // that it's not required: http://yaml.org/spec/1.2/spec.html#id2786720.
988
989 return new TagToken(_scanner.spanFrom(start), handle, suffix);
990 }
991
992 /// Scans a tag handle and returns the scanned text, including the leading '!' and the closing '!' if one is present.
993 String _scanTagHandle({bool directive: false}) { // [directive] is true when scanning the handle of a %TAG directive.
994 _scanner.expect('!');
995
996 var buffer = new StringBuffer('!');
997
998 // libyaml only allows word characters in tags, but the spec disagrees:
999 // http://yaml.org/spec/1.2/spec.html#ns-tag-char.
1000 while (_isTagChar) {
1001 buffer.writeCharCode(_scanner.readChar());
1002 }
1003
1004 if (_scanner.peekChar() == EXCLAMATION) { // A closing '!' completes the handle.
1005 buffer.writeCharCode(_scanner.readChar());
1006 } else {
1007 // It's either the '!' tag or not really a tag handle. If it's a %TAG
1008 // directive, it's an error. If it's a tag token, it must be part of a
1009 // URI.
1010 if (directive && buffer.toString() != '!') _scanner.expect('!'); // The scanner is known not to be at '!' here, so this expect reports the error.
1011 }
1012
1013 return buffer.toString();
1014 }
1015
1016 /// Scans a tag URI and returns it with percent-escapes decoded.
1017 ///
1018 /// [head] is the initial portion of the tag that's already been scanned.
1019 /// [flowSeparators] indicates whether the tag URI can contain flow
1020 /// separators.
1021 String _scanTagUri({String head, bool flowSeparators: true}) {
1022 var length = head == null ? 0 : head.length; // A missing head is treated like an empty one.
1023 var buffer = new StringBuffer();
1024
1025 // Copy the head if needed.
1026 //
1027 // Note that we don't copy the leading '!' character.
1028 if (length > 1) buffer.write(head.substring(1));
1029
1030 // The set of characters that may appear in URI is as follows:
1031 //
1032 // '0'-'9', 'A'-'Z', 'a'-'z', '_', '-', ';', '/', '?', ':', '@', '&',
1033 // '=', '+', '$', ',', '.', '!', '~', '*', '\'', '(', ')', '[', ']',
1034 // '%'.
1035 //
1036 // In a shorthand tag annotation, the flow separators ',', '[', and ']' are
1037 // disallowed.
1038 var char = _scanner.peekChar();
1039 while (_isTagChar || (flowSeparators && // Flow separators are accepted only when [flowSeparators] is true.
1040 (char == COMMA || char == LEFT_SQUARE || char == RIGHT_SQUARE))) {
1041 buffer.writeCharCode(_scanner.readChar());
1042 char = _scanner.peekChar();
1043 }
1044
1045 // libyaml manually decodes the URL, but we don't have to do that.
1046 return Uri.decodeFull(buffer.toString());
1047 }
1048
1049 /// Scans a block scalar; [literal] selects [ScalarStyle.LITERAL] (no folding) over [ScalarStyle.FOLDED].
1050 Token _scanBlockScalar({bool literal: false}) {
1051 var start = _scanner.state;
1052
1053 // Eat the indicator '|' or '>'.
1054 _scanner.readChar();
1055
1056 // Check for a chomping indicator.
1057 var chomping = _Chomping.CLIP;
1058 var increment = 0;
1059 var char = _scanner.peekChar();
1060 if (char == PLUS || char == HYPHEN) {
1061 chomping = char == PLUS ? _Chomping.KEEP : _Chomping.STRIP;
1062 _scanner.readChar();
1063
1064 // Check for an indentation indicator.
1065 if (_isDigit) {
1066 // Check that the indentation is greater than 0. [peekChar] returns a character code, so compare against the code of '0' rather than the integer 0.
1067 if (_scanner.peekChar() == NUMBER_0) {
Bob Nystrom 2014/10/31 20:03:29 NUMBER_0?
nweiz 2014/11/04 22:19:38 Done.
1068 throw new YamlException(
1069 "0 may not be used as an indentation indicator.",
1070 _scanner.spanFrom(start));
1071 }
1072
1073 increment = _scanner.readChar() - NUMBER_0;
1074 }
1075 } else if (_isDigit) {
1076 // Do the same as above, but in the opposite order.
1077 if (_scanner.peekChar() == NUMBER_0) {
Bob Nystrom 2014/10/31 20:03:29 Ditto.
nweiz 2014/11/04 22:19:36 Done.
1078 throw new YamlException(
1079 "0 may not be used as an indentation indicator.",
1080 _scanner.spanFrom(start));
1081 }
1082
1083 increment = _scanner.readChar() - NUMBER_0;
1084
1085 char = _scanner.peekChar();
1086 if (char == PLUS || char == HYPHEN) {
1087 chomping = char == PLUS ? _Chomping.KEEP : _Chomping.STRIP;
1088 _scanner.readChar();
1089 }
1090 }
1091
1092 // Eat whitespace and comments to the end of the line.
1093 while (_isBlank) {
1094 _scanner.readChar();
1095 }
1096
1097 if (_scanner.peekChar() == HASH) {
1098 while (!_isBreakOrEnd) {
1099 _scanner.readChar();
1100 }
1101 }
1102
1103 // Check if we're at the end of the line.
1104 if (!_isBreakOrEnd) {
1105 throw new YamlException("Expected comment or line break.",
1106 _scanner.emptySpan);
1107 }
1108
1109 if (_isBreak) _skipLine();
1110
1111 var indent = 0; // 0 means "not known yet"; [_scanBlockScalarBreaks] then detects the indentation.
Bob Nystrom 2014/10/31 20:03:28 Document this little block.
nweiz 2014/11/04 22:19:37 Done.
1112 if (increment != 0) { // An explicit indentation indicator was given: it is relative to the current indentation level when that level is non-negative.
1113 indent = _indent >= 0 ? _indent + increment : increment;
1114 }
1115
1116 // Scan the leading line breaks to determine the indentation level if
1117 // needed.
1118 var pair = _scanBlockScalarBreaks(indent);
1119 indent = pair.first;
1120 var trailingBreaks = pair.last;
1121
1122 // Scan the block scalar contents.
1123 var buffer = new StringBuffer();
1124 var leadingBreak = '';
1125 var leadingBlank = false;
1126 var trailingBlank = false;
1127 while (_scanner.column == indent && !_scanner.isDone) {
1128 // Check for a document indicator. libyaml doesn't do this, but the spec
1129 // mandates it. See example 9.5:
1130 // http://yaml.org/spec/1.2/spec.html#id2801606.
1131 if (_scanner.column == 0 && _isBlankOrEndAt(3) &&
1132 (_scanner.matches('---') || _scanner.matches('...'))) {
1133 break;
1134 }
1135
1136 // We are at the beginning of a non-empty line.
1137
1138 // Is there trailing whitespace?
1139 trailingBlank = _isBlank;
1140
1141 // Check if we need to fold the leading line break.
1142 if (!literal && leadingBreak.isNotEmpty && !leadingBlank &&
1143 !trailingBlank) {
1144 // Do we need to join the lines with a space?
1145 if (trailingBreaks.isEmpty) buffer.writeCharCode(SP);
1146 leadingBreak = '';
Bob Nystrom 2014/10/31 20:03:29 Move this after the if.
nweiz 2014/11/04 22:19:38 Done.
1147 } else {
1148 buffer.write(leadingBreak);
1149 leadingBreak = '';
1150 }
1151
1152 // Append the remaining line breaks.
1153 buffer.write(trailingBreaks);
1154
1155 // Is there leading whitespace?
1156 leadingBlank = _isBlank;
1157
1158 while (!_isBreakOrEnd) {
1159 buffer.writeCharCode(_scanner.readChar());
1160 }
1161
1162 // libyaml always reads a line here, but this breaks on block scalars at
1163 // the end of the document that end without newlines. See example 8.1:
1164 // http://yaml.org/spec/1.2/spec.html#id2793888.
1165 if (!_scanner.isDone) leadingBreak = _readLine();
1166
1167 // Eat the following indentation and spaces.
1168 var pair = _scanBlockScalarBreaks(indent);
1169 indent = pair.first;
1170 trailingBreaks = pair.last;
1171 }
1172
1173 // Chomp the tail.
1174 if (chomping != _Chomping.STRIP) { // CLIP and KEEP both retain the final line break.
Bob Nystrom 2014/10/31 20:03:28 Nit, but maybe make these single-line ifs?
nweiz 2014/11/04 22:19:37 Done.
1175 buffer.write(leadingBreak);
1176 }
1177 if (chomping == _Chomping.KEEP) { // Only KEEP also retains the breaks after it.
1178 buffer.write(trailingBreaks);
1179 }
1180
1181 return new ScalarToken(_scanner.spanFrom(start), buffer.toString(),
1182 literal ? ScalarStyle.LITERAL : ScalarStyle.FOLDED);
1183 }
1184
1185 /// Scans indentation spaces and line breaks for a block scalar.
1186 ///
1187 /// Determines the indentation level if needed ([indent] == 0). Returns the new
1188 /// indentation level and the text of the line breaks.
1189 Pair<int, String> _scanBlockScalarBreaks(int indent) {
1190 var maxIndent = 0;
1191 var breaks = new StringBuffer();
1192
1193 while (true) {
1194 while ((indent == 0 || _scanner.column < indent) && // With an unknown indent, eat every leading space; otherwise stop at [indent].
1195 _scanner.peekChar() == SP) {
1196 _scanner.readChar();
1197 }
1198
1199 if (_scanner.column > maxIndent) maxIndent = _scanner.column; // Track the widest run of leading spaces seen on any line.
1200
1201 // libyaml throws an error here if a tab character is detected, but the
1202 // spec treats tabs like any other non-space character. See example 8.2:
1203 // http://yaml.org/spec/1.2/spec.html#id2794311.
1204
1205 if (!_isBreak) break;
1206 breaks.write(_readLine());
1207 }
1208
1209 if (indent == 0) {
1210 indent = maxIndent;
1211 if (indent < _indent + 1) indent = _indent + 1; // The detected indent is never less than one past the current indentation level.
1212
1213 // libyaml forces indent to be at least 1 here, but that doesn't seem to
1214 // be supported by the spec.
1215 }
1216
1217 return new Pair(indent, breaks.toString());
1218 }
1219
1220 /// Scans a quoted scalar; [singleQuote] selects single-quoted over double-quoted rules.
1221 Token _scanFlowScalar({bool singleQuote: false}) {
1222 var start = _scanner.state;
1223 var buffer = new StringBuffer();
1224
1225 // Eat the left quote.
1226 _scanner.readChar();
1227
1228 while (true) {
1229 // Check that there are no document indicators at the beginning of the
1230 // line.
1231 if (_scanner.column == 0 && _isBlankOrEndAt(3) &&
1232 (_scanner.scan("---") || _scanner.scan("..."))) {
1233 _scanner.error("Unexpected document indicator.");
1234 }
Bob Nystrom 2014/10/31 20:03:28 Hoist this out into a function?
nweiz 2014/11/04 22:19:36 Done.
1235
1236 if (_scanner.isDone) {
1237 throw new YamlException("Unexpected end of file.", _scanner.emptySpan);
1238 }
1239
1240 var leadingBlanks = false;
1241 while (!_isBlankOrEnd) { // Consume non-blank characters until whitespace, the closing quote, or an escaped newline.
1242 var char = _scanner.peekChar();
1243 if (singleQuote && char == SINGLE_QUOTE &&
1244 _scanner.peekChar(1) == SINGLE_QUOTE) {
1245 // An escaped single quote.
1246 _scanner.readChar();
1247 _scanner.readChar();
1248 buffer.writeCharCode(SINGLE_QUOTE);
1249 } else if (char == (singleQuote ? SINGLE_QUOTE : DOUBLE_QUOTE)) {
1250 // The closing quote.
1251 break;
1252 } else if (!singleQuote && char == BACKSLASH && _isBreakAt(1)) {
1253 // An escaped newline.
1254 _scanner.readChar();
1255 _skipLine();
1256 leadingBlanks = true;
1257 break;
1258 } else if (!singleQuote && char == BACKSLASH) {
1259 var escapeStart = _scanner.state;
1260
1261 // An escape sequence.
1262 var codeLength = null; // Stays null for single-character escapes; set to the hex digit count for \x, \u, and \U.
1263 switch (_scanner.peekChar(1)) {
1264 case NUMBER_0:
1265 buffer.writeCharCode(NULL);
1266 break;
1267 case LETTER_A:
1268 buffer.writeCharCode(BELL);
1269 break;
1270 case LETTER_B:
1271 buffer.writeCharCode(BACKSPACE);
1272 break;
1273 case LETTER_T:
1274 case TAB:
Bob Nystrom 2014/10/31 20:03:29 Oh, YAML. You so crazy.
1275 buffer.writeCharCode(TAB);
1276 break;
1277 case LETTER_N:
1278 buffer.writeCharCode(LF);
1279 break;
1280 case LETTER_V:
1281 buffer.writeCharCode(VERTICAL_TAB);
1282 break;
1283 case LETTER_F:
1284 buffer.writeCharCode(FORM_FEED);
1285 break;
1286 case LETTER_R:
1287 buffer.writeCharCode(CR);
1288 break;
1289 case LETTER_E:
1290 buffer.writeCharCode(ESCAPE);
1291 break;
1292 case SP:
1293 case DOUBLE_QUOTE:
1294 case SLASH:
1295 case BACKSLASH:
1296 // libyaml doesn't support an escaped forward slash, but it was
1297 // added in YAML 1.2. See section 5.7:
1298 // http://yaml.org/spec/1.2/spec.html#id2776092
1299 buffer.writeCharCode(_scanner.peekChar(1));
1300 break;
1301 case LETTER_CAP_N:
1302 buffer.writeCharCode(NEL);
1303 break;
1304 case UNDERSCORE:
1305 buffer.writeCharCode(NBSP);
1306 break;
1307 case LETTER_CAP_L:
1308 buffer.writeCharCode(LINE_SEPARATOR);
1309 break;
1310 case LETTER_CAP_P:
1311 buffer.writeCharCode(PARAGRAPH_SEPARATOR);
1312 break;
1313 case LETTER_X:
1314 codeLength = 2;
1315 break;
1316 case LETTER_U:
1317 codeLength = 4;
1318 break;
1319 case LETTER_CAP_U:
1320 codeLength = 8;
1321 break;
1322 default:
1323 throw new YamlException("Unknown escape character.",
1324 _scanner.spanFrom(escapeStart));
1325 }
1326
1327 _scanner.readChar(); // Eat the backslash...
1328 _scanner.readChar(); // ...and the escape character that followed it.
1329
1330 if (codeLength != null) {
1331 var value = 0;
1332 for (var i = 0; i < codeLength; i++) {
1333 if (!_isHex) {
1334 _scanner.readChar(); // Consume the offending character so the error span below covers it.
1335 throw new YamlException(
1336 "Expected $codeLength-digit hexidecimal number.",
1337 _scanner.spanFrom(escapeStart));
1338 }
1339
1340 value = (value << 4) + _asHex(_scanner.readChar());
1341 }
1342
1343 // Check the value and write the character.
1344 if ((value >= 0xD800 && value <= 0xDFFF) || value > 0x10FFFF) {
1345 throw new YamlException(
1346 "Invalid Unicode character escape code.",
1347 _scanner.spanFrom(escapeStart));
1348 }
1349
1350 buffer.writeCharCode(value);
1351 }
1352 } else {
1353 buffer.writeCharCode(_scanner.readChar());
1354 }
1355 }
1356
1357 // Check if we're at the end of a scalar.
1358 if (_scanner.peekChar() == (singleQuote ? SINGLE_QUOTE : DOUBLE_QUOTE)) {
1359 break;
1360 }
1361
1362 var whitespace = new StringBuffer();
1363 var leadingBreak = '';
1364 var trailingBreaks = new StringBuffer();
1365 while (_isBlank || _isBreak) {
1366 if (_isBlank) {
1367 // Consume a space or a tab.
1368 if (!leadingBlanks) {
1369 whitespace.writeCharCode(_scanner.readChar());
1370 } else {
1371 _scanner.readChar();
1372 }
1373 } else {
1374 // Check if it's a first line break.
1375 if (!leadingBlanks) {
1376 whitespace.clear();
1377 leadingBreak = _readLine();
1378 leadingBlanks = true;
1379 } else {
1380 trailingBreaks.write(_readLine());
1381 }
1382 }
1383 }
1384
1385 // Join the whitespace or fold line breaks.
1386 if (leadingBlanks) {
1387 if (leadingBreak.isNotEmpty && trailingBreaks.isEmpty) { // A single line break folds into one space.
1388 buffer.writeCharCode(SP);
1389 } else {
1390 buffer.write(trailingBreaks);
1391 }
1392 } else {
1393 buffer.write(whitespace);
1394 whitespace.clear();
1395 }
1396 }
1397
1398 // Eat the right quote.
1399 _scanner.readChar();
1400
1401 return new ScalarToken(_scanner.spanFrom(start), buffer.toString(),
1402 singleQuote ? ScalarStyle.SINGLE_QUOTED : ScalarStyle.DOUBLE_QUOTED);
1403 }
1404
1405 /// Scans a plain scalar and returns it as a [ScalarToken] with [ScalarStyle.PLAIN].
1406 Token _scanPlainScalar() {
1407 var start = _scanner.state;
1408 var buffer = new StringBuffer();
1409 var leadingBreak = '';
1410 var trailingBreaks = '';
1411 var whitespace = new StringBuffer();
1412 var indent = _indent + 1; // Continuation lines must reach at least this column in a block context.
1413
1414 while (true) {
1415 // Check for a document indicator.
1416 if (_scanner.column == 0 && _isBlankOrEndAt(3) &&
1417 (_scanner.matches('---') || _scanner.matches('...'))) {
1418 break;
1419 }
1420
1421 // Check for a comment.
1422 if (_scanner.peekChar() == HASH) break;
1423
1424 if (_isPlainChar) {
1425 // Join the whitespace or fold line breaks.
1426 if (leadingBreak.isNotEmpty) {
1427 if (trailingBreaks.isEmpty) { // A single line break folds into one space.
1428 buffer.writeCharCode(SP);
1429 } else {
1430 buffer.write(trailingBreaks);
1431 }
1432 leadingBreak = '';
1433 trailingBreaks = '';
1434 } else {
1435 buffer.write(whitespace);
1436 whitespace.clear();
1437 }
1438 }
1439
1440 // libyaml's notion of valid identifiers differs substantially from YAML
1441 // 1.2's. We use [_isPlainChar] instead of libyaml's character here.
1442 while (_isPlainChar) {
1443 buffer.writeCharCode(_scanner.readChar());
1444 }
1445
1446 // Is it the end?
1447 if (!_isBlank && !_isBreak) break;
1448
1449 while (_isBlank || _isBreak) {
1450 if (_isBlank) {
1451 // Check for a tab character messing up the indentation.
1452 if (leadingBreak.isNotEmpty && _scanner.column < indent &&
1453 _scanner.peekChar() == TAB) {
1454 _scanner.error("Expected a space but found a tab.", length: 1);
1455 }
1456
1457 if (leadingBreak.isEmpty) { // Blanks are remembered only before the first line break; after it they are discarded.
1458 whitespace.writeCharCode(_scanner.readChar());
1459 } else {
1460 _scanner.readChar();
1461 }
1462 } else {
1463 // Check if it's a first line break.
1464 if (leadingBreak.isEmpty) {
1465 leadingBreak = _readLine();
1466 whitespace.clear();
1467 } else {
1468 trailingBreaks = _readLine();
1469 }
1470 }
1471 }
1472
1473 // Check the indentation level.
1474 if (_inBlockContext && _scanner.column < indent) break;
1475 }
1476
1477 // Allow a simple key after a plain scalar with leading blanks.
1478 if (leadingBreak.isNotEmpty) _simpleKeyAllowed = true;
1479
1480 return new ScalarToken(_scanner.spanFrom(start), buffer.toString(),
1481 ScalarStyle.PLAIN);
1482 }
1483
1484 /// Moves past the current line break (CR, LF, or CR LF), if there is one; otherwise does nothing.
1485 void _skipLine() {
1486 var char = _scanner.peekChar();
1487 if (char != CR && char != LF) return; // Not at a line break: nothing to skip.
1488 _scanner.readChar();
1489 if (char == CR && _scanner.peekChar() == LF) _scanner.readChar(); // Treat CR LF as a single break.
1490 }
1491
1492 /// Moves past the current line break and returns a newline; throws a [YamlException] if the scanner isn't at a CR or LF.
1493 String _readLine() {
1494 var char = _scanner.peekChar();
1495
1496 // libyaml supports NEL, PS, and LS characters as line separators, but this
1497 // is explicitly forbidden in section 5.4 of the YAML spec.
1498 if (char != CR && char != LF) {
1499 throw new YamlException("Expected newline.", _scanner.emptySpan);
1500 }
1501
1502 _scanner.readChar();
1503 // CR LF | CR | LF -> LF
1504 if (char == CR && _scanner.peekChar() == LF) _scanner.readChar();
1505 return "\n";
1506 }
1507
1508 /// Returns whether the character at [offset] is whitespace (a space or a tab).
1509 bool _isBlankAt(int offset) {
1510 var char = _scanner.peekChar(offset);
1511 return char == SP || char == TAB;
1512 }
1513
1514 /// Returns whether the character at [offset] is a line break (CR or LF).
1515 bool _isBreakAt(int offset) {
1516 // Libyaml considers NEL, LS, and PS to be line breaks as well, but that's
1517 // contrary to the spec.
1518 var char = _scanner.peekChar(offset);
1519 return char == CR || char == LF;
1520 }
1521
1522 /// Returns whether the character at [offset] is whitespace (a space or a tab), a
1523 /// line break (CR or LF), or past the end of the source.
1524 bool _isBlankOrEndAt(int offset) {
1525 var char = _scanner.peekChar(offset);
1526 return char == null || char == SP || char == TAB || char == CR || // A null result from peekChar is treated as the end of the source.
1527 char == LF;
1528 }
1529
1530 /// Returns whether the character at [offset] is a plain character.
1531 ///
1532 /// See http://yaml.org/spec/1.2/spec.html#ns-plain-char(c).
1533 bool _isPlainCharAt(int offset) {
1534 switch (_scanner.peekChar(offset)) {
1535 case COLON: // A ':' is plain only if the character after it is plain-safe.
1536 return _isPlainSafeAt(offset + 1);
1537 case HASH: // A '#' is plain only if it is not preceded by a space or tab.
1538 var previous = _scanner.peekChar(offset - 1);
1539 return previous != SP && previous != TAB;
1540 default:
1541 return _isPlainSafeAt(offset);
1542 }
1543 }
1544
1545 /// Returns whether the character at [offset] is a plain-safe character.
1546 ///
1547 /// See http://yaml.org/spec/1.2/spec.html#ns-plain-safe(c).
1548 bool _isPlainSafeAt(int offset) {
1549 var char = _scanner.peekChar(offset);
1550 switch (char) {
1551 case COMMA:
1552 case LEFT_SQUARE:
1553 case RIGHT_SQUARE:
1554 case LEFT_CURLY:
1555 case RIGHT_CURLY:
1556 // These characters are delimiters in a flow context and thus are only
1557 // safe in a block context.
1558 return _inBlockContext;
1559 case SP:
1560 case TAB:
1561 case LF:
1562 case CR:
1563 case BOM:
1564 return false;
1565 case NEL: // NEL lies outside the ranges checked in the default branch, so it is accepted explicitly.
1566 return true;
1567 default:
1568 return char != null && // A null char is never plain-safe.
1569 ((char >= 0x00020 && char <= 0x00007E) ||
1570 (char >= 0x000A0 && char <= 0x00D7FF) ||
1571 (char >= 0x0E000 && char <= 0x00FFFD) ||
1572 (char >= 0x10000 && char <= 0x10FFFF));
1573 }
1574 }
1575
1576 /// Returns the hexadecimal value of [char], which must be the code of a hex digit (no validation is done here).
1577 int _asHex(int char) {
1578 if (char <= NUMBER_9) return char - NUMBER_0; // '0'-'9'
1579 if (char <= LETTER_CAP_F) return 10 + char - LETTER_CAP_A; // 'A'-'F'
1580 return 10 + char - LETTER_A; // 'a'-'f'
1581 }
1582 }
1583
1584 /// A record of the location of a potential simple key.
1585 class _SimpleKey {
1586 /// The index of the token that begins the simple key.
1587 ///
1588 /// This is the index relative to all tokens emitted, rather than relative to
1589 /// [_tokens].
1590 final int tokenNumber;
1591
1592 /// The source location of the beginning of the simple key.
1593 ///
1594 /// This is used for error reporting and for determining when a simple key is
1595 /// no longer on the current line.
1596 final SourceLocation location;
1597
1598 /// Whether this key must exist for the document to be scanned.
1599 final bool required;
1600
1601 _SimpleKey(this.tokenNumber, this.location, {bool required}) // [required] is an optional named argument with no default value.
1602 : required = required;
1603 }
1604
1605 /// An enum of chomping indicators that describe how to handle trailing
1606 /// whitespace for a block scalar.
1607 ///
1608 /// See http://yaml.org/spec/1.2/spec.html#id2794534.
1609 class _Chomping {
1610 /// All trailing whitespace is discarded.
1611 static const STRIP = const _Chomping("STRIP");
1612
1613 /// A single trailing newline is retained.
1614 static const CLIP = const _Chomping("CLIP");
1615
1616 /// All trailing whitespace is preserved.
1617 static const KEEP = const _Chomping("KEEP");
1618
1619 /// The name of this value, as returned by [toString].
1620 final String name;
1621
1622 const _Chomping(this.name);
1623
1624 String toString() => name;
1625 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698