pkg/csslib/lib/src/tokenizer.dart - Issue 268623002: [html5lib] implement querySelector/querySelectorAll

Side by Side Diff: pkg/csslib/lib/src/tokenizer.dart

Issue 268623002: [html5lib] implement querySelector/querySelectorAll (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2012, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 part of csslib.parser;	5 part of csslib.parser;

6	6

7 class Tokenizer extends TokenizerBase {	7 class Tokenizer extends TokenizerBase {

8 /** U+ prefix for unicode characters. */	8 /** U+ prefix for unicode characters. */

9 final UNICODE_U = 'U'.codeUnitAt(0);	9 final UNICODE_U = 'U'.codeUnitAt(0);

10 final UNICODE_LOWER_U = 'u'.codeUnitAt(0);	10 final UNICODE_LOWER_U = 'u'.codeUnitAt(0);

(...skipping 23 matching lines...) Expand all Loading...
34 case TokenChar.END_OF_FILE:	34 case TokenChar.END_OF_FILE:

35 return _finishToken(TokenKind.END_OF_FILE);	35 return _finishToken(TokenKind.END_OF_FILE);

36 case TokenChar.AT:	36 case TokenChar.AT:

37 int peekCh = _peekChar();	37 int peekCh = _peekChar();

38 if (TokenizerHelpers.isIdentifierStart(peekCh)) {	38 if (TokenizerHelpers.isIdentifierStart(peekCh)) {

39 var oldIndex = _index;	39 var oldIndex = _index;

40 var oldStartIndex = _startIndex;	40 var oldStartIndex = _startIndex;

41	41

42 _startIndex = _index;	42 _startIndex = _index;

43 ch = _nextChar();	43 ch = _nextChar();

44 Token ident = this.finishIdentifier(ch);	44 Token ident = finishIdentifier();

45	45

46 // Is it a directive?	46 // Is it a directive?

47 int tokId = TokenKind.matchDirectives(_text, _startIndex,	47 int tokId = TokenKind.matchDirectives(_text, _startIndex,

48 _index - _startIndex);	48 _index - _startIndex);

49 if (tokId == -1) {	49 if (tokId == -1) {

50 // No, is it a margin directive?	50 // No, is it a margin directive?

51 tokId = TokenKind.matchMarginDirectives(_text, _startIndex,	51 tokId = TokenKind.matchMarginDirectives(_text, _startIndex,

52 _index - _startIndex);	52 _index - _startIndex);

53 }	53 }

54	54

(...skipping 39 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
94 // ]]>	94 // ]]>

95 return next();	95 return next();

96 }	96 }

97 return _finishToken(TokenKind.RBRACK);	97 return _finishToken(TokenKind.RBRACK);

98 case TokenChar.HASH:	98 case TokenChar.HASH:

99 return _finishToken(TokenKind.HASH);	99 return _finishToken(TokenKind.HASH);

100 case TokenChar.PLUS:	100 case TokenChar.PLUS:

101 if (maybeEatDigit()) return finishNumber();	101 if (maybeEatDigit()) return finishNumber();

102 return _finishToken(TokenKind.PLUS);	102 return _finishToken(TokenKind.PLUS);

103 case TokenChar.MINUS:	103 case TokenChar.MINUS:

104 if (selectorExpression \|\| unicodeRange) {	104 if (inSelectorExpression \|\| unicodeRange) {

105 // If parsing in pseudo function expression then minus is an operator	105 // If parsing in pseudo function expression then minus is an operator

106 // not part of identifier e.g., interval value range (e.g. U+400-4ff)	106 // not part of identifier e.g., interval value range (e.g. U+400-4ff)

107 // or minus operator in selector expression.	107 // or minus operator in selector expression.

108 return _finishToken(TokenKind.MINUS);	108 return _finishToken(TokenKind.MINUS);

109 } else if (maybeEatDigit()) {	109 } else if (maybeEatDigit()) {

110 return finishNumber();	110 return finishNumber();

111 } else if (TokenizerHelpers.isIdentifierStart(ch)) {	111 } else if (TokenizerHelpers.isIdentifierStart(ch)) {

112 return this.finishIdentifier(ch);	112 return finishIdentifier();

113 }	113 }

114 return _finishToken(TokenKind.MINUS);	114 return _finishToken(TokenKind.MINUS);

115 case TokenChar.GREATER:	115 case TokenChar.GREATER:

116 return _finishToken(TokenKind.GREATER);	116 return _finishToken(TokenKind.GREATER);

117 case TokenChar.TILDE:	117 case TokenChar.TILDE:

118 if (_maybeEatChar(TokenChar.EQUALS)) {	118 if (_maybeEatChar(TokenChar.EQUALS)) {

119 return _finishToken(TokenKind.INCLUDES); // ~=	119 return _finishToken(TokenKind.INCLUDES); // ~=

120 }	120 }

121 return _finishToken(TokenKind.TILDE);	121 return _finishToken(TokenKind.TILDE);

122 case TokenChar.ASTERISK:	122 case TokenChar.ASTERISK:

123 if (_maybeEatChar(TokenChar.EQUALS)) {	123 if (_maybeEatChar(TokenChar.EQUALS)) {

124 return _finishToken(TokenKind.SUBSTRING_MATCH); // *=	124 return _finishToken(TokenKind.SUBSTRING_MATCH); // *=

125 }	125 }

126 return _finishToken(TokenKind.ASTERISK);	126 return _finishToken(TokenKind.ASTERISK);

127 case TokenChar.AMPERSAND:	127 case TokenChar.AMPERSAND:

128 return _finishToken(TokenKind.AMPERSAND);	128 return _finishToken(TokenKind.AMPERSAND);

129 case TokenChar.NAMESPACE:	129 case TokenChar.NAMESPACE:

	130 if (_maybeEatChar(TokenChar.EQUALS)) {

	131 return _finishToken(TokenKind.DASH_MATCH); // \|=

	132 }

130 return _finishToken(TokenKind.NAMESPACE);	133 return _finishToken(TokenKind.NAMESPACE);

131 case TokenChar.COLON:	134 case TokenChar.COLON:

132 return _finishToken(TokenKind.COLON);	135 return _finishToken(TokenKind.COLON);

133 case TokenChar.COMMA:	136 case TokenChar.COMMA:

134 return _finishToken(TokenKind.COMMA);	137 return _finishToken(TokenKind.COMMA);

135 case TokenChar.SEMICOLON:	138 case TokenChar.SEMICOLON:

136 return _finishToken(TokenKind.SEMICOLON);	139 return _finishToken(TokenKind.SEMICOLON);

137 case TokenChar.PERCENT:	140 case TokenChar.PERCENT:

138 return _finishToken(TokenKind.PERCENT);	141 return _finishToken(TokenKind.PERCENT);

139 case TokenChar.SINGLE_QUOTE:	142 case TokenChar.SINGLE_QUOTE:

(...skipping 15 matching lines...) Expand all Loading...
155 _maybeEatChar(CDATA_NAME[3]) &&	158 _maybeEatChar(CDATA_NAME[3]) &&

156 _maybeEatChar(CDATA_NAME[4]) &&	159 _maybeEatChar(CDATA_NAME[4]) &&

157 _maybeEatChar(TokenChar.LBRACK)) {	160 _maybeEatChar(TokenChar.LBRACK)) {

158 // <![CDATA[	161 // <![CDATA[

159 return next();	162 return next();

160 }	163 }

161 }	164 }

162 return _finishToken(TokenKind.LESS);	165 return _finishToken(TokenKind.LESS);

163 case TokenChar.EQUALS:	166 case TokenChar.EQUALS:

164 return _finishToken(TokenKind.EQUALS);	167 return _finishToken(TokenKind.EQUALS);

165 case TokenChar.OR:

166 if (_maybeEatChar(TokenChar.EQUALS)) {

167 return _finishToken(TokenKind.DASH_MATCH); // \|=

168 }

169 return _finishToken(TokenKind.OR);

170 case TokenChar.CARET:	168 case TokenChar.CARET:

171 if (_maybeEatChar(TokenChar.EQUALS)) {	169 if (_maybeEatChar(TokenChar.EQUALS)) {

172 return _finishToken(TokenKind.PREFIX_MATCH); // ^=	170 return _finishToken(TokenKind.PREFIX_MATCH); // ^=

173 }	171 }

174 return _finishToken(TokenKind.CARET);	172 return _finishToken(TokenKind.CARET);

175 case TokenChar.DOLLAR:	173 case TokenChar.DOLLAR:

176 if (_maybeEatChar(TokenChar.EQUALS)) {	174 if (_maybeEatChar(TokenChar.EQUALS)) {

177 return _finishToken(TokenKind.SUFFIX_MATCH); // $=	175 return _finishToken(TokenKind.SUFFIX_MATCH); // $=

178 }	176 }

179 return _finishToken(TokenKind.DOLLAR);	177 return _finishToken(TokenKind.DOLLAR);

180 case TokenChar.BANG:	178 case TokenChar.BANG:

181 Token tok = finishIdentifier(ch);	179 Token tok = finishIdentifier();

182 return (tok == null) ? _finishToken(TokenKind.BANG) : tok;	180 return (tok == null) ? _finishToken(TokenKind.BANG) : tok;

183 case TokenChar.BACKSLASH:

184 return _finishToken(TokenKind.BACKSLASH);

185 default:	181 default:

	182 // TODO(jmesserly): this is used for IE8 detection; I'm not sure it's

	183 // appropriate outside of a few specific places; certainly shouldn't

	184 // be parsed in selectors.

	185 if (!inSelector && ch == TokenChar.BACKSLASH) {

	186 return _finishToken(TokenKind.BACKSLASH);

	187 }

	188

186 if (unicodeRange) {	189 if (unicodeRange) {

187 // Three types of unicode ranges:	190 // Three types of unicode ranges:

188 // - single code point (e.g. U+416)	191 // - single code point (e.g. U+416)

189 // - interval value range (e.g. U+400-4ff)	192 // - interval value range (e.g. U+400-4ff)

190 // - range where trailing ‘?’ characters imply ‘any digit value’	193 // - range where trailing ‘?’ characters imply ‘any digit value’

191 // (e.g. U+4??)	194 // (e.g. U+4??)

192 if (maybeEatHexDigit()) {	195 if (maybeEatHexDigit()) {

193 var t = finishHexNumber();	196 var t = finishHexNumber();

194 // Any question marks then it's a HEX_RANGE not HEX_NUMBER.	197 // Any question marks then it's a HEX_RANGE not HEX_NUMBER.

195 if (maybeEatQuestionMark()) finishUnicodeRange();	198 if (maybeEatQuestionMark()) finishUnicodeRange();

196 return t;	199 return t;

197 } else if (maybeEatQuestionMark()) {	200 } else if (maybeEatQuestionMark()) {

198 // HEX_RANGE U+N???	201 // HEX_RANGE U+N???

199 return finishUnicodeRange();	202 return finishUnicodeRange();

200 } else {	203 } else {

201 return _errorToken();	204 return _errorToken();

202 }	205 }

203 } else if ((ch == UNICODE_U \|\| ch == UNICODE_LOWER_U) &&	206 } else if ((ch == UNICODE_U \|\| ch == UNICODE_LOWER_U) &&

204 (_peekChar() == UNICODE_PLUS)) {	207 (_peekChar() == UNICODE_PLUS)) {

205 // Unicode range: U+uNumber[-U+uNumber]	208 // Unicode range: U+uNumber[-U+uNumber]

206 // uNumber = 0..10FFFF	209 // uNumber = 0..10FFFF

207 _nextChar(); // Skip +	210 _nextChar(); // Skip +

208 _startIndex = _index; // Starts at the number	211 _startIndex = _index; // Starts at the number

209 return _finishToken(TokenKind.UNICODE_RANGE);	212 return _finishToken(TokenKind.UNICODE_RANGE);

210 } else if (varDef(ch)) {	213 } else if (varDef(ch)) {

211 return _finishToken(TokenKind.VAR_DEFINITION);	214 return _finishToken(TokenKind.VAR_DEFINITION);

212 } else if (varUsage(ch)) {	215 } else if (varUsage(ch)) {

213 return _finishToken(TokenKind.VAR_USAGE);	216 return _finishToken(TokenKind.VAR_USAGE);

214 } else if (TokenizerHelpers.isIdentifierStart(ch)) {	217 } else if (TokenizerHelpers.isIdentifierStart(ch)) {

215 return finishIdentifier(ch);	218 return finishIdentifier();

216 } else if (TokenizerHelpers.isDigit(ch)) {	219 } else if (TokenizerHelpers.isDigit(ch)) {

217 return finishNumber();	220 return finishNumber();

218 }	221 }

219 return _errorToken();	222 return _errorToken();

220 }	223 }

221 }	224 }

222	225

223 bool varDef(int ch) {	226 bool varDef(int ch) {

224 return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) &&	227 return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) &&

225 _maybeEatChar('r'.codeUnitAt(0)) && _maybeEatChar('-'.codeUnitAt(0));	228 _maybeEatChar('r'.codeUnitAt(0)) && _maybeEatChar('-'.codeUnitAt(0));

226 }	229 }

227	230

228 bool varUsage(int ch) {	231 bool varUsage(int ch) {

229 return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) &&	232 return ch == 'v'.codeUnitAt(0) && _maybeEatChar('a'.codeUnitAt(0)) &&

230 _maybeEatChar('r'.codeUnitAt(0)) && (_peekChar() == '-'.codeUnitAt(0));	233 _maybeEatChar('r'.codeUnitAt(0)) && (_peekChar() == '-'.codeUnitAt(0));

231 }	234 }

232	235

233 Token _errorToken([String message = null]) {	236 Token _errorToken([String message = null]) {

234 return _finishToken(TokenKind.ERROR);	237 return _finishToken(TokenKind.ERROR);

235 }	238 }

236	239

237 int getIdentifierKind() {	240 int getIdentifierKind() {

238 // Is the identifier a unit type?	241 // Is the identifier a unit type?

239 int tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);	242 int tokId = -1;

	243

	244 // Don't match units in selectors or selector expressions.

	245 if (!inSelectorExpression && !inSelector) {

	246 tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);

	247 }

240 if (tokId == -1) {	248 if (tokId == -1) {

241 tokId = (_text.substring(_startIndex, _index) == '!important') ?	249 tokId = (_text.substring(_startIndex, _index) == '!important') ?

242 TokenKind.IMPORTANT : -1;	250 TokenKind.IMPORTANT : -1;

243 }	251 }

244	252

245 return tokId >= 0 ? tokId : TokenKind.IDENTIFIER;	253 return tokId >= 0 ? tokId : TokenKind.IDENTIFIER;

246 }	254 }

247	255

248 // Need to override so CSS version of isIdentifierPart is used.	256 Token finishIdentifier() {

249 Token finishIdentifier(int ch) {	257 // If we encounter an escape sequence, remember it so we can post-process

	258 // to unescape.

	259 bool hasEscapedChars = false;

	260 var chars = [];

	261

	262 // backup so we can start with the first character

	263 int validateFrom = _index;

	264 _index = _startIndex;

250 while (_index < _text.length) {	265 while (_index < _text.length) {

251 // If parsing in pseudo function expression then minus is an operator	266 int ch = _text.codeUnitAt(_index);

252 // not part of identifier.	267

253 var isIdentifier = selectorExpression	268 // If the previous character was "\" we need to escape. See:

254 ? TokenizerHelpers.isIdentifierPartExpr(_text.codeUnitAt(_index))	269 // http://www.w3.org/TR/CSS21/syndata.html#characters

255 : TokenizerHelpers.isIdentifierPart(_text.codeUnitAt(_index));	270 // if followed by hexadecimal digits, create the appropriate character.

256 if (!isIdentifier) {	271 // otherwise, include the character in the identifier and don't treat it

257 break;	272 // specially.

	273 if (ch == 92/\/) {

	274 int startHex = ++_index;

	275 eatHexDigits(startHex + 6);

	276 if (_index != startHex) {

	277 // Parse the hex digits and add that character.

	278 chars.add(int.parse('0x' + _text.substring(startHex, _index)));

	279

	280 if (_index == _text.length) break;

	281

	282 // if we stopped the hex because of a whitespace char, skip it

	283 ch = _text.codeUnitAt(_index);

	284 if (_index - startHex != 6 &&

	285 (ch == TokenChar.SPACE \|\| ch == TokenChar.TAB \|\|

	286 ch == TokenChar.RETURN \|\| ch == TokenChar.NEWLINE)) {

	287 _index++;

	288 }

	289 } else {

	290 // not a digit, just add the next character literally

	291 if (_index == _text.length) break;

	292 chars.add(_text.codeUnitAt(_index++));

	293 }

	294 } else if (_index < validateFrom \|\| (inSelectorExpression

	295 ? TokenizerHelpers.isIdentifierPartExpr(ch)

	296 : TokenizerHelpers.isIdentifierPart(ch))) {

	297 chars.add(ch);

	298 _index++;

258 } else {	299 } else {

259 _index += 1;	300 // Not an identifier or escaped character.

	301 break;

260 }	302 }

261 }	303 }

262	304

263 int kind = getIdentifierKind();	305 var span = _file.span(_startIndex, _index);

264 if (kind == TokenKind.IDENTIFIER) {	306 var text = new String.fromCharCodes(chars);

265 return _finishToken(TokenKind.IDENTIFIER);

266 } else {

267 return _finishToken(kind);

268 }

269 }

270	307

271 Token finishImportant() {	308 return new IdentifierToken(text, getIdentifierKind(), span);

272

273 }	309 }

274	310

275 Token finishNumber() {	311 Token finishNumber() {

276 eatDigits();	312 eatDigits();

277	313

278 if (_peekChar() == 46/./) {	314 if (_peekChar() == 46/./) {

279 // Handle the case of 1.toString().	315 // Handle the case of 1.toString().

280 _nextChar();	316 _nextChar();

281 if (TokenizerHelpers.isDigit(_peekChar())) {	317 if (TokenizerHelpers.isDigit(_peekChar())) {

282 eatDigits();	318 eatDigits();

283 return _finishToken(TokenKind.DOUBLE);	319 return _finishToken(TokenKind.DOUBLE);

284 } else {	320 } else {

285 _index -= 1;	321 _index -= 1;

286 }	322 }

287 }	323 }

288	324

289 return _finishToken(TokenKind.INTEGER);	325 return _finishToken(TokenKind.INTEGER);

290 }	326 }

291	327

292 bool maybeEatDigit() {	328 bool maybeEatDigit() {

293 if (_index < _text.length	329 if (_index < _text.length

294 && TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {	330 && TokenizerHelpers.isDigit(_text.codeUnitAt(_index))) {

295 _index += 1;	331 _index += 1;

296 return true;	332 return true;

297 }	333 }

298 return false;	334 return false;

299 }	335 }

300	336

301 Token finishHexNumber() {	337 Token finishHexNumber() {

302 eatHexDigits();	338 eatHexDigits(_text.length);

303 return _finishToken(TokenKind.HEX_INTEGER);	339 return _finishToken(TokenKind.HEX_INTEGER);

304 }	340 }

305	341

306 void eatHexDigits() {	342 void eatHexDigits(int end) {

307 while (_index < _text.length) {	343 end = math.min(end, _text.length);

	344 while (_index < end) {

308 if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {	345 if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {

309 _index += 1;	346 _index += 1;

310 } else {	347 } else {

311 return;	348 return;

312 }	349 }

313 }	350 }

314 }	351 }

315	352

316 bool maybeEatHexDigit() {	353 bool maybeEatHexDigit() {

317 if (_index < _text.length	354 if (_index < _text.length

(...skipping 74 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
392 \|\| (c >= 65/A/ && c <= 70/F/));	429 \|\| (c >= 65/A/ && c <= 70/F/));

393 }	430 }

394	431

395 static bool isIdentifierPart(int c) {	432 static bool isIdentifierPart(int c) {

396 return isIdentifierPartExpr(c) \|\| c == 45 /-/;	433 return isIdentifierPartExpr(c) \|\| c == 45 /-/;

397 }	434 }

398	435

399 /** Pseudo function expressions identifiers can't have a minus sign. */	436 /** Pseudo function expressions identifiers can't have a minus sign. */

400 static bool isIdentifierStartExpr(int c) {	437 static bool isIdentifierStartExpr(int c) {

401 return ((c >= 97/a/ && c <= 122/z/) \|\| (c >= 65/A/ && c <= 90/Z/) \|\|	438 return ((c >= 97/a/ && c <= 122/z/) \|\| (c >= 65/A/ && c <= 90/Z/) \|\|

402 c == 95/_/);	439 // Note: Unicode 10646 chars U+00A0 or higher are allowed, see:

	440 // http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier

	441 // http://www.w3.org/TR/CSS21/syndata.html#characters

	442 // Also, escaped character should be allowed.

	443 c == 95/_/ \|\| c >= 0xA0 \|\| c == 92/\/);

403 }	444 }

404	445

405 /** Pseudo function expressions identifiers can't have a minus sign. */	446 /** Pseudo function expressions identifiers can't have a minus sign. */

406 static bool isIdentifierPartExpr(int c) {	447 static bool isIdentifierPartExpr(int c) {

407 return (isIdentifierStartExpr(c) \|\| isDigit(c));	448 return (isIdentifierStartExpr(c) \|\| isDigit(c));

408 }	449 }

409 }	450 }

OLD	NEW

« no previous file with comments | « pkg/csslib/lib/src/token.dart ('k') | pkg/csslib/lib/src/tokenizer_base.dart » ('j') | no next file with comments »