pkg/csslib/lib/src/tokenizer.dart - Issue 268623002: [html5lib] implement querySelector/querySelectorAll

Unified Diff: pkg/csslib/lib/src/tokenizer.dart

Issue 268623002: [html5lib] implement querySelector/querySelectorAll (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Created 6 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: pkg/csslib/lib/src/tokenizer.dart

diff --git a/pkg/csslib/lib/src/tokenizer.dart b/pkg/csslib/lib/src/tokenizer.dart

index e3cdfdca171a6e761eca9ad1c16ddb8e6ab7b101..8d929abeb92a35c7420260f359183e0ecaed8fa8 100644

--- a/pkg/csslib/lib/src/tokenizer.dart

+++ b/pkg/csslib/lib/src/tokenizer.dart

@@ -41,7 +41,7 @@ class Tokenizer extends TokenizerBase {

_startIndex = _index;

ch = _nextChar();

- Token ident = this.finishIdentifier(ch);

+ Token ident = finishIdentifier();

// Is it a directive?

int tokId = TokenKind.matchDirectives(_text, _startIndex,

@@ -101,7 +101,7 @@ class Tokenizer extends TokenizerBase {

if (maybeEatDigit()) return finishNumber();

return _finishToken(TokenKind.PLUS);

case TokenChar.MINUS:

- if (selectorExpression || unicodeRange) {

+ if (inSelectorExpression || unicodeRange) {

// If parsing in pseudo function expression then minus is an operator

// not part of identifier e.g., interval value range (e.g. U+400-4ff)

// or minus operator in selector expression.

@@ -109,7 +109,7 @@ class Tokenizer extends TokenizerBase {

} else if (maybeEatDigit()) {

return finishNumber();

} else if (TokenizerHelpers.isIdentifierStart(ch)) {

- return this.finishIdentifier(ch);

+ return finishIdentifier();

}

return _finishToken(TokenKind.MINUS);

case TokenChar.GREATER:

@@ -127,6 +127,9 @@ class Tokenizer extends TokenizerBase {

case TokenChar.AMPERSAND:

return _finishToken(TokenKind.AMPERSAND);

case TokenChar.NAMESPACE:

+ if (_maybeEatChar(TokenChar.EQUALS)) {

+ return _finishToken(TokenKind.DASH_MATCH); // |=

+ }

return _finishToken(TokenKind.NAMESPACE);

case TokenChar.COLON:

return _finishToken(TokenKind.COLON);

@@ -162,11 +165,6 @@ class Tokenizer extends TokenizerBase {

return _finishToken(TokenKind.LESS);

case TokenChar.EQUALS:

return _finishToken(TokenKind.EQUALS);

- case TokenChar.OR:

- if (_maybeEatChar(TokenChar.EQUALS)) {

- return _finishToken(TokenKind.DASH_MATCH); // |=

- }

- return _finishToken(TokenKind.OR);

case TokenChar.CARET:

if (_maybeEatChar(TokenChar.EQUALS)) {

return _finishToken(TokenKind.PREFIX_MATCH); // ^=

@@ -178,11 +176,16 @@ class Tokenizer extends TokenizerBase {

}

return _finishToken(TokenKind.DOLLAR);

case TokenChar.BANG:

- Token tok = finishIdentifier(ch);

+ Token tok = finishIdentifier();

return (tok == null) ? _finishToken(TokenKind.BANG) : tok;

- case TokenChar.BACKSLASH:

- return _finishToken(TokenKind.BACKSLASH);

default:

+ // TODO(jmesserly): this is used for IE8 detection; I'm not sure it's

+ // appropriate outside of a few specific places; certainly shouldn't

+ // be parsed in selectors.

+ if (!inSelector && ch == TokenChar.BACKSLASH) {

+ return _finishToken(TokenKind.BACKSLASH);

+ }

if (unicodeRange) {

// Three types of unicode ranges:

// - single code point (e.g. U+416)

@@ -212,7 +215,7 @@ class Tokenizer extends TokenizerBase {

} else if (varUsage(ch)) {

return _finishToken(TokenKind.VAR_USAGE);

} else if (TokenizerHelpers.isIdentifierStart(ch)) {

- return finishIdentifier(ch);

+ return finishIdentifier();

} else if (TokenizerHelpers.isDigit(ch)) {

return finishNumber();

}

@@ -236,7 +239,12 @@ class Tokenizer extends TokenizerBase {

int getIdentifierKind() {

// Is the identifier a unit type?

- int tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);

+ int tokId = -1;

+ // Don't match units in selectors or selector expressions.

+ if (!inSelectorExpression && !inSelector) {

+ tokId = TokenKind.matchUnits(_text, _startIndex, _index - _startIndex);

+ }

if (tokId == -1) {

tokId = (_text.substring(_startIndex, _index) == '!important') ?

TokenKind.IMPORTANT : -1;

@@ -245,31 +253,59 @@ class Tokenizer extends TokenizerBase {

return tokId >= 0 ? tokId : TokenKind.IDENTIFIER;

}

- // Need to override so CSS version of isIdentifierPart is used.

- Token finishIdentifier(int ch) {

+ Token finishIdentifier() {

+ // If we encounter an escape sequence, remember it so we can post-process

+ // to unescape.

+ bool hasEscapedChars = false;

+ var chars = [];

+ // backup so we can start with the first character

+ int validateFrom = _index;

+ _index = _startIndex;

while (_index < _text.length) {

- // If parsing in pseudo function expression then minus is an operator

- // not part of identifier.

- var isIdentifier = selectorExpression

- ? TokenizerHelpers.isIdentifierPartExpr(_text.codeUnitAt(_index))

- : TokenizerHelpers.isIdentifierPart(_text.codeUnitAt(_index));

- if (!isIdentifier) {

- break;

+ int ch = _text.codeUnitAt(_index);

+ // If the previous character was "\" we need to escape. T

+ // http://www.w3.org/TR/CSS21/syndata.html#characters

+ // if followed by hexadecimal digits, create the appropriate character.

+ // otherwise, include the character in the identifier and don't treat it

+ // specially.

+ if (ch == 92/*\*/) {

+ int startHex = ++_index;

+ eatHexDigits(startHex + 6);

+ if (_index != startHex) {

+ // Parse the hex digits and add that character.

+ chars.add(int.parse('0x' + _text.substring(startHex, _index)));

+ if (_index == _text.length) break;

+ // if we stopped the hex because of a whitespace char, skip it

+ ch = _text.codeUnitAt(_index);

+ if (_index - startHex != 6 &&

+ (ch == TokenChar.SPACE || ch == TokenChar.TAB ||

+ ch == TokenChar.RETURN || ch == TokenChar.NEWLINE)) {

+ _index++;

+ }

+ } else {

+ // not a digit, just add the next character literally

+ if (_index == _text.length) break;

+ chars.add(_text.codeUnitAt(_index++));

+ }

+ } else if (_index < validateFrom || (inSelectorExpression

+ ? TokenizerHelpers.isIdentifierPartExpr(ch)

+ : TokenizerHelpers.isIdentifierPart(ch))) {

+ chars.add(ch);

+ _index++;

} else {

- _index += 1;

+ // Not an identifier or escaped character.

+ break;

}

- int kind = getIdentifierKind();

- if (kind == TokenKind.IDENTIFIER) {

- return _finishToken(TokenKind.IDENTIFIER);

- } else {

- return _finishToken(kind);

- }

- Token finishImportant() {

+ var span = _file.span(_startIndex, _index);

+ var text = new String.fromCharCodes(chars);

+ return new IdentifierToken(text, getIdentifierKind(), span);

}

Token finishNumber() {

@@ -299,12 +335,13 @@ class Tokenizer extends TokenizerBase {

}

Token finishHexNumber() {

- eatHexDigits();

+ eatHexDigits(_text.length);

return _finishToken(TokenKind.HEX_INTEGER);

}

- void eatHexDigits() {

- while (_index < _text.length) {

+ void eatHexDigits(int end) {

+ end = math.min(end, _text.length);

+ while (_index < end) {

if (TokenizerHelpers.isHexDigit(_text.codeUnitAt(_index))) {

_index += 1;

} else {

@@ -399,7 +436,11 @@ class TokenizerHelpers {

/** Pseudo function expressions identifiers can't have a minus sign. */

static bool isIdentifierStartExpr(int c) {

return ((c >= 97/*a*/ && c <= 122/*z*/) || (c >= 65/*A*/ && c <= 90/*Z*/) ||

- c == 95/*_*/);

+ // Note: Unicode 10646 chars U+00A0 or higher are allowed, see:

+ // http://www.w3.org/TR/CSS21/syndata.html#value-def-identifier

+ // http://www.w3.org/TR/CSS21/syndata.html#characters

+ // Also, escaped character should be allowed.

+ c == 95/*_*/ || c >= 0xA0 || c == 92/*\*/);

}

/** Pseudo function expressions identifiers can't have a minus sign. */

« no previous file with comments | « pkg/csslib/lib/src/token.dart ('k') | pkg/csslib/lib/src/tokenizer_base.dart » ('j') | no next file with comments »