| Index: Source/core/css/parser/NewCSSTokenizer.cpp
|
| diff --git a/Source/core/css/parser/NewCSSTokenizer.cpp b/Source/core/css/parser/NewCSSTokenizer.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..d053944c3b92d67a87ac6cadbed86cfca4920a64
|
| --- /dev/null
|
| +++ b/Source/core/css/parser/NewCSSTokenizer.cpp
|
| @@ -0,0 +1,437 @@
|
| +/*
|
| + * Copyright (C) 2013 Google Inc. All rights reserved.
|
| + *
|
| + * Redistribution and use in source and binary forms, with or without
|
| + * modification, are permitted provided that the following conditions are
|
| + * met:
|
| + *
|
| + * * Redistributions of source code must retain the above copyright
|
| + * notice, this list of conditions and the following disclaimer.
|
| + * * Redistributions in binary form must reproduce the above
|
| + * copyright notice, this list of conditions and the following disclaimer
|
| + * in the documentation and/or other materials provided with the
|
| + * distribution.
|
| + * * Neither the name of Google Inc. nor the names of its
|
| + * contributors may be used to endorse or promote products derived from
|
| + * this software without specific prior written permission.
|
| + *
|
| + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
| + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
| + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
| + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
| + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
| + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
| + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
| + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
| + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| + */
|
| +
|
| +#include "config.h"
|
| +#include "core/css/parser/NewCSSTokenizer.h"
|
| +
|
| +#include "core/css/parser/CSSInputStream.h"
|
| +#include "core/css/parser/CSSParserIdioms.h"
|
| +#include "platform/text/SegmentedString.h"
|
| +#include "wtf/TemporaryChange.h"
|
| +#include "wtf/unicode/CharacterNames.h"
|
| +
|
| +namespace WebCore {
|
| +
|
| +// Returns true when |c| may begin a CSS name: an ASCII letter, an underscore,
|
| +// or any non-ASCII code unit.
|
| +// http://dev.w3.org/csswg/css-syntax/#name-start-code-point
|
| +static bool isNameStart(UChar c)
|
| +{
|
| +    return isASCIIAlpha(c) || c == '_' || !isASCII(c);
|
| +}
|
| +
|
| +// Returns true when |c| may appear inside a CSS name: an ASCII digit, a
|
| +// hyphen, or anything accepted by isNameStart().
|
| +// http://www.w3.org/TR/css-syntax-3/#name-code-point
|
| +static bool isNameChar(UChar c)
|
| +{
|
| +    if (isASCIIDigit(c) || c == '-')
|
| +        return true;
|
| +    return isNameStart(c);
|
| +}
|
| +
|
| +// Reports whether |first| followed by |second| starts an escape sequence:
|
| +// a backslash that is not followed by a newline or the end-of-file marker.
|
| +// http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape
|
| +static bool twoCharsAreValidEscape(UChar first, UChar second)
|
| +{
|
| +    if (first != '\\')
|
| +        return false;
|
| +    return second != '\n' && second != kEndOfFileMarker;
|
| +}
|
| +
|
| +// Constructs a tokenizer that is not yet attached to any input stream.
|
| +// m_input is only assigned inside nextToken(); start it out null so that it
|
| +// never holds an indeterminate pointer value before the first nextToken() call.
|
| +NewCSSTokenizer::NewCSSTokenizer()
|
| +    : m_input(0)
|
| +{
|
| +}
|
| +
|
| +// Hands |c| back to the current input stream via pushBack(), so that code
|
| +// which has already consumed it can have it read again.
|
| +void NewCSSTokenizer::reconsume(UChar c)
|
| +{
|
| + m_input->pushBack(c);
|
| +}
|
| +
|
| +// Returns the stream's current input character and advances the stream past it.
|
| +UChar NewCSSTokenizer::consume()
|
| +{
|
| +    const UChar consumed = m_input->currentInputChar();
|
| +    m_input->advance();
|
| +    return consumed;
|
| +}
|
| +
|
| +// Tokenizes |string| in full, appending every token to |outTokens|.
|
| +// The final token appended is always the EOFToken.
|
| +void NewCSSTokenizer::tokenize(String string, Vector<CSSToken>& outTokens)
|
| +{
|
| +    CSSInputStream stream(string);
|
| +    NewCSSTokenizer tokenizer;
|
| +    do {
|
| +        outTokens.append(tokenizer.nextToken(stream));
|
| +    } while (outTokens.last().type() != EOFToken);
|
| +}
|
| +
|
| +// Consumes and returns the next token from |input|.
|
| +// |input| is stored in m_input so the consume/peek helpers act on it.
|
| +// Returns an EOFToken when the end-of-file marker is consumed; any character
|
| +// without dedicated handling is returned as a DelimToken.
|
| +CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input)
|
| +{
|
| +    // Unlike the HTMLTokenizer, the CSS Syntax spec is written
|
| +    // as a stateless, (fixed-size) look-ahead tokenizer.
|
| +    // We could move to the stateful model and instead create
|
| +    // states for all the "next 3 codepoints are X" cases.
|
| +    // State-machine tokenizers are easier to write to handle
|
| +    // incremental tokenization of partial sources.
|
| +    // However, for now we follow the spec exactly.
|
| +    m_input = &input;
|
| +    UChar cc = consume();
|
| +
|
| +    if (isCSSSpace(cc)) {
|
| +        // CSS Tokenization is currently lossy, but we could record
|
| +        // the exact whitespace instead of discarding it here.
|
| +        consumeUntilNotWhitespace();
|
| +        return CSSToken(WhitespaceToken);
|
| +    }
|
| +    if (cc == '\"' || cc == '\'')
|
| +        return consumeStringTokenUntil(cc);
|
| +    if (cc == '#') {
|
| +        if (nextCharIsNameChar() || nextTwoCharsAreValidEscape()) {
|
| +            HashTokenType hashType = UnrestrictedHashToken;
|
| +            if (nextCharsAreIdentifier())
|
| +                hashType = IdHashToken;
|
| +            return CSSToken(HashToken, consumeName(), hashType);
|
| +        }
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == '$') {
|
| +        if (consumeIfNext('='))
|
| +            return CSSToken(SuffixMatchToken);
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == '(')
|
| +        return CSSToken(LeftParenToken);
|
| +    if (cc == ')')
|
| +        return CSSToken(RightParenToken);
|
| +    if (cc == '*') {
|
| +        if (consumeIfNext('='))
|
| +            return CSSToken(SubstringMatchToken);
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == '+' || cc == '.') {
|
| +        if (nextCharsAreNumber()) {
|
| +            // Put the sign / dot back so consumeNumber() sees the whole number.
|
| +            reconsume(cc);
|
| +            return consumeNumericToken();
|
| +        }
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == ',')
|
| +        return CSSToken(CommaToken);
|
| +    if (cc == '-') {
|
| +        if (nextCharsAreNumber()) {
|
| +            reconsume(cc);
|
| +            return consumeNumericToken();
|
| +        }
|
| +        if (nextCharsAreIdentifier()) {
|
| +            reconsume(cc);
|
| +            return consumeIdentLikeToken();
|
| +        }
|
| +        // The first '-' of "-->" is already consumed; look for the rest.
|
| +        if (consumeIfNext("->"))
|
| +            return CSSToken(CDCToken);
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == '/') {
|
| +        if (consumeIfNext('*')) {
|
| +            // Comments produce no token: skip them and tokenize what follows.
|
| +            consumeThroughCommentEndOrUntilEOF();
|
| +            return nextToken(*m_input);
|
| +        }
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == ':')
|
| +        return CSSToken(ColonToken);
|
| +    if (cc == ';')
|
| +        return CSSToken(SemicolonToken);
|
| +    if (cc == '<') {
|
| +        if (consumeIfNext("!--"))
|
| +            return CSSToken(CDOToken);
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == '@') {
|
| +        if (nextCharsAreIdentifier())
|
| +            return CSSToken(AtKeywordToken, consumeName());
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == '[')
|
| +        return CSSToken(LeftBracketToken);
|
| +    if (cc == '\\') {
|
| +        if (twoCharsAreValidEscape(cc, m_input->currentInputChar())) {
|
| +            reconsume(cc);
|
| +            return consumeIdentLikeToken();
|
| +        }
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == ']')
|
| +        return CSSToken(RightBracketToken);
|
| +    if (cc == '^') {
|
| +        if (consumeIfNext('='))
|
| +            return CSSToken(PrefixMatchToken);
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == '{')
|
| +        return CSSToken(LeftBraceToken);
|
| +    // The closing brace must be matched against '}', not '{'.
|
| +    if (cc == '}')
|
| +        return CSSToken(RightBraceToken);
|
| +    if (isASCIIDigit(cc)) {
|
| +        // "reconsume" here is not according to spec, but required AFAICT.
|
| +        // https://www.w3.org/Bugs/Public/show_bug.cgi?id=24661
|
| +        reconsume(cc);
|
| +        return consumeNumericToken();
|
| +    }
|
| +    // if (cc == 'U' || cc == 'u') {
|
| +    //     // U+0055 LATIN CAPITAL LETTER U (U)
|
| +    //     // U+0075 LATIN SMALL LETTER U (u)
|
| +    //     // If the next 2 input code points are U+002B PLUS SIGN (+) followed by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. Note: don’t consume both of them. Consume a unicode-range token and return it.
|
| +    //     // Otherwise, reconsume the current input code point, consume an ident-like token, and return it.
|
| +    //     reconsume(cc);
|
| +    //     return consumeIdentLikeToken();
|
| +    // }
|
| +    if (isNameStart(cc)) {
|
| +        reconsume(cc);
|
| +        return consumeIdentLikeToken();
|
| +    }
|
| +    if (cc == '|') {
|
| +        if (consumeIfNext('='))
|
| +            return CSSToken(DashMatchToken);
|
| +        if (consumeIfNext('|'))
|
| +            return CSSToken(ColumnToken);
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == '~') {
|
| +        if (consumeIfNext('='))
|
| +            return CSSToken(IncludeMatchToken);
|
| +        return CSSToken(DelimToken, cc);
|
| +    }
|
| +    if (cc == kEndOfFileMarker)
|
| +        return CSSToken(EOFToken);
|
| +    return CSSToken(DelimToken, cc);
|
| +}
|
| +
|
| +// Consumes a number from the input and returns it as a NumberToken.
|
| +// This method merges the following spec sections for efficiency
|
| +// http://www.w3.org/TR/css3-syntax/#consume-a-number
|
| +// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
|
| +// The extent of the number is first measured with look-ahead only; the input
|
| +// is advanced once, at the end, by exactly the characters that belong to it.
|
| +// Callers must ensure the input starts with a number (see nextCharsAreNumber()).
|
| +CSSToken NewCSSTokenizer::consumeNumber()
|
| +{
|
| +    ASSERT(nextCharsAreNumber());
|
| +    // FIXME - repr should get the value as a string, even though I'm not sure it's useful
|
| +    String repr;
|
| +    // FIXME - Always returning an Integer type. Need to look at fractions, etc.
|
| +    NumericValueType type = IntegerValueType;
|
| +    int sign = 1;
|
| +    unsigned peekOffset = 0;
|
| +
|
| +    // Optional leading sign.
|
| +    if (m_input->currentInputChar() == '+') {
|
| +        ++peekOffset;
|
| +    } else if (m_input->peek(peekOffset) == '-') {
|
| +        sign = -1;
|
| +        ++peekOffset;
|
| +    }
|
| +
|
| +    // Integer digits (possibly none, e.g. ".5").
|
| +    unsigned intStartPos = peekOffset;
|
| +    peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset);
|
| +    unsigned intEndPos = peekOffset;
|
| +
|
| +    // Fraction: only when '.' is directly followed by a digit. The look-ahead
|
| +    // must not move peekOffset, otherwise a '.' that is not part of the number
|
| +    // would be swallowed by the final advance().
|
| +    unsigned fractionStartPos = 0;
|
| +    unsigned fractionEndPos = 0;
|
| +    if (m_input->peek(peekOffset) == '.' && isASCIIDigit(m_input->peek(peekOffset + 1))) {
|
| +        fractionStartPos = peekOffset + 1;
|
| +        peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(fractionStartPos);
|
| +        fractionEndPos = peekOffset;
|
| +    }
|
| +
|
| +    // Exponent: 'e' or 'E', an optional sign, and at least one digit. Without
|
| +    // a digit the 'e' is not an exponent (e.g. the unit in "5em") and must be
|
| +    // left in the input.
|
| +    int exponentSign = 1;
|
| +    unsigned exponentStartPos = 0;
|
| +    unsigned exponentEndPos = 0;
|
| +    UChar exponentMarker = m_input->peek(peekOffset);
|
| +    if (exponentMarker == 'E' || exponentMarker == 'e') {
|
| +        unsigned digitsPos = peekOffset + 1;
|
| +        int candidateSign = 1;
|
| +        UChar afterMarker = m_input->peek(digitsPos);
|
| +        if (afterMarker == '+') {
|
| +            ++digitsPos;
|
| +        } else if (afterMarker == '-') {
|
| +            candidateSign = -1;
|
| +            ++digitsPos;
|
| +        }
|
| +        if (isASCIIDigit(m_input->peek(digitsPos))) {
|
| +            exponentSign = candidateSign;
|
| +            exponentStartPos = digitsPos;
|
| +            peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(digitsPos);
|
| +            exponentEndPos = peekOffset;
|
| +        }
|
| +    }
|
| +
|
| +    unsigned integerPart = m_input->getUnsignedInt(intStartPos, intEndPos);
|
| +    unsigned fractionPart = m_input->getUnsignedInt(fractionStartPos, fractionEndPos);
|
| +    unsigned fractionDigits = fractionEndPos - fractionStartPos;
|
| +    unsigned exponentPart = m_input->getUnsignedInt(exponentStartPos, exponentEndPos);
|
| +
|
| +    // Convert to double before applying a sign: multiplying an unsigned value
|
| +    // by a negative int is evaluated in unsigned arithmetic and wraps around.
|
| +    double fractionScale = pow(10.0, -static_cast<double>(fractionDigits));
|
| +    double exponentScale = pow(10.0, exponentSign * static_cast<double>(exponentPart));
|
| +    double value = sign * (integerPart + fractionPart * fractionScale) * exponentScale;
|
| +
|
| +    m_input->advance(peekOffset);
|
| +    return CSSToken(NumberToken, repr, value, type);
|
| +}
|
| +
|
| +// Consumes a number and then turns it into a dimension (when an identifier
|
| +// follows) or a percentage (when '%' follows); otherwise returns it as is.
|
| +// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
|
| +CSSToken NewCSSTokenizer::consumeNumericToken()
|
| +{
|
| +    CSSToken numericToken = consumeNumber();
|
| +    if (nextCharsAreIdentifier()) {
|
| +        numericToken.convertToDimensionWithUnit(consumeName());
|
| +        return numericToken;
|
| +    }
|
| +    if (consumeIfNext('%'))
|
| +        numericToken.convertToPercentage();
|
| +    return numericToken;
|
| +}
|
| +
|
| +// Consumes a name and classifies it: a plain identifier, a function (name
|
| +// followed by '('), or a URL token when that function is named "url"
|
| +// (compared case-insensitively).
|
| +// http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token
|
| +CSSToken NewCSSTokenizer::consumeIdentLikeToken()
|
| +{
|
| +    String identName = consumeName();
|
| +    if (!consumeIfNext('('))
|
| +        return CSSToken(IdentToken, identName);
|
| +    if (equalIgnoringCase(identName, "url"))
|
| +        return consumeURLToken();
|
| +    return CSSToken(FunctionToken, identName);
|
| +}
|
| +
|
| +// Placeholder for string-token consumption.
|
| +// As written it ignores |endingCodePoint|, reads nothing from the input and
|
| +// always returns a BadStringToken.
|
| +// http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
|
| +CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
|
| +{
|
| + // FIXME: Implement.
|
| + return CSSToken(BadStringToken);
|
| +}
|
| +
|
| +// Placeholder for URL-token consumption.
|
| +// As written it reads nothing from the input and always returns a BadURLToken.
|
| +// http://www.w3.org/TR/css3-syntax/#consume-a-url-token
|
| +CSSToken NewCSSTokenizer::consumeURLToken()
|
| +{
|
| + // FIXME: Implement.
|
| + return CSSToken(BadURLToken);
|
| +}
|
| +
|
| +// Consumes input for as long as the current character is CSS whitespace.
|
| +// Uses isCSSSpace(), the same predicate nextToken() uses to detect the start
|
| +// of a whitespace run, so that one run always collapses into a single token.
|
| +void NewCSSTokenizer::consumeUntilNotWhitespace()
|
| +{
|
| +    while (isCSSSpace(m_input->currentInputChar()))
|
| +        consume();
|
| +}
|
| +
|
| +// Skips the body of a comment whose opening "/*" has already been consumed.
|
| +// Consumes up to and including the closing "*/". If the input ends first,
|
| +// stops with the end-of-file marker still unconsumed so the caller can turn
|
| +// it into an EOFToken.
|
| +void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF()
|
| +{
|
| +    while (m_input->currentInputChar() != kEndOfFileMarker) {
|
| +        UChar c = consume();
|
| +        if (c == '*' && m_input->currentInputChar() == '/') {
|
| +            consume();
|
| +            return;
|
| +        }
|
| +    }
|
| +}
|
| +
|
| +// If the current input character equals |character|, consumes it and returns
|
| +// true. Otherwise leaves the input untouched and returns false.
|
| +bool NewCSSTokenizer::consumeIfNext(UChar character)
|
| +{
|
| +    if (m_input->currentInputChar() != character)
|
| +        return false;
|
| +    // A match must actually be consumed; otherwise the matched character would
|
| +    // be tokenized a second time by the next nextToken() call.
|
| +    m_input->advance();
|
| +    return true;
|
| +}
|
| +
|
| +// If the upcoming input matches |str| character for character, consumes all
|
| +// of it and returns true. Otherwise leaves the input untouched and returns
|
| +// false. An empty |str| matches trivially and consumes nothing.
|
| +bool NewCSSTokenizer::consumeIfNext(String str)
|
| +{
|
| +    for (unsigned i = 0; i < str.length(); ++i) {
|
| +        if (str[i] != m_input->peek(i))
|
| +            return false;
|
| +    }
|
| +    // The comparison above only peeks; skip over the matched characters.
|
| +    m_input->advance(str.length());
|
| +    return true;
|
| +}
|
| +
|
| +// Consumes the longest run of name characters and escape sequences at the
|
| +// current position and returns it (an empty string if there is none).
|
| +// http://www.w3.org/TR/css3-syntax/#consume-a-name
|
| +String NewCSSTokenizer::consumeName()
|
| +{
|
| +    // FIXME: This is written to match the spec
|
| +    // but could be much more efficient.
|
| +    String name("");
|
| +    for (;;) {
|
| +        if (isNameChar(m_input->currentInputChar())) {
|
| +            name.append(consume());
|
| +        } else if (nextTwoCharsAreValidEscape()) {
|
| +            consume(); // SPEC BUG: Emailed Tab.
|
| +            name.append(consumeEscape());
|
| +        } else {
|
| +            break;
|
| +        }
|
| +    }
|
| +    return name;
|
| +}
|
| +
|
| +// Consumes an escape sequence whose leading backslash has already been
|
| +// consumed, and returns the character it denotes.
|
| +// - Hex escape: one to six hex digits, optionally followed by a single
|
| +//   whitespace character that is consumed as part of the escape. Yields the
|
| +//   replacement character for zero, for surrogates, and for values that do
|
| +//   not fit in the UChar return type.
|
| +// - End of input: yields the replacement character.
|
| +// - Anything else: yields the consumed character itself.
|
| +// http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
|
| +UChar NewCSSTokenizer::consumeEscape()
|
| +{
|
| +    UChar cc = consume();
|
| +    ASSERT(cc != '\n');
|
| +    if (isASCIIHexDigit(cc)) {
|
| +        String hexChars;
|
| +        hexChars.append(cc);
|
| +        unsigned consumedHexDigits = 1;
|
| +        // Only consume a character after checking that it is a hex digit, so
|
| +        // whatever follows the escape stays in the input.
|
| +        while (consumedHexDigits < 6 && isASCIIHexDigit(m_input->currentInputChar())) {
|
| +            hexChars.append(consume());
|
| +            ++consumedHexDigits;
|
| +        }
|
| +        // One whitespace character terminating a hex escape belongs to it.
|
| +        if (isCSSSpace(m_input->currentInputChar()))
|
| +            consume();
|
| +        bool ok = false;
|
| +        unsigned codePoint = hexChars.toUIntStrict(&ok, 16);
|
| +        if (!ok || !codePoint || codePoint > 0xFFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF))
|
| +            return WTF::Unicode::replacementCharacter;
|
| +        return static_cast<UChar>(codePoint);
|
| +    }
|
| +    if (cc == kEndOfFileMarker)
|
| +        return WTF::Unicode::replacementCharacter;
|
| +    return cc;
|
| +}
|
| +
|
| +// Returns true when the stream's current input character is a name
|
| +// character (see isNameChar()). Does not consume anything.
|
| +bool NewCSSTokenizer::nextCharIsNameChar()
|
| +{
|
| + return isNameChar(m_input->currentInputChar());
|
| +}
|
| +
|
| +// Returns true when the next two unconsumed characters start a valid escape.
|
| +// The first of them is the stream's current input character and the second is
|
| +// peek(1), the same convention nextCharsAreNumber() uses. Callers such as
|
| +// consumeName() rely on the backslash being the very next character, because
|
| +// they consume exactly one character before calling consumeEscape().
|
| +bool NewCSSTokenizer::nextTwoCharsAreValidEscape()
|
| +{
|
| +    return twoCharsAreValidEscape(m_input->currentInputChar(), m_input->peek(1));
|
| +}
|
| +
|
| +// Look-ahead check: does the input at the current position start a number?
|
| +// Accepts a digit, a '.' followed by a digit, or a sign followed by either a
|
| +// digit or ".digit". Does not consume anything.
|
| +// http://www.w3.org/TR/css3-syntax/#starts-with-a-number
|
| +bool NewCSSTokenizer::nextCharsAreNumber()
|
| +{
|
| +    const UChar lead = m_input->currentInputChar();
|
| +    const UChar following = m_input->peek(1);
|
| +    if (isASCIIDigit(lead))
|
| +        return true;
|
| +    if (lead == '.')
|
| +        return isASCIIDigit(following);
|
| +    if (lead != '+' && lead != '-')
|
| +        return false;
|
| +    if (isASCIIDigit(following))
|
| +        return true;
|
| +    return following == '.' && isASCIIDigit(m_input->peek(2));
|
| +}
|
| +
|
| +// Look-ahead check: would the input at the current position start an
|
| +// identifier? True for a name-start character, for a valid escape (as judged
|
| +// by nextTwoCharsAreValidEscape()), or for a '-' followed by a name-start
|
| +// character or a valid escape. Does not consume anything.
|
| +// http://www.w3.org/TR/css3-syntax/#would-start-an-identifier
|
| +bool NewCSSTokenizer::nextCharsAreIdentifier()
|
| +{
|
| +    const UChar lead = m_input->currentInputChar();
|
| +    if (isNameStart(lead))
|
| +        return true;
|
| +    if (nextTwoCharsAreValidEscape())
|
| +        return true;
|
| +    if (lead != '-')
|
| +        return false;
|
| +
|
| +    return isNameStart(m_input->peek(1)) || twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2));
|
| +}
|
| +
|
| +} // namespace WebCore
|
|
|