Chromium Code Reviews| Index: Source/core/css/parser/NewCSSTokenizer.cpp |
| diff --git a/Source/core/css/parser/NewCSSTokenizer.cpp b/Source/core/css/parser/NewCSSTokenizer.cpp |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..e2c595ba7d3176ba7c3ca0929e45ef37ce4c819c |
| --- /dev/null |
| +++ b/Source/core/css/parser/NewCSSTokenizer.cpp |
| @@ -0,0 +1,389 @@ |
| +/* |
| + * Copyright (C) 2013 Google Inc. All rights reserved. |
| + * |
| + * Redistribution and use in source and binary forms, with or without |
| + * modification, are permitted provided that the following conditions are |
| + * met: |
| + * |
| + * * Redistributions of source code must retain the above copyright |
| + * notice, this list of conditions and the following disclaimer. |
| + * * Redistributions in binary form must reproduce the above |
| + * copyright notice, this list of conditions and the following disclaimer |
| + * in the documentation and/or other materials provided with the |
| + * distribution. |
| + * * Neither the name of Google Inc. nor the names of its |
| + * contributors may be used to endorse or promote products derived from |
| + * this software without specific prior written permission. |
| + * |
| + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| + */ |
| + |
| +#include "config.h" |
| +#include "core/css/parser/NewCSSTokenizer.h" |
| + |
| +#include "core/css/parser/CSSParserIdioms.h" |
| +#include "platform/text/SegmentedString.h" |
| +#include "wtf/TemporaryChange.h" |
| +#include "wtf/unicode/CharacterNames.h" |
| + |
| +namespace WebCore { |
| + |
| +// Wraps |input| for tokenization: reading starts at offset 0, and |
| +// kEndOfFileMarker is appended to the stored copy so that the tokenizer can |
| +// recognize the end of the input. |
| +CSSInputStream::CSSInputStream(String input) |
| + : m_offset(0) |
| + , m_string(input) |
| +{ |
| + m_string.append(kEndOfFileMarker); |
|
abarth-chromium
2014/01/01 18:47:51
Hum... String::append is monstrously slow...
|
| +} |
| + |
| +// Returns the character at the current read offset without advancing. |
| +// The offset must lie inside the buffer (asserted). |
| +UChar CSSInputStream::currentInputChar() |
| +{ |
| + ASSERT(m_offset < m_string.length()); |
| + return m_string[m_offset]; |
| +} |
| + |
| +// Returns the character one position past the current read offset, without |
| +// advancing. No bounds assertion is made here. |
| +// NOTE(review): presumably String::operator[] tolerates an out-of-range index |
| +// near the end of the input - confirm. |
| +UChar CSSInputStream::nextInputChar() |
| +{ |
| + return m_string[m_offset + 1]; |
| +} |
| + |
| +// Returns the character two positions past the current read offset, without |
| +// advancing. No bounds assertion is made here. |
| +UChar CSSInputStream::peek2() |
| +{ |
| + return m_string[m_offset + 2]; |
| +} |
| + |
| +// Returns the character three positions past the current read offset, without |
| +// advancing. No bounds assertion is made here. |
| +UChar CSSInputStream::peek3() |
| +{ |
| + return m_string[m_offset + 3]; |
| +} |
| + |
| +// Moves the read offset forward by one character. |
| +void CSSInputStream::advance() |
| +{ |
| + m_offset++; |
| +} |
| + |
| +// Steps the read offset back by one character. |cc| must be the character |
| +// found at the new offset; it is only used to verify that in debug builds. |
| +void CSSInputStream::pushBack(UChar cc) |
| +{ |
| +    // Guard against stepping back past the start of the buffer. |
| +    ASSERT(m_offset > 0); |
| +    m_offset--; |
| +    // ASSERT_UNUSED keeps |cc| referenced when assertions are compiled out. |
| +    ASSERT_UNUSED(cc, currentInputChar() == cc); |
| +} |
| + |
| +// http://dev.w3.org/csswg/css-syntax/#name-start-code-point |
| +// True for an ASCII letter, an underscore, or any non-ASCII character. |
| +static bool isNameStart(UChar c) |
| +{ |
| +    return isASCIIAlpha(c) || c == '_' || !isASCII(c); |
| +} |
| + |
| +// http://www.w3.org/TR/css-syntax-3/#name-code-point |
| +// True for every name-start character plus ASCII digits and the hyphen. |
| +static bool isNameChar(UChar c) |
| +{ |
| +    if (isASCIIDigit(c) || c == '-') |
| +        return true; |
| +    return isNameStart(c); |
| +} |
| + |
| +// The tokenizer is only bound to a stream by nextToken(); start with a null |
| +// m_input so that it never holds an indeterminate pointer. |
| +NewCSSTokenizer::NewCSSTokenizer() |
| +    : m_input(0) |
| +{ |
| +} |
| + |
| +// Un-consumes |c| by stepping the input stream back one character; the stream |
| +// asserts that the character at the new position equals |c|. |
| +void NewCSSTokenizer::reconsume(UChar c) |
| +{ |
| + m_input->pushBack(c); |
| +} |
| + |
| +// Returns the character at the stream's current position and moves the stream |
| +// one character forward. |
| +UChar NewCSSTokenizer::consume() |
| +{ |
| +    const UChar consumed = m_input->currentInputChar(); |
| +    m_input->advance(); |
| +    return consumed; |
| +} |
| + |
| +// Consumes and returns the next token from |input|. |
| +// |
| +// The first character is consumed unconditionally and then dispatched on; |
| +// anything not recognized falls through to a DelimToken carrying that |
| +// character. Comments produce no token of their own: after a comment has been |
| +// skipped, the token that follows it is returned instead. EOFToken is returned |
| +// when the end-of-file marker is consumed. |
| +// |
| +// |input| is remembered in m_input so the consume*/next* helpers can reach it. |
| +CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input) |
| +{ |
| +    // Unlike the HTMLTokenizer, the CSS Syntax spec is written |
| +    // as a stateless, (fixed-size) look-ahead tokenizer. |
| +    // We could move to the stateful model and instead create |
| +    // states for all the "next 3 codepoints are X" cases. |
| +    // State-machine tokenizers are easier to write to handle |
| +    // incremental tokenization of partial sources. |
| +    // However, for now we follow the spec exactly. |
| +    m_input = &input; |
| +    UChar cc = consume(); |
| + |
| +    if (isCSSSpace(cc)) { |
|
abarth-chromium
2014/01/01 18:47:51
I bet it's faster to implement this if-cascade usi
|
| +        // CSS Tokenization is currently lossy, but we could record |
| +        // the exact whitespace instead of discarding it here. |
| +        consumeUntilNotWhitespace(); |
| +        return CSSToken(WhitespaceToken); |
| +    } |
| +    if (cc == '\"' || cc == '\'') |
| +        return consumeStringTokenUntil(cc); |
| +    if (cc == '#') { |
| +        if (nextCharIsName() || nextTwoCharsAreValidEscape()) { |
| +            HashTokenType hashType = UnrestrictedHashToken; |
| +            if (nextCharsAreIdentifier()) |
| +                hashType = IdHashToken; |
| +            return CSSToken(HashToken, consumeName(), hashType); |
| +        } |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == '$') { |
| +        if (consumeIfNext('=')) |
| +            return CSSToken(SuffixMatchToken); |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == '(') |
| +        return CSSToken(LeftParenToken); |
| +    if (cc == ')') |
| +        return CSSToken(RightParenToken); |
| +    if (cc == '*') { |
| +        if (consumeIfNext('=')) |
| +            return CSSToken(SubstringMatchToken); |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == '+' || cc == '.') { |
| +        if (nextCharsAreNumber()) { |
| +            reconsume(cc); |
| +            return consumeNumericToken(); |
| +        } |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == ',') |
| +        return CSSToken(CommaToken); |
| +    if (cc == '-') { |
| +        if (nextCharsAreNumber()) { |
| +            reconsume(cc); |
| +            return consumeNumericToken(); |
| +        } |
| +        if (nextCharsAreIdentifier()) { |
| +            reconsume(cc); |
| +            return consumeIdentLikeToken(); |
| +        } |
| +        // The first '-' of "-->" is already consumed; look for the rest. |
| +        if (consumeIfNext("->")) |
| +            return CSSToken(CDCToken); |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == '/') { |
| +        if (consumeIfNext('*')) { |
| +            // A comment yields no token; tokenize whatever follows it. |
| +            consumeThroughCommentEndOrUntilEOF(); |
| +            return nextToken(*m_input); |
| +        } |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == ':') |
| +        return CSSToken(ColonToken); |
| +    if (cc == ';') |
| +        return CSSToken(SemicolonToken); |
| +    if (cc == '<') { |
| +        if (consumeIfNext("!--")) |
| +            return CSSToken(CDOToken); |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == '@') { |
| +        if (nextCharsAreIdentifier()) |
| +            return CSSToken(AtKeywordToken, consumeName()); |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == '[') |
| +        return CSSToken(LeftBracketToken); |
| +    if (cc == '\\') { |
| +        if (nextIsValidEscape()) { |
| +            reconsume(cc); |
| +            return consumeIdentLikeToken(); |
| +        } |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == ']') |
| +        return CSSToken(RightBracketToken); |
| +    if (cc == '^') { |
| +        if (consumeIfNext('=')) |
| +            return CSSToken(PrefixMatchToken); |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == '{') |
| +        return CSSToken(LeftBraceToken); |
| +    // This used to test '{' a second time, so a closing brace could never |
| +    // produce RightBraceToken. |
| +    if (cc == '}') |
| +        return CSSToken(RightBraceToken); |
| +    // FIXME: The spec reconsumes cc before consuming a numeric token; do that |
| +    // once consumeNumber() actually consumes input. |
| +    if (isASCIIDigit(cc)) |
| +        return consumeNumericToken(); |
| +    // if (cc == 'U' || cc == 'u') { |
| +    // // U+0055 LATIN CAPITAL LETTER U (U) |
| +    // // U+0075 LATIN SMALL LETTER U (u) |
| +    // // If the next 2 input code points are U+002B PLUS SIGN (+) followed by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. Note: don’t consume both of them. Consume a unicode-range token and return it. |
| +    // // Otherwise, reconsume the current input code point, consume an ident-like token, and return it. |
| +    // reconsume(cc); |
| +    // return consumeIdentLikeToken(); |
| +    // } |
| +    if (isNameStart(cc)) { |
| +        reconsume(cc); |
| +        return consumeIdentLikeToken(); |
| +    } |
| +    if (cc == '|') { |
| +        if (consumeIfNext('=')) |
| +            return CSSToken(DashMatchToken); |
| +        if (consumeIfNext('|')) |
| +            return CSSToken(ColumnToken); |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == '~') { |
| +        if (consumeIfNext('=')) |
| +            return CSSToken(IncludeMatchToken); |
| +        return CSSToken(DelimToken, cc); |
| +    } |
| +    if (cc == kEndOfFileMarker) |
| +        return CSSToken(EOFToken); |
| +    return CSSToken(DelimToken, cc); |
| +} |
| + |
| +// Placeholder: consumes no input and returns a NumberToken with an empty |
| +// representation, value 0 and IntegerValueType. |
| +// NOTE: nextCharsAreNumber() is itself still a stub that returns false, so the |
| +// assertion below cannot hold in debug builds yet. |
| +CSSToken NewCSSTokenizer::consumeNumber() |
| +{ |
| + ASSERT(nextCharsAreNumber()); |
| + String repr; |
| + NumericValueType type = IntegerValueType; |
| + double value = 0; |
| + |
| + // FIXME: Needs implementation. |
| + // http://dev.w3.org/csswg/css-syntax/#consume-a-number0 |
| + return CSSToken(NumberToken, repr, value, type); |
| +} |
| + |
| +// Consumes a number and then classifies it by what follows: an identifier |
| +// turns it into a dimension with that unit, a '%' turns it into a percentage, |
| +// and anything else leaves it a plain number. |
| +CSSToken NewCSSTokenizer::consumeNumericToken() |
| +{ |
| +    CSSToken numeric = consumeNumber(); |
| +    if (nextCharsAreIdentifier()) { |
| +        numeric.convertToDimensionWithUnit(consumeName()); |
| +        return numeric; |
| +    } |
| +    if (consumeIfNext("%")) |
| +        numeric.convertToPercentage(); |
| +    return numeric; |
| +} |
| + |
| +// Consumes a name and decides what it introduces: a name directly followed by |
| +// '(' is a function (or a URL token when the name is "url", compared |
| +// case-insensitively); any other name is a plain identifier. |
| +CSSToken NewCSSTokenizer::consumeIdentLikeToken() |
| +{ |
| +    String identifier = consumeName(); |
| +    if (!consumeIfNext('(')) |
| +        return CSSToken(IdentToken, identifier); |
| +    if (equalIgnoringCase(identifier, "url")) |
| +        return consumeURLToken(); |
| +    return CSSToken(FunctionToken, identifier); |
| +} |
| + |
| +// Placeholder: consumes no input, ignores |endingCodePoint| and always returns |
| +// BadStringToken. |
| +CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint) |
| +{ |
| + // FIXME: Implement. |
| + // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token |
| + return CSSToken(BadStringToken); |
| +} |
| + |
| +// Placeholder: consumes no input and always returns BadURLToken. |
| +// FIXME: Implement. |
| +// http://dev.w3.org/csswg/css-syntax/#consume-a-url-token |
| +CSSToken NewCSSTokenizer::consumeURLToken() |
| +{ |
| + return CSSToken(BadURLToken); |
| +} |
| + |
| +// Skips the run of CSS whitespace at the current position, leaving the stream |
| +// on the first character that is not whitespace. Without this, every |
| +// whitespace character would be reported as its own WhitespaceToken. |
| +void NewCSSTokenizer::consumeUntilNotWhitespace() |
| +{ |
| +    // The end-of-file marker ends the run as long as isCSSSpace() rejects it, |
| +    // which nextToken() already depends on in order to reach its EOF check. |
| +    while (isCSSSpace(m_input->currentInputChar())) |
| +        m_input->advance(); |
| +} |
| + |
| +// Skips the body of a comment whose opening "/*" has already been consumed. |
| +// Stops just after the closing "*/", or on the end-of-file marker when the |
| +// comment is never closed. |
| +void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF() |
| +{ |
| +    for (;;) { |
| +        UChar cc = m_input->currentInputChar(); |
| +        // Leave the marker unconsumed so that the caller still sees EOF. |
| +        if (cc == kEndOfFileMarker) |
| +            return; |
| +        m_input->advance(); |
| +        if (cc == '*' && m_input->currentInputChar() == '/') { |
| +            m_input->advance(); |
| +            return; |
| +        } |
| +    } |
| +} |
| + |
| +// Consumes the next character only if it equals |expected|. Returns true when |
| +// the character was consumed; otherwise the stream is left untouched. |
| +bool NewCSSTokenizer::consumeIfNext(UChar expected) |
| +{ |
| +    if (m_input->currentInputChar() != expected) |
| +        return false; |
| +    m_input->advance(); |
| +    return true; |
| +} |
| + |
| +// Consumes the upcoming characters only if they spell out |expected| exactly. |
| +// Returns true when the whole string was consumed; on a mismatch the stream is |
| +// rewound to where it started and false is returned. |
| +bool NewCSSTokenizer::consumeIfNext(String expected) |
| +{ |
| +    const unsigned length = expected.length(); |
| +    unsigned matched = 0; |
| +    // Only advance over characters that match, so the stream can never move |
| +    // past the end-of-file marker unless |expected| itself contains it. |
| +    while (matched < length && m_input->currentInputChar() == expected[matched]) { |
| +        m_input->advance(); |
| +        ++matched; |
| +    } |
| +    if (matched == length) |
| +        return true; |
| +    // Partial match: give back what was taken, last character first. |
| +    while (matched) { |
| +        --matched; |
| +        m_input->pushBack(expected[matched]); |
| +    } |
| +    return false; |
| +} |
| + |
| +// Consumes the longest run of name characters and escapes at the current |
| +// position and returns it with the escapes decoded. The result is empty when |
| +// the input does not start with either. |
| +String NewCSSTokenizer::consumeName() |
| +{ |
| +    // FIXME: This is written to match the spec |
| +    // but could be much more efficient. |
| +    String name(""); |
| +    for (;;) { |
| +        if (isNameChar(m_input->currentInputChar())) { |
| +            name.append(consume()); |
|
abarth-chromium
2014/01/01 18:47:51
Please use StringBuilder rather than String. Stri
|
| +        } else if (nextTwoCharsAreValidEscape()) { |
| +            consume(); // SPEC BUG: Emailed Tab. |
| +            name.append(consumeEscape()); |
| +        } else { |
| +            break; |
| +        } |
| +    } |
| +    return name; |
| +} |
| + |
| +// http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point |
| +// Consumes the body of an escape whose leading backslash the caller has |
| +// already consumed. A hex escape is one to six hex digits optionally followed |
| +// by a single whitespace character; any other character stands for itself. |
| +// Returns the escaped character, or U+FFFD when the escape does not name a |
| +// code point that can be returned. |
| +UChar NewCSSTokenizer::consumeEscape() |
| +{ |
| +    UChar cc = consume(); |
| +    ASSERT(cc != '\n'); |
| +    if (isASCIIHexDigit(cc)) { |
| +        // Collect up to six digits in total. Peek before consuming so that the |
| +        // character ending the run stays in the stream instead of being lost. |
| +        unsigned consumedHexDigits = 1; |
| +        String hexChars; |
|
abarth-chromium
2014/01/01 18:47:51
StringBuilder
|
| +        hexChars.append(cc); |
| +        while (consumedHexDigits < 6 && isASCIIHexDigit(m_input->currentInputChar())) { |
| +            hexChars.append(consume()); |
| +            consumedHexDigits++; |
| +        } |
|
abarth-chromium
2014/01/01 18:47:51
You can reserve capacity 6 in the StringBuilder to
|
| +        // One whitespace character directly after the digits is part of the |
| +        // escape and is swallowed with it. |
| +        if (isCSSSpace(m_input->currentInputChar())) |
| +            m_input->advance(); |
| +        bool ok = false; |
| +        unsigned codePoint = hexChars.toUIntStrict(&ok, 16); |
|
abarth-chromium
2014/01/01 18:47:51
Oh, actually, you don't need to malloc at all in t
|
| +        // Zero and surrogates are not valid escaped code points. Values above |
| +        // 0xFFFF cannot be returned as a single UChar, so they are replaced |
| +        // rather than silently truncated. |
| +        // FIXME: Return a wider type to support supplementary code points. |
| +        if (!ok || !codePoint || codePoint > 0xFFFF || (codePoint >= 0xD800 && codePoint <= 0xDFFF)) |
| +            return WTF::Unicode::replacementCharacter; |
| +        return static_cast<UChar>(codePoint); |
| +    } |
| +    if (cc == kEndOfFileMarker) { |
| +        // Put the marker back so that nextToken() can still report EOF. |
| +        reconsume(cc); |
| +        return WTF::Unicode::replacementCharacter; |
| +    } |
| +    return cc; |
| +} |
| + |
| +// Placeholder: always reports false, so nextToken() currently turns every |
| +// backslash into a DelimToken. |
| +// FIXME: Implement. |
| +bool NewCSSTokenizer::nextIsValidEscape() |
| +{ |
| + return false; |
| +} |
| + |
| +// Reports whether the next unconsumed character is a name character. Nothing |
| +// is consumed. nextToken() uses this to decide whether '#' starts a hash token. |
| +bool NewCSSTokenizer::nextCharIsName() |
| +{ |
| +    return isNameChar(m_input->currentInputChar()); |
| +} |
| + |
| +// http://www.w3.org/TR/css-syntax-3/#starts-with-a-valid-escape |
| +// Reports whether the next two unconsumed characters form a valid escape: a |
| +// backslash followed by anything other than a newline or the end of the |
| +// input. Nothing is consumed. |
| +bool NewCSSTokenizer::nextTwoCharsAreValidEscape() |
| +{ |
| +    // consume() leaves the stream positioned on the next unconsumed character, |
| +    // so the pair to inspect is currentInputChar()/nextInputChar(). Looking one |
| +    // character further ahead would skip the character a caller is about to |
| +    // consume. |
| +    if (m_input->currentInputChar() != '\\') |
| +        return false; |
| +    const UChar escaped = m_input->nextInputChar(); |
| +    return escaped != '\n' && escaped != kEndOfFileMarker; |
| +} |
| + |
| +// Placeholder: always reports false, so the '+', '.' and '-' branches of |
| +// nextToken() never start a numeric token. |
| +// FIXME: Implement. |
| +// http://dev.w3.org/csswg/css-syntax/#starts-with-a-number |
| +bool NewCSSTokenizer::nextCharsAreNumber() |
| +{ |
| + return false; |
| +} |
| + |
| +// Placeholder: always reports false, so callers never see an identifier start |
| +// (no id hash tokens, at-keywords or dimensions are produced yet). |
| +// FIXME: Implement. |
| +// http://dev.w3.org/csswg/css-syntax/#would-start-an-identifier |
| +bool NewCSSTokenizer::nextCharsAreIdentifier() |
| +{ |
| + return false; |
| +} |
| + |
| +} // namespace WebCore |