Index: Source/core/css/parser/NewCSSTokenizer.cpp |
diff --git a/Source/core/css/parser/NewCSSTokenizer.cpp b/Source/core/css/parser/NewCSSTokenizer.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..d053944c3b92d67a87ac6cadbed86cfca4920a64 |
--- /dev/null |
+++ b/Source/core/css/parser/NewCSSTokenizer.cpp |
@@ -0,0 +1,437 @@ |
+/* |
+ * Copyright (C) 2013 Google Inc. All rights reserved. |
+ * |
+ * Redistribution and use in source and binary forms, with or without |
+ * modification, are permitted provided that the following conditions are |
+ * met: |
+ * |
+ * * Redistributions of source code must retain the above copyright |
+ * notice, this list of conditions and the following disclaimer. |
+ * * Redistributions in binary form must reproduce the above |
+ * copyright notice, this list of conditions and the following disclaimer |
+ * in the documentation and/or other materials provided with the |
+ * distribution. |
+ * * Neither the name of Google Inc. nor the names of its |
+ * contributors may be used to endorse or promote products derived from |
+ * this software without specific prior written permission. |
+ * |
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
+ */ |
+ |
+#include "config.h" |
+#include "core/css/parser/NewCSSTokenizer.h" |
+ |
+#include "core/css/parser/CSSInputStream.h" |
+#include "core/css/parser/CSSParserIdioms.h" |
+#include "platform/text/SegmentedString.h" |
+#include "wtf/TemporaryChange.h" |
+#include "wtf/unicode/CharacterNames.h" |
+ |
+namespace WebCore { |
+ |
+// Returns true when c may begin a CSS name: an ASCII letter, an underscore, |
+// or any non-ASCII code unit. |
+// http://dev.w3.org/csswg/css-syntax/#name-start-code-point |
+static bool isNameStart(UChar c) |
+{ |
+    return isASCIIAlpha(c) || c == '_' || !isASCII(c); |
+} |
+ |
+// Returns true when c may appear inside a CSS name: a hyphen, an ASCII digit, |
+// or anything accepted by isNameStart(). |
+// http://www.w3.org/TR/css-syntax-3/#name-code-point |
+static bool isNameChar(UChar c) |
+{ |
+    if (c == '-' || isASCIIDigit(c)) |
+        return true; |
+    return isNameStart(c); |
+} |
+ |
+// Returns true when the pair (first, second) begins a valid escape: first must |
+// be a backslash and second must be neither a newline nor the end-of-file |
+// marker. |
+// http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape |
+static bool twoCharsAreValidEscape(UChar first, UChar second) |
+{ |
+    if (first != '\\') |
+        return false; |
+    return second != '\n' && second != kEndOfFileMarker; |
+} |
+ |
+// Creates a tokenizer that is not yet attached to any input. nextToken() |
+// points m_input at the stream it is given before any helper dereferences it; |
+// start from null so the pointer never holds an indeterminate value. |
+NewCSSTokenizer::NewCSSTokenizer() |
+    : m_input(0) |
+{ |
+} |
+ |
+// Hands c back to the current input stream via pushBack(). Callers in this |
+// file use it to return the code point they just consumed so that a helper |
+// can read it again. |
+void NewCSSTokenizer::reconsume(UChar c) |
+{ |
+ m_input->pushBack(c); |
+} |
+ |
+// Reads the stream's current input character, advances the stream past it, |
+// and returns the character that was read. |
+UChar NewCSSTokenizer::consume() |
+{ |
+    const UChar consumed = m_input->currentInputChar(); |
+    m_input->advance(); |
+    return consumed; |
+} |
+ |
+// Tokenizes the whole of string, appending every token to outTokens. The |
+// terminating EOFToken is appended as well; it is always the last token added. |
+void NewCSSTokenizer::tokenize(String string, Vector<CSSToken>& outTokens) |
+{ |
+    CSSInputStream input(string); |
+    NewCSSTokenizer tokenizer; |
+    do { |
+        outTokens.append(tokenizer.nextToken(input)); |
+    } while (outTokens.last().type() != EOFToken); |
+} |
+ |
+// Consumes and returns the next token from input. |
+// http://www.w3.org/TR/css-syntax-3/#consume-a-token |
+// |
+// input is remembered in m_input so the consume/peek helpers can reach it. |
+// A comment produces no token of its own: once it has been skipped, |
+// tokenization resumes with whatever follows. Any code point without a |
+// dedicated rule is returned as a DelimToken, and the end-of-file marker |
+// yields an EOFToken. |
+CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input) |
+{ |
+    // Unlike the HTMLTokenizer, the CSS Syntax spec is written |
+    // as a stateless, (fixed-size) look-ahead tokenizer. |
+    // We could move to the stateful model and instead create |
+    // states for all the "next 3 codepoints are X" cases. |
+    // State-machine tokenizers are easier to write to handle |
+    // incremental tokenization of partial sources. |
+    // However, for now we follow the spec exactly. |
+    m_input = &input; |
+    UChar cc = consume(); |
+ |
+    if (isCSSSpace(cc)) { |
+        // CSS Tokenization is currently lossy, but we could record |
+        // the exact whitespace instead of discarding it here. |
+        consumeUntilNotWhitespace(); |
+        return CSSToken(WhitespaceToken); |
+    } |
+    if (cc == '\"' || cc == '\'') |
+        return consumeStringTokenUntil(cc); |
+    if (cc == '#') { |
+        if (nextCharIsNameChar() || nextTwoCharsAreValidEscape()) { |
+            HashTokenType hashType = UnrestrictedHashToken; |
+            if (nextCharsAreIdentifier()) |
+                hashType = IdHashToken; |
+            return CSSToken(HashToken, consumeName(), hashType); |
+        } |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == '$') { |
+        if (consumeIfNext('=')) |
+            return CSSToken(SuffixMatchToken); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == '(') |
+        return CSSToken(LeftParenToken); |
+    if (cc == ')') |
+        return CSSToken(RightParenToken); |
+    if (cc == '*') { |
+        if (consumeIfNext('=')) |
+            return CSSToken(SubstringMatchToken); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == '+' || cc == '.') { |
+        // "Starts with a number" has to be evaluated with cc as the first |
+        // code point, so put it back before looking ahead. Testing only the |
+        // code points after cc would accept input such as "+-5" or "..5". |
+        reconsume(cc); |
+        if (nextCharsAreNumber()) |
+            return consumeNumericToken(); |
+        consume(); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == ',') |
+        return CSSToken(CommaToken); |
+    if (cc == '-') { |
+        // Same as for '+': the number check must see the '-' itself. |
+        reconsume(cc); |
+        if (nextCharsAreNumber()) |
+            return consumeNumericToken(); |
+        consume(); |
+        if (nextCharsAreIdentifier()) { |
+            reconsume(cc); |
+            return consumeIdentLikeToken(); |
+        } |
+        if (consumeIfNext("->")) |
+            return CSSToken(CDCToken); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == '/') { |
+        if (consumeIfNext('*')) { |
+            consumeThroughCommentEndOrUntilEOF(); |
+            return nextToken(*m_input); |
+        } |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == ':') |
+        return CSSToken(ColonToken); |
+    if (cc == ';') |
+        return CSSToken(SemicolonToken); |
+    if (cc == '<') { |
+        if (consumeIfNext("!--")) |
+            return CSSToken(CDOToken); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == '@') { |
+        if (nextCharsAreIdentifier()) |
+            return CSSToken(AtKeywordToken, consumeName()); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == '[') |
+        return CSSToken(LeftBracketToken); |
+    if (cc == '\\') { |
+        if (twoCharsAreValidEscape(cc, m_input->currentInputChar())) { |
+            reconsume(cc); |
+            return consumeIdentLikeToken(); |
+        } |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == ']') |
+        return CSSToken(RightBracketToken); |
+    if (cc == '^') { |
+        if (consumeIfNext('=')) |
+            return CSSToken(PrefixMatchToken); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == '{') |
+        return CSSToken(LeftBraceToken); |
+    if (cc == '}') |
+        return CSSToken(RightBraceToken); |
+    if (isASCIIDigit(cc)) { |
+        // "reconsume" here is not according to spec, but required AFAICT. |
+        // https://www.w3.org/Bugs/Public/show_bug.cgi?id=24661 |
+        reconsume(cc); |
+        return consumeNumericToken(); |
+    } |
+    // FIXME: unicode-range tokens are not produced yet. The spec rule is: |
+    // if (cc == 'U' || cc == 'u') { |
+    //     // U+0055 LATIN CAPITAL LETTER U (U) |
+    //     // U+0075 LATIN SMALL LETTER U (u) |
+    //     // If the next 2 input code points are U+002B PLUS SIGN (+) followed |
+    //     // by a hex digit or U+003F QUESTION MARK (?), consume the next input |
+    //     // code point. Note: don't consume both of them. Consume a |
+    //     // unicode-range token and return it. |
+    //     // Otherwise, reconsume the current input code point, consume an |
+    //     // ident-like token, and return it. |
+    //     reconsume(cc); |
+    //     return consumeIdentLikeToken(); |
+    // } |
+    if (isNameStart(cc)) { |
+        reconsume(cc); |
+        return consumeIdentLikeToken(); |
+    } |
+    if (cc == '|') { |
+        if (consumeIfNext('=')) |
+            return CSSToken(DashMatchToken); |
+        if (consumeIfNext('|')) |
+            return CSSToken(ColumnToken); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == '~') { |
+        if (consumeIfNext('=')) |
+            return CSSToken(IncludeMatchToken); |
+        return CSSToken(DelimToken, cc); |
+    } |
+    if (cc == kEndOfFileMarker) |
+        return CSSToken(EOFToken); |
+    return CSSToken(DelimToken, cc); |
+} |
+ |
+// Consumes a number from the input and returns it as a NumberToken. |
+// This method merges the following spec sections for efficiency |
+// http://www.w3.org/TR/css3-syntax/#consume-a-number |
+// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number |
+// |
+// The number is first measured by peeking (optional sign, integer digits, |
+// optional ".digits" fraction, optional exponent) and only then consumed with |
+// a single advance(). A '.' or an 'e'/'E' belongs to the number only when |
+// digits follow it, so "5.em" and "5em" leave ".em" / "em" in the stream for |
+// the caller. The caller must have checked nextCharsAreNumber() first. |
+CSSToken NewCSSTokenizer::consumeNumber() |
+{ |
+    ASSERT(nextCharsAreNumber()); |
+    // FIXME - repr should get the value as a string, even though I'm not sure it's useful |
+    String repr; |
+    // FIXME - Always returning an Integer type. Need to look at fractions, etc. |
+    NumericValueType type = IntegerValueType; |
+    int sign = 1; |
+    int exponentSign = 1; |
+    unsigned peekOffset = 0; |
+    unsigned fractionStartPos = 0; |
+    unsigned fractionEndPos = 0; |
+    unsigned exponentStartPos = 0; |
+    unsigned exponentEndPos = 0; |
+ |
+    // Optional leading sign. |
+    const UChar signChar = m_input->currentInputChar(); |
+    if (signChar == '+') { |
+        ++peekOffset; |
+    } else if (signChar == '-') { |
+        sign = -1; |
+        ++peekOffset; |
+    } |
+ |
+    // Integer digits (possibly none, e.g. ".5"). |
+    const unsigned intStartPos = peekOffset; |
+    peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset); |
+    const unsigned intEndPos = peekOffset; |
+ |
+    // Fraction: look one past the '.' without moving peekOffset, so a dot |
+    // that is not followed by a digit stays in the stream. |
+    if (m_input->peek(peekOffset) == '.' && isASCIIDigit(m_input->peek(peekOffset + 1))) { |
+        fractionStartPos = peekOffset + 1; |
+        fractionEndPos = m_input->skipWhilePredicate<isASCIIDigit>(fractionStartPos); |
+        peekOffset = fractionEndPos; |
+    } |
+ |
+    // Exponent: only commit to it once at least one digit has been seen after |
+    // the 'e' and its optional sign; otherwise the 'e' starts a unit. |
+    const UChar exponentMarker = m_input->peek(peekOffset); |
+    if (exponentMarker == 'E' || exponentMarker == 'e') { |
+        unsigned digitsPos = peekOffset + 1; |
+        int candidateSign = 1; |
+        const UChar afterMarker = m_input->peek(digitsPos); |
+        if (afterMarker == '+' || afterMarker == '-') { |
+            if (afterMarker == '-') |
+                candidateSign = -1; |
+            ++digitsPos; |
+        } |
+        if (isASCIIDigit(m_input->peek(digitsPos))) { |
+            exponentSign = candidateSign; |
+            exponentStartPos = digitsPos; |
+            exponentEndPos = m_input->skipWhilePredicate<isASCIIDigit>(digitsPos); |
+            peekOffset = exponentEndPos; |
+        } |
+    } |
+ |
+    const unsigned integerPart = m_input->getUnsignedInt(intStartPos, intEndPos); |
+    const unsigned fractionPart = m_input->getUnsignedInt(fractionStartPos, fractionEndPos); |
+    const unsigned fractionDigits = fractionEndPos - fractionStartPos; |
+    const unsigned exponentPart = m_input->getUnsignedInt(exponentStartPos, exponentEndPos); |
+ |
+    // Convert to double before negating: "-1 * fractionDigits" and |
+    // "exponentSign * exponentPart" on unsigned operands wrap around to huge |
+    // positive values instead of producing a negative exponent. |
+    const double fractionScale = pow(10.0, -static_cast<double>(fractionDigits)); |
+    const double exponentScale = pow(10.0, exponentSign * static_cast<double>(exponentPart)); |
+    const double value = sign * (integerPart + fractionPart * fractionScale) * exponentScale; |
+ |
+    m_input->advance(peekOffset); |
+ |
+    return CSSToken(NumberToken, repr, value, type); |
+} |
+ |
+// Consumes a number and then, depending on what follows it, turns the token |
+// into a dimension (an identifier follows and becomes the unit) or a |
+// percentage (a '%' follows). Otherwise the plain number token is returned. |
+// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token |
+CSSToken NewCSSTokenizer::consumeNumericToken() |
+{ |
+    CSSToken numeric = consumeNumber(); |
+    if (nextCharsAreIdentifier()) { |
+        numeric.convertToDimensionWithUnit(consumeName()); |
+        return numeric; |
+    } |
+    if (consumeIfNext('%')) |
+        numeric.convertToPercentage(); |
+    return numeric; |
+} |
+ |
+// Consumes a name and classifies it: with no '(' after it the result is an |
+// IdentToken; with a '(' it is a FunctionToken, except that the name "url" |
+// (compared case-insensitively) is handed to consumeURLToken() instead. |
+// http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token |
+CSSToken NewCSSTokenizer::consumeIdentLikeToken() |
+{ |
+    String identifier = consumeName(); |
+    if (!consumeIfNext('(')) |
+        return CSSToken(IdentToken, identifier); |
+    if (equalIgnoringCase(identifier, "url")) |
+        return consumeURLToken(); |
+    return CSSToken(FunctionToken, identifier); |
+} |
+ |
+// Placeholder for the spec's "consume a string token" algorithm. |
+// http://dev.w3.org/csswg/css-syntax/#consume-a-string-token |
+// |
+// endingCodePoint is the quote character that opened the string (nextToken() |
+// passes the '"' or '\'' it just consumed). It is currently unused: this stub |
+// reads nothing from the input and always returns a BadStringToken, so the |
+// string's contents are left in the stream for subsequent nextToken() calls. |
+CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint) |
+{ |
+ // FIXME: Implement. |
+ return CSSToken(BadStringToken); |
+} |
+ |
+// Placeholder for the spec's "consume a url token" algorithm. |
+// http://www.w3.org/TR/css3-syntax/#consume-a-url-token |
+// |
+// Called by consumeIdentLikeToken() for the name "url" followed by '('. This |
+// stub reads nothing from the input and always returns a BadURLToken, so the |
+// URL's contents are left in the stream for subsequent nextToken() calls. |
+CSSToken NewCSSTokenizer::consumeURLToken() |
+{ |
+ // FIXME: Implement. |
+ return CSSToken(BadURLToken); |
+} |
+ |
+// Consumes input for as long as the current character is CSS whitespace. |
+// Uses isCSSSpace(), the same predicate nextToken() uses to detect the start |
+// of a whitespace run, so that every character able to start a |
+// WhitespaceToken can also continue it. |
+void NewCSSTokenizer::consumeUntilNotWhitespace() |
+{ |
+    while (isCSSSpace(m_input->currentInputChar())) |
+        consume(); |
+} |
+ |
+// Skips the remainder of a comment whose opening "/*" has already been |
+// consumed. Input is consumed up to and including the closing "*/". If the |
+// input ends first, consumption stops in front of the end-of-file marker so |
+// that the next nextToken() call still reads it and reports EOFToken. |
+void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF() |
+{ |
+    while (m_input->currentInputChar() != kEndOfFileMarker) { |
+        if (consume() == '*' && m_input->currentInputChar() == '/') { |
+            consume(); |
+            return; |
+        } |
+    } |
+} |
+ |
+// If the current input character equals character, consumes it and returns |
+// true. Otherwise leaves the input untouched and returns false. |
+bool NewCSSTokenizer::consumeIfNext(UChar character) |
+{ |
+    if (m_input->currentInputChar() != character) |
+        return false; |
+    // Callers such as the "$=" branch treat a true result as "already |
+    // consumed"; without this the matched character is tokenized again. |
+    consume(); |
+    return true; |
+} |
+ |
+// If the upcoming input matches str character for character, consumes all of |
+// it and returns true. Otherwise leaves the input untouched and returns false. |
+bool NewCSSTokenizer::consumeIfNext(String str) |
+{ |
+    const unsigned length = str.length(); |
+    for (unsigned i = 0; i < length; ++i) { |
+        if (str[i] != m_input->peek(i)) |
+            return false; |
+    } |
+    // Callers such as the "<!--" branch treat a true result as "already |
+    // consumed"; without this the matched text is tokenized again. |
+    m_input->advance(length); |
+    return true; |
+} |
+ |
+// Consumes the longest run of name characters and escapes at the current |
+// position and returns it as a string (empty if there is none). |
+// http://www.w3.org/TR/css3-syntax/#consume-a-name |
+String NewCSSTokenizer::consumeName() |
+{ |
+    // FIXME: This is written to match the spec |
+    // but could be much more efficient. |
+    String name(""); |
+    for (;;) { |
+        if (isNameChar(m_input->currentInputChar())) { |
+            name.append(consume()); |
+        } else if (nextTwoCharsAreValidEscape()) { |
+            consume(); // SPEC BUG: Emailed Tab. |
+            name.append(consumeEscape()); |
+        } else { |
+            break; |
+        } |
+    } |
+    return name; |
+} |
+ |
+// Consumes an escaped code point; the leading backslash must already have been |
+// consumed by the caller. |
+// http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point |
+// |
+// A hex escape is one to six hex digits optionally followed by a single |
+// whitespace character, which is consumed as part of the escape. Digits are |
+// peeked before they are consumed so that the character following the escape |
+// stays in the stream. Returns the replacement character for a zero value, a |
+// surrogate, a value that does not fit in a UChar, or an escape that runs |
+// into end-of-file; any other escaped character is returned as is. |
+UChar NewCSSTokenizer::consumeEscape() |
+{ |
+    UChar cc = consume(); |
+    ASSERT(cc != '\n'); |
+    if (isASCIIHexDigit(cc)) { |
+        String hexChars; |
+        hexChars.append(cc); |
+        while (hexChars.length() < 6 && isASCIIHexDigit(m_input->currentInputChar())) |
+            hexChars.append(consume()); |
+        if (isCSSSpace(m_input->currentInputChar())) |
+            consume(); |
+        bool ok = false; |
+        const unsigned codePoint = hexChars.toUIntStrict(&ok, 16); |
+        if (!ok || !codePoint) |
+            return WTF::Unicode::replacementCharacter; |
+        if (codePoint >= 0xD800 && codePoint <= 0xDFFF) |
+            return WTF::Unicode::replacementCharacter; |
+        // FIXME: Code points above U+FFFF cannot be returned through a single |
+        // UChar; report them as the replacement character rather than |
+        // silently truncating them to an unrelated character. |
+        if (codePoint > 0xFFFF) |
+            return WTF::Unicode::replacementCharacter; |
+        return static_cast<UChar>(codePoint); |
+    } |
+    if (cc == kEndOfFileMarker) |
+        return WTF::Unicode::replacementCharacter; |
+    return cc; |
+} |
+ |
+// Returns true when the current (not yet consumed) input character is a name |
+// character. Does not consume anything. |
+bool NewCSSTokenizer::nextCharIsNameChar() |
+{ |
+    const UChar upcoming = m_input->currentInputChar(); |
+    return isNameChar(upcoming); |
+} |
+ |
+// Returns true when the next two unconsumed input characters form a valid |
+// escape (a backslash followed by something other than newline or |
+// end-of-file). Does not consume anything. |
+// |
+// The pair examined is the current input character and the one after it, |
+// the same "first"/"second" pair nextCharsAreNumber() reads. Callers |
+// (consumeName(), the '#' branch of nextToken()) follow a true result by |
+// consuming the backslash from the current position. |
+bool NewCSSTokenizer::nextTwoCharsAreValidEscape() |
+{ |
+    return twoCharsAreValidEscape(m_input->currentInputChar(), m_input->peek(1)); |
+} |
+ |
+// Returns true when the upcoming input starts a number: a digit; a '.' followed |
+// by a digit; or a '+'/'-' followed by a digit or by '.' and a digit. |
+// Does not consume anything. |
+// http://www.w3.org/TR/css3-syntax/#starts-with-a-number |
+bool NewCSSTokenizer::nextCharsAreNumber() |
+{ |
+    const UChar lead = m_input->currentInputChar(); |
+    const UChar following = m_input->peek(1); |
+    if (isASCIIDigit(lead)) |
+        return true; |
+    if (lead == '.') |
+        return isASCIIDigit(following); |
+    if (lead != '+' && lead != '-') |
+        return false; |
+    if (isASCIIDigit(following)) |
+        return true; |
+    return following == '.' && isASCIIDigit(m_input->peek(2)); |
+} |
+ |
+// Returns true when the upcoming input would start an identifier: the current |
+// character is a name-start character, nextTwoCharsAreValidEscape() reports an |
+// escape, or the current character is '-' and what follows it is a name-start |
+// character or a valid escape. Does not consume anything. |
+// http://www.w3.org/TR/css3-syntax/#would-start-an-identifier |
+bool NewCSSTokenizer::nextCharsAreIdentifier() |
+{ |
+    const UChar lead = m_input->currentInputChar(); |
+    if (isNameStart(lead)) |
+        return true; |
+    if (nextTwoCharsAreValidEscape()) |
+        return true; |
+    if (lead != '-') |
+        return false; |
+ |
+    const UChar afterDash = m_input->peek(1); |
+    return isNameStart(afterDash) || twoCharsAreValidEscape(afterDash, m_input->peek(2)); |
+} |
+ |
+} // namespace WebCore |