Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(549)

Unified Diff: Source/core/css/parser/NewCSSTokenizer.cpp

Issue 171383002: A thread-safe Media Query Parser (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: Source/core/css/parser/NewCSSTokenizer.cpp
diff --git a/Source/core/css/parser/NewCSSTokenizer.cpp b/Source/core/css/parser/NewCSSTokenizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d053944c3b92d67a87ac6cadbed86cfca4920a64
--- /dev/null
+++ b/Source/core/css/parser/NewCSSTokenizer.cpp
@@ -0,0 +1,437 @@
+/*
+ * Copyright (C) 2013 Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "core/css/parser/NewCSSTokenizer.h"
+
+#include "core/css/parser/CSSInputStream.h"
+#include "core/css/parser/CSSParserIdioms.h"
+#include "platform/text/SegmentedString.h"
+#include "wtf/TemporaryChange.h"
+#include "wtf/unicode/CharacterNames.h"
+
+namespace WebCore {
+
+// http://dev.w3.org/csswg/css-syntax/#name-start-code-point
+// Returns true for an ASCII letter, an underscore, or any non-ASCII
+// character.
+static bool isNameStart(UChar c)
+{
+    return isASCIIAlpha(c) || c == '_' || !isASCII(c);
+}
+
+// http://www.w3.org/TR/css-syntax-3/#name-code-point
+// Returns true for a name-start character, an ASCII digit, or '-'.
+static bool isNameChar(UChar c)
+{
+    if (c == '-' || isASCIIDigit(c))
+        return true;
+    return isNameStart(c);
+}
+
+// http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escape
+// A backslash starts a valid escape unless it is immediately followed by
+// a newline or by the end-of-file marker.
+static bool twoCharsAreValidEscape(UChar first, UChar second)
+{
+    if (first != '\\')
+        return false;
+    return second != '\n' && second != kEndOfFileMarker;
+}
+
+// Constructs a tokenizer that is not attached to any input stream yet.
+// nextToken() attaches the stream it is given on every call; until then
+// m_input is null rather than an indeterminate pointer.
+NewCSSTokenizer::NewCSSTokenizer()
+    : m_input(0)
+{
+}
+
+// Hands |c| back to the current input stream via pushBack(), presumably
+// so that the next consume() returns it again (confirm against
+// CSSInputStream). Requires m_input to have been set by nextToken().
+void NewCSSTokenizer::reconsume(UChar c)
+{
+ m_input->pushBack(c);
+}
+
+// Returns the character at the current input position and advances the
+// stream past it.
+UChar NewCSSTokenizer::consume()
+{
+    const UChar consumed = m_input->currentInputChar();
+    m_input->advance();
+    return consumed;
+}
+
+// Tokenizes |string| from start to end, appending every token to
+// |outTokens|. The terminating EOFToken is appended as well, so
+// |outTokens| always grows by at least one entry.
+void NewCSSTokenizer::tokenize(String string, Vector<CSSToken>& outTokens)
+{
+    CSSInputStream input(string);
+    NewCSSTokenizer tokenizer;
+    do {
+        outTokens.append(tokenizer.nextToken(input));
+    } while (outTokens.last().type() != EOFToken);
+}
+
+// Consumes and returns the next token from |input|.
+//
+// |input| becomes the tokenizer's current stream (m_input) for the
+// duration of the call and for all helper methods. Exactly one character
+// is consumed up front; the branches below either return a token for it
+// directly, consume further characters, or push it back and delegate to
+// one of the consume*Token() helpers. Comments produce no token: after a
+// comment has been skipped the function calls itself to return whatever
+// follows. Returns EOFToken once kEndOfFileMarker is consumed.
+CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input)
+{
+    // Unlike the HTMLTokenizer, the CSS Syntax spec is written
+    // as a stateless, (fixed-size) look-ahead tokenizer.
+    // We could move to the stateful model and instead create
+    // states for all the "next 3 codepoints are X" cases.
+    // State-machine tokenizers are easier to write to handle
+    // incremental tokenization of partial sources.
+    // However, for now we follow the spec exactly.
+    m_input = &input;
+    UChar cc = consume();
+
+    if (isCSSSpace(cc)) {
+        // CSS Tokenization is currently lossy, but we could record
+        // the exact whitespace instead of discarding it here.
+        consumeUntilNotWhitespace();
+        return CSSToken(WhitespaceToken);
+    }
+    if (cc == '\"' || cc == '\'')
+        return consumeStringTokenUntil(cc);
+    if (cc == '#') {
+        if (nextCharIsNameChar() || nextTwoCharsAreValidEscape()) {
+            HashTokenType hashType = UnrestrictedHashToken;
+            if (nextCharsAreIdentifier())
+                hashType = IdHashToken;
+            return CSSToken(HashToken, consumeName(), hashType);
+        }
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == '$') {
+        if (consumeIfNext('='))
+            return CSSToken(SuffixMatchToken);
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == '(')
+        return CSSToken(LeftParenToken);
+    if (cc == ')')
+        return CSSToken(RightParenToken);
+    if (cc == '*') {
+        if (consumeIfNext('='))
+            return CSSToken(SubstringMatchToken);
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == '+' || cc == '.') {
+        if (nextCharsAreNumber()) {
+            reconsume(cc);
+            return consumeNumericToken();
+        }
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == ',')
+        return CSSToken(CommaToken);
+    if (cc == '-') {
+        if (nextCharsAreNumber()) {
+            reconsume(cc);
+            return consumeNumericToken();
+        }
+        if (nextCharsAreIdentifier()) {
+            reconsume(cc);
+            return consumeIdentLikeToken();
+        }
+        // The first '-' of "-->" has already been consumed above.
+        if (consumeIfNext("->"))
+            return CSSToken(CDCToken);
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == '/') {
+        if (consumeIfNext('*')) {
+            consumeThroughCommentEndOrUntilEOF();
+            return nextToken(*m_input);
+        }
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == ':')
+        return CSSToken(ColonToken);
+    if (cc == ';')
+        return CSSToken(SemicolonToken);
+    if (cc == '<') {
+        if (consumeIfNext("!--"))
+            return CSSToken(CDOToken);
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == '@') {
+        if (nextCharsAreIdentifier())
+            return CSSToken(AtKeywordToken, consumeName());
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == '[')
+        return CSSToken(LeftBracketToken);
+    if (cc == '\\') {
+        if (twoCharsAreValidEscape(cc, m_input->currentInputChar())) {
+            reconsume(cc);
+            return consumeIdentLikeToken();
+        }
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == ']')
+        return CSSToken(RightBracketToken);
+    if (cc == '^') {
+        if (consumeIfNext('='))
+            return CSSToken(PrefixMatchToken);
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == '{')
+        return CSSToken(LeftBraceToken);
+    // This used to test '{' a second time, which made the branch
+    // unreachable and turned every '}' into a DelimToken.
+    if (cc == '}')
+        return CSSToken(RightBraceToken);
+    if (isASCIIDigit(cc)) {
+        // "reconsume" here is not according to spec, but required AFAICT.
+        // https://www.w3.org/Bugs/Public/show_bug.cgi?id=24661
+        reconsume(cc);
+        return consumeNumericToken();
+    }
+    // if (cc == 'U' || cc == 'u') {
+    // // U+0055 LATIN CAPITAL LETTER U (U)
+    // // U+0075 LATIN SMALL LETTER U (u)
+    // // If the next 2 input code points are U+002B PLUS SIGN (+) followed by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. Note: don’t consume both of them. Consume a unicode-range token and return it.
+    // // Otherwise, reconsume the current input code point, consume an ident-like token, and return it.
+    // reconsume(cc);
+    // return consumeIdentLikeToken();
+    // }
+    if (isNameStart(cc)) {
+        reconsume(cc);
+        return consumeIdentLikeToken();
+    }
+    if (cc == '|') {
+        if (consumeIfNext('='))
+            return CSSToken(DashMatchToken);
+        if (consumeIfNext('|'))
+            return CSSToken(ColumnToken);
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == '~') {
+        if (consumeIfNext('='))
+            return CSSToken(IncludeMatchToken);
+        return CSSToken(DelimToken, cc);
+    }
+    if (cc == kEndOfFileMarker)
+        return CSSToken(EOFToken);
+    return CSSToken(DelimToken, cc);
+}
+
+// This method merges the following spec sections for efficiency
+// http://www.w3.org/TR/css3-syntax/#consume-a-number
+// http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
+//
+// Scans "[+-] digits* [. digits+] [(e|E) [+-] digits+]" starting at the
+// current input position using look-ahead only, converts the pieces to a
+// double, and then advances the stream past exactly the characters that
+// belong to the number. Returns a NumberToken carrying that value.
+CSSToken NewCSSTokenizer::consumeNumber()
+{
+    ASSERT(nextCharsAreNumber());
+    // FIXME - repr should get the value as a string, even though I'm not sure it's useful
+    String repr;
+    // FIXME - Always returning an Integer type. Need to look at fractions, etc.
+    NumericValueType type = IntegerValueType;
+    int sign = 1;
+    int exponentSign = 1;
+    unsigned peekOffset = 0;
+
+    // Optional leading sign.
+    const UChar signChar = m_input->currentInputChar();
+    if (signChar == '+') {
+        ++peekOffset;
+    } else if (signChar == '-') {
+        sign = -1;
+        ++peekOffset;
+    }
+
+    // Integer digits (possibly none, e.g. ".5").
+    const unsigned intStartPos = peekOffset;
+    peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset);
+    const unsigned intEndPos = peekOffset;
+
+    // A '.' only belongs to the number when a digit follows it. Look at
+    // peekOffset + 1 without committing, so that the '.' in "1.foo" is
+    // left in the stream.
+    unsigned fractionStartPos = 0;
+    unsigned fractionEndPos = 0;
+    if (m_input->peek(peekOffset) == '.' && isASCIIDigit(m_input->peek(peekOffset + 1))) {
+        fractionStartPos = peekOffset + 1;
+        peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(fractionStartPos);
+        fractionEndPos = peekOffset;
+    }
+
+    // Likewise an 'e'/'E' is only an exponent marker when it is followed
+    // by an optional sign and at least one digit; otherwise it starts a
+    // unit such as "em" and must stay in the stream.
+    unsigned exponentStartPos = 0;
+    unsigned exponentEndPos = 0;
+    const UChar exponentMarker = m_input->peek(peekOffset);
+    if (exponentMarker == 'E' || exponentMarker == 'e') {
+        unsigned exponentOffset = peekOffset + 1;
+        int candidateSign = 1;
+        const UChar afterMarker = m_input->peek(exponentOffset);
+        if (afterMarker == '+') {
+            ++exponentOffset;
+        } else if (afterMarker == '-') {
+            candidateSign = -1;
+            ++exponentOffset;
+        }
+        if (isASCIIDigit(m_input->peek(exponentOffset))) {
+            exponentSign = candidateSign;
+            exponentStartPos = exponentOffset;
+            peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(exponentOffset);
+            exponentEndPos = peekOffset;
+        }
+    }
+
+    const unsigned integerPart = m_input->getUnsignedInt(intStartPos, intEndPos);
+    const unsigned fractionPart = m_input->getUnsignedInt(fractionStartPos, fractionEndPos);
+    const unsigned fractionDigits = fractionEndPos - fractionStartPos;
+    const unsigned exponentPart = m_input->getUnsignedInt(exponentStartPos, exponentEndPos);
+
+    // The exponents passed to pow() must be computed as doubles:
+    // "-1 * fractionDigits" and "exponentSign * exponentPart" on unsigned
+    // operands wrap around to huge positive values instead of going
+    // negative, which made pow() return infinity.
+    const double fraction = fractionPart * pow(10.0, -static_cast<double>(fractionDigits));
+    const double scale = pow(10.0, exponentSign * static_cast<double>(exponentPart));
+    const double value = sign * (integerPart + fraction) * scale;
+
+    m_input->advance(peekOffset);
+
+    return CSSToken(NumberToken, repr, value, type);
+}
+
+// http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
+// Consumes a number and then looks at what follows it: an identifier
+// turns the token into a dimension with that identifier as its unit,
+// otherwise a '%' turns it into a percentage. With neither suffix the
+// plain number token is returned.
+CSSToken NewCSSTokenizer::consumeNumericToken()
+{
+    CSSToken numeric = consumeNumber();
+    if (nextCharsAreIdentifier()) {
+        numeric.convertToDimensionWithUnit(consumeName());
+        return numeric;
+    }
+    if (consumeIfNext('%'))
+        numeric.convertToPercentage();
+    return numeric;
+}
+
+// http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token
+// Consumes a name and classifies it by what consumeIfNext('(') reports:
+// no parenthesis gives an IdentToken; with a parenthesis the name "url"
+// (compared case-insensitively) is handed to consumeURLToken() and any
+// other name becomes a FunctionToken.
+CSSToken NewCSSTokenizer::consumeIdentLikeToken()
+{
+    String identName = consumeName();
+    if (!consumeIfNext('('))
+        return CSSToken(IdentToken, identName);
+    if (equalIgnoringCase(identName, "url"))
+        return consumeURLToken();
+    return CSSToken(FunctionToken, identName);
+}
+
+// http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
+// Placeholder for string tokenization. It currently consumes nothing
+// from the input, ignores |endingCodePoint| (the quote character the
+// caller already consumed) and unconditionally returns a BadStringToken,
+// so the characters of the string are tokenized by subsequent
+// nextToken() calls.
+CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
+{
+ // FIXME: Implement.
+ return CSSToken(BadStringToken);
+}
+
+// http://www.w3.org/TR/css3-syntax/#consume-a-url-token
+// Placeholder for url() tokenization. It currently consumes nothing
+// from the input and unconditionally returns a BadURLToken, so the
+// contents of the url() are tokenized by subsequent nextToken() calls.
+CSSToken NewCSSTokenizer::consumeURLToken()
+{
+ // FIXME: Implement.
+ return CSSToken(BadURLToken);
+}
+
+// Consumes characters for as long as the next one is whitespace.
+//
+// Uses isCSSSpace(), the same predicate nextToken() uses to detect the
+// start of a whitespace run. The previous hand-written test (tab, space,
+// newline) could disagree with that predicate, in which case a single
+// run of whitespace would be split into several WhitespaceTokens.
+void NewCSSTokenizer::consumeUntilNotWhitespace()
+{
+    while (isCSSSpace(m_input->currentInputChar()))
+        consume();
+}
+
+// Skips the remainder of a comment whose opening "/*" the caller has
+// already dealt with: consumes characters up to and including the
+// closing "*/". If the input ends first, stops in front of the
+// end-of-file marker without consuming it, so that the following
+// nextToken() call still sees the marker and returns EOFToken.
+void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF()
+{
+    while (m_input->currentInputChar() != kEndOfFileMarker) {
+        UChar cc = consume();
+        if (cc == '*' && m_input->currentInputChar() == '/') {
+            consume();
+            return;
+        }
+    }
+}
+
+// If the next input character equals |character|, consumes it and
+// returns true; otherwise leaves the stream untouched and returns false.
+//
+// The previous version only compared and never consumed, so a matched
+// character (e.g. the '=' of "$=" or the '(' after a function name) was
+// tokenized a second time by the next nextToken() call.
+bool NewCSSTokenizer::consumeIfNext(UChar character)
+{
+    if (m_input->currentInputChar() != character)
+        return false;
+    consume();
+    return true;
+}
+
+// If the upcoming input characters spell out |str|, consumes all of them
+// and returns true; otherwise leaves the stream untouched and returns
+// false.
+//
+// The previous version only compared and never advanced, so a matched
+// sequence (e.g. the "!--" of "<!--") was tokenized again afterwards.
+bool NewCSSTokenizer::consumeIfNext(String str)
+{
+    const unsigned length = str.length();
+    for (unsigned i = 0; i < length; ++i) {
+        if (str[i] != m_input->peek(i))
+            return false;
+    }
+    m_input->advance(length);
+    return true;
+}
+
+// http://www.w3.org/TR/css3-syntax/#consume-a-name
+// Accumulates a name: plain name characters are appended as they are,
+// escapes are appended as the character returned by consumeEscape().
+// Stops at, and does not consume, the first character that is neither.
+String NewCSSTokenizer::consumeName()
+{
+    // FIXME: This is written to match the spec
+    // but could be much more efficient.
+    String name("");
+    for (;;) {
+        if (isNameChar(m_input->currentInputChar())) {
+            name.append(consume());
+        } else if (nextTwoCharsAreValidEscape()) {
+            consume(); // SPEC BUG: Emailed Tab.
+            name.append(consumeEscape());
+        } else {
+            return name;
+        }
+    }
+}
+
+// http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
+// Consumes what follows a backslash (the backslash itself must already
+// have been consumed) and returns the character it denotes:
+//  - one to six hex digits, optionally followed by a single whitespace
+//    character, are parsed as a hexadecimal value;
+//  - the end-of-file marker yields WTF::Unicode::replacementCharacter;
+//  - any other character is returned as is.
+//
+// NOTE(review): the hex value is returned as a UChar, so values that do
+// not fit in a UChar are truncated - confirm whether callers care.
+UChar NewCSSTokenizer::consumeEscape()
+{
+    UChar cc = consume();
+    ASSERT(cc != '\n');
+    if (isASCIIHexDigit(cc)) {
+        unsigned consumedHexDigits = 1;
+        String hexChars;
+        hexChars.append(cc);
+        // Look at the next character before consuming it. The previous
+        // do/while consumed first and tested afterwards, which silently
+        // dropped the character following the hex digits and stopped
+        // after five digits instead of six.
+        while (consumedHexDigits < 6 && isASCIIHexDigit(m_input->currentInputChar())) {
+            hexChars.append(consume());
+            ++consumedHexDigits;
+        }
+        // A single whitespace character terminating the hex digits is
+        // part of the escape.
+        if (isCSSSpace(m_input->currentInputChar()))
+            consume();
+        bool ok = false;
+        UChar codePoint = hexChars.toUIntStrict(&ok, 16);
+        if (!ok)
+            return WTF::Unicode::replacementCharacter;
+        return codePoint;
+    }
+    if (cc == kEndOfFileMarker)
+        return WTF::Unicode::replacementCharacter;
+    return cc;
+}
+
+// Returns true if the next input character may appear inside a name.
+// Nothing is consumed.
+bool NewCSSTokenizer::nextCharIsNameChar()
+{
+    const UChar upcoming = m_input->currentInputChar();
+    return isNameChar(upcoming);
+}
+
+// Returns true if the next two input characters form a valid escape,
+// i.e. the next character is a backslash that is not followed by a
+// newline or the end of the input. Nothing is consumed.
+//
+// "Next" means currentInputChar() followed by peek(1), matching how
+// nextCharsAreNumber() and nextCharsAreIdentifier() number their
+// look-ahead. The previous version inspected peek(1) and peek(2), one
+// character too far, so callers such as consumeName() (which consumes
+// the current character as the backslash right after this check) tested
+// the wrong pair.
+bool NewCSSTokenizer::nextTwoCharsAreValidEscape()
+{
+    return twoCharsAreValidEscape(m_input->currentInputChar(), m_input->peek(1));
+}
+
+// http://www.w3.org/TR/css3-syntax/#starts-with-a-number
+// Returns true if the upcoming input characters start a number: a digit;
+// a sign followed by a digit or by ".digit"; or a '.' followed by a
+// digit. Nothing is consumed.
+bool NewCSSTokenizer::nextCharsAreNumber()
+{
+    const UChar lead = m_input->currentInputChar();
+    const UChar follow = m_input->peek(1);
+    if (isASCIIDigit(lead))
+        return true;
+    switch (lead) {
+    case '+':
+    case '-':
+        if (isASCIIDigit(follow))
+            return true;
+        return follow == '.' && isASCIIDigit(m_input->peek(2));
+    case '.':
+        return isASCIIDigit(follow);
+    default:
+        return false;
+    }
+}
+
+// http://www.w3.org/TR/css3-syntax/#would-start-an-identifier
+// Returns true if the upcoming input characters would start an
+// identifier: a name-start character, a valid escape, or a '-' that is
+// itself followed by a name-start character or a valid escape. Nothing
+// is consumed.
+bool NewCSSTokenizer::nextCharsAreIdentifier()
+{
+    const UChar lead = m_input->currentInputChar();
+    if (isNameStart(lead))
+        return true;
+    if (nextTwoCharsAreValidEscape())
+        return true;
+    if (lead != '-')
+        return false;
+
+    // Leading dash: the decision moves one character further along.
+    if (isNameStart(m_input->peek(1)))
+        return true;
+    return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2));
+}
+
+} // namespace WebCore

Powered by Google App Engine
This is Rietveld 408576698