third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp - Issue 2503683003: [WIP] Streaming CSS parser

Unified Diff: third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp

Issue 2503683003: [WIP] Streaming CSS parser (Closed)

Patch Set: rebase Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« third_party/WebKit/Source/core/css/parser/CSSSelectorParser.cpp ('K') | « third_party/WebKit/Source/core/css/parser/CSSTokenizer.h ('k') | third_party/WebKit/Source/core/inspector/InspectorStyleSheet.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp

diff --git a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp

index 26de5664a0ae380942d1bbbf6d78b3d8d6c6a11b..9e1842beb9e8d6ba5c7d6c021e4d3de002bd84cd 100644

--- a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp

+++ b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp

@@ -9,70 +9,55 @@ namespace blink {

}

#include "core/css/parser/CSSParserIdioms.h"

-#include "core/css/parser/CSSParserObserverWrapper.h"

+#include "core/css/parser/CSSParserObserver.h"

#include "core/css/parser/CSSParserTokenRange.h"

#include "core/html/parser/HTMLParserIdioms.h"

#include "wtf/text/CharacterNames.h"

namespace blink {

-CSSTokenizer::CSSTokenizer(const String& string) : m_input(string) {

- // According to the spec, we should perform preprocessing here.

- // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing

- //

- // However, we can skip this step since:

- // * We're using HTML spaces (which accept \r and \f as a valid white space)

- // * Do not count white spaces

- // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement

- // characters

- if (string.isEmpty())

- return;

+// We handle input preprocessing substitutions during tokenization:

+// * We also accept \r and \f as white space

+// * CSSTokenizerInputStream::nextInputChar() replaces NULLs with replacement

+// characters

+CSSTokenizer::CSSTokenizer(const String& string, size_t startOffset)

+ : m_input(string) {

+ m_input.advance(startOffset);

Charlie Harrison 2017/01/09 21:35:07 It looks like advance() does not change the underl

+CSSParserTokenRange CSSTokenizer::tokenRange() {

+ if (m_finishedTokenizing)

+ return m_tokens;

- // To avoid resizing we err on the side of reserving too much space.

- // Most strings we tokenize have about 3.5 to 5 characters per token.

- m_tokens.reserveInitialCapacity(string.length() / 3);

Charlie Harrison 2017/01/09 21:35:07 w000t!

+ // Try to avoid resizing the Vector by reserving space.

+ m_tokens.reserveInitialCapacity(m_input.length() / 3);

while (true) {

CSSParserToken token = nextToken();

if (token.type() == CommentToken)

continue;

if (token.type() == EOFToken)

- return;

+ break;

m_tokens.push_back(token);

}

+ m_finishedTokenizing = true;

+ return m_tokens;

}

-CSSTokenizer::CSSTokenizer(const String& string,

- CSSParserObserverWrapper& wrapper)

- : m_input(string) {

- if (string.isEmpty())

+void CSSTokenizer::tokenizeSingle() {

Charlie Harrison 2017/01/09 21:35:07 Maybe tokenizeSingle could return something useful

+ if (m_finishedTokenizing)

return;

- unsigned offset = 0;

while (true) {

CSSParserToken token = nextToken();

+ if (token.type() == CommentToken)

+ continue;

if (token.type() == EOFToken)

- break;

- if (token.type() == CommentToken) {

- wrapper.addComment(offset, m_input.offset(), m_tokens.size());

- } else {

+ m_finishedTokenizing = true;

+ else

m_tokens.push_back(token);

- wrapper.addToken(offset);

- }

- offset = m_input.offset();

+ return;

}

- wrapper.addToken(offset);

- wrapper.finalizeConstruction(m_tokens.begin());

-CSSParserTokenRange CSSTokenizer::tokenRange() {

- return m_tokens;

-unsigned CSSTokenizer::tokenCount() {

- return m_tokens.size();

}

static bool isNewLine(UChar cc) {

@@ -681,4 +666,196 @@ StringView CSSTokenizer::registerString(const String& string) {

return string;

}

+// Returns true if |cc| may appear unescaped inside a CSS identifier: ASCII
+// letters and digits, '-', '_', and every non-ASCII code unit. Non-ASCII
+// starts at U+0080, so the comparison is inclusive of 128.
+bool isCSSIdentCharacter(UChar cc) {
+  return isASCIIAlphanumeric(cc) || cc == '-' || cc == '_' || cc >= 128;
+}

+// Examines one character of the identifier at the current input position.
+// If it is an ASCII case-insensitive match for |cc| (literally or through an
+// escape sequence) it is consumed and true is returned. Otherwise the
+// remaining ident characters are skipped and false is returned.
+//
+// A character that cannot belong to an identifier is left unconsumed, so a
+// delimiter that directly follows a short identifier (e.g. the '}' in "u}"
+// or the ')' in "attr(u)") is still visible to the caller.
+bool CSSTokenizer::skipIdentCharacters(UChar cc) {
+  UChar nextChar = m_input.peekWithoutReplacement(0);
+  if (!isCSSIdentCharacter(nextChar) && nextChar != '\\')
+    return false;
+  m_input.advance();
+  if (isASCIIAlphaCaselessEqual(nextChar, cc))
Charlie Harrison 2017/01/09 21:35:07 Hm, this function has a compiler hint that the bra
+    return true;
+  if (nextChar == '\\') {
+    // A backslash followed by a newline is not an escape sequence; consume
+    // the newline as well and report a mismatch.
+    if (m_input.peekWithoutReplacement(0) == '\n') {
+      m_input.advance();
+      return false;
+    }
+    if (isASCIIAlphaCaselessEqual(consumeEscape(), cc))
+      return true;
+  }
+  skipIdentCharacters();
+  return false;
+}

+// Skips the rest of the identifier at the current input position, including
+// any escape sequences it contains. Stops at the first character that is
+// neither an ident character nor the start of an escape.
+void CSSTokenizer::skipIdentCharacters() {
+  while (true) {
+    UChar nextChar = m_input.peekWithoutReplacement(0);
+    if (nextChar == '\\') {
+      // A backslash followed by a newline is not an escape sequence, so the
+      // identifier ends here; both characters are consumed.
+      if (m_input.peekWithoutReplacement(1) == '\n') {
+        m_input.advance(2);
+        return;
+      }
+      m_input.advance();
+      consumeEscape();
+      // The escape belongs to the identifier: re-read the next character
+      // instead of testing the stale backslash, which would end the scan.
+      continue;
+    }
+    if (!isCSSIdentCharacter(nextChar))
+      return;
+    m_input.advance();
+  }
+}

+// Advances the input to the end of the innermost open '{' block without
+// producing tokens for its contents. Blocks opened along the way with '(',
+// '[' or '{' are pushed onto m_blockStack and popped again when the matching
+// closer is seen; a closer that does not match the top of the stack is
+// ignored. Comments, strings and url() are recognised so that brackets
+// inside them are not counted.
+//
+// Requires that m_blockStack is non-empty with a LeftBraceToken on top.
+// Returns early, without reaching the final m_tokens.pop_back(), if the input
+// ends before the block is closed.
+void CSSTokenizer::skipToBlockEnd() {
+  DCHECK(!m_blockStack.isEmpty());
+  DCHECK_EQ(m_blockStack.back(), LeftBraceToken);
+  // Number of blocks still open, counting the '{' block being skipped.
+  int nesting = 1;
+  do {
+    UChar nextChar = m_input.peekWithoutReplacement(0);
+    // URLs are special as their error recovery doesn't match blocks.
+    // url(([{) is a single <bad-url-token>, while moo(([{) needs }])) to
+    // close the component value. We need enough tokenization logic to be
+    // able to identify urls.
+    // Technically we should handle our non-standard unicode range tokens
+    // but probably no one will encounter these cases.
+    if (isCSSIdentCharacter(nextChar) || nextChar == '\\') {
+      if (!skipIdentCharacters('u') || !skipIdentCharacters('r') ||
+          !skipIdentCharacters('l'))
+        continue;
+      if (m_input.peekWithoutReplacement(0) != '(') {
+        // Longer identifier that merely starts with "url"; skip the rest.
+        skipIdentCharacters();
+        continue;
+      }
+      // Step over the '(' so that the checks below look at the first
+      // character of the url's contents rather than at the parenthesis
+      // itself; otherwise the quoted-url branch can never be taken.
+      m_input.advance();
+      skipWhitespaceAndComments();
+      UChar innerChar = m_input.nextInputChar();
+      if (innerChar == '"' || innerChar == '\'') {
+        // url( followed by a string behaves like an ordinary function: track
+        // the open parenthesis and let the main loop consume the string.
+        nesting++;
+        m_blockStack.append(LeftParenthesisToken);
+        continue;
+      }
+      // Unquoted url: everything up to the next unescaped ')' (or the end of
+      // the input) belongs to the url, brackets included.
+      while (innerChar != ')' && innerChar != '\0') {
+        m_input.advance();
+        if (innerChar == '\\') {
+          if (m_input.peekWithoutReplacement(0) == '\n')
+            m_input.advance();
+          else
+            consumeEscape();
+        }
+        innerChar = m_input.nextInputChar();
+      }
+      m_input.advance();
+      continue;
+    }
+    m_input.advance();
+    switch (nextChar) {
+      case '/':
+        if (m_input.peekWithoutReplacement(0) == '*') {
+          m_input.advance();
+          consumeUntilCommentEndFound();
+        }
+        break;
+      case '"':
+      case '\'': {
+        // String: scan to the matching quote or to a newline.
+        while (true) {
+          UChar cc = m_input.peekWithoutReplacement(0);
+          m_input.advance(1);
+          if (cc == nextChar || isNewLine(cc))
+            break;
+          if (cc == '\\') {
+            // This is to handle escapes delimited by newlines
+            consumeEscape();
+            continue;
+          }
+          if (cc == '\0' && m_input.nextInputChar() == '\0')
+            return;  // EOF
+        }
+        break;
+      }
+      case '<':
+        // We need to handle this so <!--url( makes a url token
+        if (m_input.peekWithoutReplacement(0) == '!' &&
+            m_input.peekWithoutReplacement(1) == '-' &&
+            m_input.peekWithoutReplacement(2) == '-') {
+          m_input.advance(3);
+        }
+        break;
+      case '#':
+      case '@':
+        // The name after '#' or '@' must not be mistaken for a url.
+        skipIdentCharacters();
+        break;
+      case '(':
+        nesting++;
+        m_blockStack.append(LeftParenthesisToken);
+        break;
+      case '[':
+        nesting++;
+        m_blockStack.append(LeftBracketToken);
+        break;
+      case '{':
+        nesting++;
+        m_blockStack.append(LeftBraceToken);
+        break;
+      case ')':
+        if (nesting && m_blockStack.back() == LeftParenthesisToken) {
+          nesting--;
+          m_blockStack.pop_back();
+        }
+        break;
+      case ']':
+        if (nesting && m_blockStack.back() == LeftBracketToken) {
+          nesting--;
+          m_blockStack.pop_back();
+        }
+        break;
+      case '}':
+        if (nesting && m_blockStack.back() == LeftBraceToken) {
+          nesting--;
+          m_blockStack.pop_back();
+        }
+        break;
+      case '\0':
+        if (m_input.nextInputChar() == '\0')
+          return;
+        skipIdentCharacters();
+        break;
+    }
+  } while (nesting);
+  // NOTE(review): presumably drops the token that opened the skipped block;
+  // confirm against the caller.
+  m_tokens.pop_back();
+}

+// Consumes every comment that begins exactly at the current input position.
+// Stops as soon as the next two characters are not "/*".
+void CSSTokenizer::skipComments() {
+  for (;;) {
+    if (m_input.peekWithoutReplacement(0) != '/' ||
+        m_input.peekWithoutReplacement(1) != '*')
+      return;
+    m_input.advance(2);  // Step over the "/*" opener.
+    consumeUntilCommentEndFound();
+  }
+}

+// Consumes any run of whitespace and comments at the current input position,
+// in whatever order they appear.
+void CSSTokenizer::skipWhitespaceAndComments() {
+  for (;;) {
+    m_input.advanceUntilNonWhitespace();
+    const bool atCommentStart = m_input.peekWithoutReplacement(0) == '/' &&
+                                m_input.peekWithoutReplacement(1) == '*';
+    if (!atCommentStart)
+      return;
+    m_input.advance(2);  // Step over the "/*" opener.
+    consumeUntilCommentEndFound();
+  }
+}

+// Consumes every comment that begins at the current input position and
+// reports each one to |observer|. The reported offsets are the input offset
+// before the "/*" opener and the input offset once
+// consumeUntilCommentEndFound() has returned.
+void CSSTokenizer::yieldComments(CSSParserObserver& observer) {
+  for (;;) {
+    if (m_input.peekWithoutReplacement(0) != '/' ||
+        m_input.peekWithoutReplacement(1) != '*')
+      return;
+    const size_t commentStart = m_input.offset();
+    m_input.advance(2);  // Step over the "/*" opener.
+    consumeUntilCommentEndFound();
+    observer.observeComment(commentStart, m_input.offset());
+  }
+}

} // namespace blink