Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(178)

Unified Diff: third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp

Issue 2503683003: [WIP] Streaming CSS parser (Closed)
Patch Set: rebase Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp
diff --git a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp
index 26de5664a0ae380942d1bbbf6d78b3d8d6c6a11b..9e1842beb9e8d6ba5c7d6c021e4d3de002bd84cd 100644
--- a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp
+++ b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp
@@ -9,70 +9,55 @@ namespace blink {
}
#include "core/css/parser/CSSParserIdioms.h"
-#include "core/css/parser/CSSParserObserverWrapper.h"
+#include "core/css/parser/CSSParserObserver.h"
#include "core/css/parser/CSSParserTokenRange.h"
#include "core/html/parser/HTMLParserIdioms.h"
#include "wtf/text/CharacterNames.h"
namespace blink {
-CSSTokenizer::CSSTokenizer(const String& string) : m_input(string) {
- // According to the spec, we should perform preprocessing here.
- // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing
- //
- // However, we can skip this step since:
- // * We're using HTML spaces (which accept \r and \f as a valid white space)
- // * Do not count white spaces
- // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement
- // characters
-
- if (string.isEmpty())
- return;
+// We handle input preprocessing substitutions during tokenization:
+// * We also accept \r and \f as white space
+// * CSSTokenizerInputStream::nextInputChar() replaces NULLs with replacement
+// characters
+
+CSSTokenizer::CSSTokenizer(const String& string, size_t startOffset)
+ : m_input(string) {
+ m_input.advance(startOffset);
Charlie Harrison 2017/01/09 21:35:07 It looks like advance() does not change the underl
+}
+
+CSSParserTokenRange CSSTokenizer::tokenRange() {
+ if (m_finishedTokenizing)
+ return m_tokens;
- // To avoid resizing we err on the side of reserving too much space.
- // Most strings we tokenize have about 3.5 to 5 characters per token.
- m_tokens.reserveInitialCapacity(string.length() / 3);
Charlie Harrison 2017/01/09 21:35:07 w000t!
+ // Try to avoid resizing the Vector by reserving space.
+ m_tokens.reserveInitialCapacity(m_input.length() / 3);
while (true) {
CSSParserToken token = nextToken();
if (token.type() == CommentToken)
continue;
if (token.type() == EOFToken)
- return;
+ break;
m_tokens.push_back(token);
}
+ m_finishedTokenizing = true;
+ return m_tokens;
}
-CSSTokenizer::CSSTokenizer(const String& string,
- CSSParserObserverWrapper& wrapper)
- : m_input(string) {
- if (string.isEmpty())
+void CSSTokenizer::tokenizeSingle() {
Charlie Harrison 2017/01/09 21:35:07 Maybe tokenizeSingle could return something useful
+ if (m_finishedTokenizing)
return;
-
- unsigned offset = 0;
while (true) {
CSSParserToken token = nextToken();
+ if (token.type() == CommentToken)
+ continue;
if (token.type() == EOFToken)
- break;
- if (token.type() == CommentToken) {
- wrapper.addComment(offset, m_input.offset(), m_tokens.size());
- } else {
+ m_finishedTokenizing = true;
+ else
m_tokens.push_back(token);
- wrapper.addToken(offset);
- }
- offset = m_input.offset();
+ return;
}
-
- wrapper.addToken(offset);
- wrapper.finalizeConstruction(m_tokens.begin());
-}
-
-CSSParserTokenRange CSSTokenizer::tokenRange() {
- return m_tokens;
-}
-
-unsigned CSSTokenizer::tokenCount() {
- return m_tokens.size();
}
static bool isNewLine(UChar cc) {
@@ -681,4 +666,196 @@ StringView CSSTokenizer::registerString(const String& string) {
return string;
}
+// Returns true if |cc| may appear inside a CSS identifier: ASCII letters and
+// digits, '-', '_', and every non-ASCII code unit. Non-ASCII starts at U+0080
+// (128), so the comparison must be inclusive.
+bool isCSSIdentCharacter(UChar cc) {
+  return isASCIIAlphanumeric(cc) || cc == '-' || cc == '_' || cc >= 128;
+}
+
+// If the next character is an ASCII case-insensitive match for cc (written
+// literally or as a CSS escape), skips it and returns true.
+// Otherwise, skips the remaining ident characters and returns false. A
+// character that cannot be part of an identifier is left in the input so that
+// the caller still gets to process it (for example as a block delimiter).
+bool CSSTokenizer::skipIdentCharacters(UChar cc) {
+  // Peek before consuming: swallowing a '{', '}', '(' or quote that directly
+  // follows a short identifier such as "u" would corrupt the caller's block
+  // nesting bookkeeping.
+  UChar upcoming = m_input.peekWithoutReplacement(0);
+  if (!isCSSIdentCharacter(upcoming) && upcoming != '\\')
+    return false;
+
+  // NOTE(review): consume() is assumed to return the next input character and
+  // advance past it -- confirm against its definition.
+  UChar nextChar = consume();
+  if (isASCIIAlphaCaselessEqual(nextChar, cc))
Charlie Harrison 2017/01/09 21:35:07 Hm, this function has a compiler hint that the bra
+    return true;
+  if (nextChar == '\\') {
+    // A backslash followed by a newline is not treated as an escape here; the
+    // newline is skipped and the match fails.
+    if (m_input.peekWithoutReplacement(0) == '\n') {
+      m_input.advance();
+      return false;
+    }
+    if (isASCIIAlphaCaselessEqual(consumeEscape(), cc))
+      return true;
+  }
+  // No match: discard the rest of this identifier.
+  skipIdentCharacters();
+  return false;
+}
+
+// Advances the input past the rest of the current identifier, including any
+// escapes it contains. Stops at the first character that is neither an ident
+// character nor the start of an escape. A backslash immediately followed by a
+// newline ends the identifier; both characters are skipped.
+void CSSTokenizer::skipIdentCharacters() {
+  while (true) {
+    UChar nextChar = m_input.peekWithoutReplacement(0);
+    if (nextChar == '\\') {
+      if (m_input.peekWithoutReplacement(1) == '\n') {
+        m_input.advance(2);
+        return;
+      }
+      m_input.advance();
+      consumeEscape();
+      // The escape is part of the identifier, so keep scanning. Falling
+      // through would test the stale '\\' against isCSSIdentCharacter() and
+      // stop after the first escape.
+      continue;
+    }
+    if (!isCSSIdentCharacter(nextChar))
+      return;
+    m_input.advance();
+  }
+}
+
+// Fast-forwards the input to the end of the '{' block on top of m_blockStack
+// without producing tokens. Nested (), [] and {} blocks are tracked through
+// |nesting| and m_blockStack; comments, strings and url(...) are skipped so
+// that brackets inside them are not counted. On normal completion the last
+// entry of m_tokens is removed (presumably the token that opened the block --
+// confirm against callers). If the end of input is reached inside a string or
+// at a NUL, the function returns early without touching m_tokens.
+void CSSTokenizer::skipToBlockEnd() {
+  DCHECK(!m_blockStack.isEmpty());
+  DCHECK_EQ(m_blockStack.back(), LeftBraceToken);
+  int nesting = 1;
+  do {
+    UChar nextChar = m_input.peekWithoutReplacement(0);
+
+    // urls are special as their error recovery doesn't match blocks.
+    // url(([{) is a single <bad-url-token>, while moo(([{) needs }])) to
+    // close the component value. We need enough tokenization logic to be
+    // able to identify urls.
+    // Technically we should handle our non-standard unicode range tokens
+    // but probably no one will encounter these cases.
+    if (isCSSIdentCharacter(nextChar) || nextChar == '\\') {
+      if (!skipIdentCharacters('u') || !skipIdentCharacters('r') ||
+          !skipIdentCharacters('l'))
+        continue;
+
+      if (m_input.peekWithoutReplacement(0) != '(') {
+        skipIdentCharacters();
+        continue;
+      }
+
+      // Consume the '(' itself. Without this the character inspected below
+      // is always '(' and the quoted-url branch can never be taken.
+      m_input.advance();
+      // NOTE(review): CSS Syntax only skips whitespace (not comments) after
+      // "url(" -- confirm whether skipping comments here is intended.
+      skipWhitespaceAndComments();
+      UChar innerChar = m_input.nextInputChar();
+      if (innerChar == '"' || innerChar == '\'') {
+        // url("...") behaves like a regular function: track the open paren
+        // and let the next loop iteration skip the string.
+        nesting++;
+        m_blockStack.append(LeftParenthesisToken);
+        continue;
+      }
+      // Unquoted url: everything up to the closing ')' (or end of input)
+      // belongs to the url, with escapes honoured.
+      while (innerChar != ')' && innerChar != '\0') {
+        m_input.advance();
+        if (innerChar == '\\') {
+          if (m_input.peekWithoutReplacement(0) == '\n')
+            m_input.advance();
+          else
+            consumeEscape();
+        }
+        innerChar = m_input.nextInputChar();
+      }
+      // Step past the closing ')'.
+      m_input.advance();
+      continue;
+    }
+
+    m_input.advance();
+
+    switch (nextChar) {
+      case '/':
+        if (m_input.peekWithoutReplacement(0) == '*') {
+          m_input.advance();
+          consumeUntilCommentEndFound();
+        }
+        break;
+      case '"':
+      case '\'': {
+        // Skip a string: it ends at the matching quote or at a newline.
+        while (true) {
+          UChar cc = m_input.peekWithoutReplacement(0);
+          m_input.advance(1);
+          if (cc == nextChar || isNewLine(cc))
+            break;
+          if (cc == '\\') {
+            // This is to handle escapes delimited by newlines
+            consumeEscape();
+            continue;
+          }
+          if (cc == '\0' && m_input.nextInputChar() == '\0')
+            return;  // EOF
+        }
+        break;
+      }
+      case '<':
+        // We need to handle this so <!--url( makes an url token
+        if (m_input.peekWithoutReplacement(0) == '!' &&
+            m_input.peekWithoutReplacement(1) == '-' &&
+            m_input.peekWithoutReplacement(2) == '-') {
+          m_input.advance(3);
+        }
+        break;
+      case '#':
+      case '@':
+        // The name after '#' or '@' must not be mistaken for a url.
+        skipIdentCharacters();
+        break;
+      case '(':
+        nesting++;
+        m_blockStack.append(LeftParenthesisToken);
+        break;
+      case '[':
+        nesting++;
+        m_blockStack.append(LeftBracketToken);
+        break;
+      case '{':
+        nesting++;
+        m_blockStack.append(LeftBraceToken);
+        break;
+      // A closing bracket only closes a block when it matches the innermost
+      // open one; mismatched closers are ignored.
+      case ')':
+        if (nesting && m_blockStack.back() == LeftParenthesisToken) {
+          nesting--;
+          m_blockStack.pop_back();
+        }
+        break;
+      case ']':
+        if (nesting && m_blockStack.back() == LeftBracketToken) {
+          nesting--;
+          m_blockStack.pop_back();
+        }
+        break;
+      case '}':
+        if (nesting && m_blockStack.back() == LeftBraceToken) {
+          nesting--;
+          m_blockStack.pop_back();
+        }
+        break;
+      case '\0':
+        // A raw NUL followed by the end-of-input marker means we ran out of
+        // input; otherwise treat what follows as identifier characters.
+        if (m_input.nextInputChar() == '\0')
+          return;
+        skipIdentCharacters();
+        break;
+    }
+  } while (nesting);
+  m_tokens.pop_back();
+}
+
+// Skips every comment that starts at the current input position, one after
+// another, and stops at the first position that does not begin with "/*".
+void CSSTokenizer::skipComments() {
+  for (;;) {
+    const bool atCommentStart = m_input.peekWithoutReplacement(0) == '/' &&
+                                m_input.peekWithoutReplacement(1) == '*';
+    if (!atCommentStart)
+      return;
+    // Step over the "/*" opener, then run to the end of the comment.
+    m_input.advance(2);
+    consumeUntilCommentEndFound();
+  }
+}
+
+// Skips any mix of whitespace and comments at the current input position.
+// Returns once the input sits on something that is neither.
+void CSSTokenizer::skipWhitespaceAndComments() {
+  do {
+    m_input.advanceUntilNonWhitespace();
+    // Anything other than a "/*" opener ends the run.
+    if (m_input.peekWithoutReplacement(0) != '/' ||
+        m_input.peekWithoutReplacement(1) != '*')
+      return;
+    m_input.advance(2);
+    consumeUntilCommentEndFound();
+  } while (true);
+}
+
+// Consumes every comment that starts at the current input position and
+// reports each one to |observer| as a [start offset, end offset) pair taken
+// from m_input.offset() before and after the comment.
+void CSSTokenizer::yieldComments(CSSParserObserver& observer) {
+  for (;;) {
+    if (m_input.peekWithoutReplacement(0) != '/')
+      return;
+    if (m_input.peekWithoutReplacement(1) != '*')
+      return;
+    const size_t commentStart = m_input.offset();
+    m_input.advance(2);
+    consumeUntilCommentEndFound();
+    observer.observeComment(commentStart, m_input.offset());
+  }
+}
+
} // namespace blink

Powered by Google App Engine
This is Rietveld 408576698