Index: third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp |
diff --git a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp |
index 26de5664a0ae380942d1bbbf6d78b3d8d6c6a11b..9e1842beb9e8d6ba5c7d6c021e4d3de002bd84cd 100644 |
--- a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp |
+++ b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp |
@@ -9,70 +9,55 @@ namespace blink { |
} |
#include "core/css/parser/CSSParserIdioms.h" |
-#include "core/css/parser/CSSParserObserverWrapper.h" |
+#include "core/css/parser/CSSParserObserver.h" |
#include "core/css/parser/CSSParserTokenRange.h" |
#include "core/html/parser/HTMLParserIdioms.h" |
#include "wtf/text/CharacterNames.h" |
namespace blink { |
-CSSTokenizer::CSSTokenizer(const String& string) : m_input(string) { |
- // According to the spec, we should perform preprocessing here. |
- // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing |
- // |
- // However, we can skip this step since: |
- // * We're using HTML spaces (which accept \r and \f as a valid white space) |
- // * Do not count white spaces |
- // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement |
- // characters |
- |
- if (string.isEmpty()) |
- return; |
// We handle input preprocessing substitutions during tokenization:
// * We also accept \r and \f as white space
// * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement
//   characters

// Constructs a tokenizer over |string|, positioned at |startOffset|.
// NOTE(review): advance() presumably moves only the stream's offset and does
// not modify the underlying string -- confirm behavior when |startOffset|
// exceeds the string length.
CSSTokenizer::CSSTokenizer(const String& string, size_t startOffset)
    : m_input(string) {
  m_input.advance(startOffset);
}
+ |
+CSSParserTokenRange CSSTokenizer::tokenRange() { |
+ if (m_finishedTokenizing) |
+ return m_tokens; |
- // To avoid resizing we err on the side of reserving too much space. |
- // Most strings we tokenize have about 3.5 to 5 characters per token. |
- m_tokens.reserveInitialCapacity(string.length() / 3); |
Charlie Harrison
2017/01/09 21:35:07
w000t!
|
+ // Try to avoid resizing the Vector by reserving space. |
+ m_tokens.reserveInitialCapacity(m_input.length() / 3); |
while (true) { |
CSSParserToken token = nextToken(); |
if (token.type() == CommentToken) |
continue; |
if (token.type() == EOFToken) |
- return; |
+ break; |
m_tokens.push_back(token); |
} |
+ m_finishedTokenizing = true; |
+ return m_tokens; |
} |
-CSSTokenizer::CSSTokenizer(const String& string, |
- CSSParserObserverWrapper& wrapper) |
- : m_input(string) { |
- if (string.isEmpty()) |
+void CSSTokenizer::tokenizeSingle() { |
Charlie Harrison
2017/01/09 21:35:07
Maybe tokenizeSingle could return something useful
|
+ if (m_finishedTokenizing) |
return; |
- |
- unsigned offset = 0; |
while (true) { |
CSSParserToken token = nextToken(); |
+ if (token.type() == CommentToken) |
+ continue; |
if (token.type() == EOFToken) |
- break; |
- if (token.type() == CommentToken) { |
- wrapper.addComment(offset, m_input.offset(), m_tokens.size()); |
- } else { |
+ m_finishedTokenizing = true; |
+ else |
m_tokens.push_back(token); |
- wrapper.addToken(offset); |
- } |
- offset = m_input.offset(); |
+ return; |
} |
- |
- wrapper.addToken(offset); |
- wrapper.finalizeConstruction(m_tokens.begin()); |
-} |
- |
-CSSParserTokenRange CSSTokenizer::tokenRange() { |
- return m_tokens; |
-} |
- |
-unsigned CSSTokenizer::tokenCount() { |
- return m_tokens.size(); |
} |
static bool isNewLine(UChar cc) { |
@@ -681,4 +666,196 @@ StringView CSSTokenizer::registerString(const String& string) { |
return string; |
} |
+bool isCSSIdentCharacter(UChar cc) { |
+ return isASCIIAlphanumeric(cc) || cc == '-' || cc == '_' || cc > 128; |
+} |
+ |
+// If the next character is an ASCII insensitive match for cc, skips that. |
+// Otherwise, skips the remaining ident characters. |
+bool CSSTokenizer::skipIdentCharacters(UChar cc) { |
+ UChar nextChar = consume(); |
+ if (isASCIIAlphaCaselessEqual(nextChar, cc)) |
Charlie Harrison
2017/01/09 21:35:07
Hm, this function has a compiler hint that the bra
|
+ return true; |
+ if (nextChar == '\\') { |
+ if (m_input.peekWithoutReplacement(0) == '\n') { |
+ m_input.advance(); |
+ return false; |
+ } |
+ if (isASCIIAlphaCaselessEqual(consumeEscape(), cc)) |
+ return true; |
+ } |
+ skipIdentCharacters(); |
+ return false; |
+} |
+ |
+void CSSTokenizer::skipIdentCharacters() { |
+ while (true) { |
+ UChar nextChar = m_input.peekWithoutReplacement(0); |
+ if (nextChar == '\\') { |
+ if (m_input.peekWithoutReplacement(1) == '\n') { |
+ m_input.advance(2); |
+ return; |
+ } |
+ m_input.advance(); |
+ consumeEscape(); |
+ } |
+ if (!isCSSIdentCharacter(nextChar)) |
+ return; |
+ m_input.advance(); |
+ } |
+} |
+ |
// Error-recovery scanner: consumes raw input until the '{' block currently
// on top of m_blockStack is closed, tracking nested (), [], {} and skipping
// over strings, comments, and url() bodies so close brackets inside them
// are not miscounted. On return, the already-emitted '{' token is popped
// from m_tokens.
void CSSTokenizer::skipToBlockEnd() {
  DCHECK(!m_blockStack.isEmpty());
  DCHECK_EQ(m_blockStack.back(), LeftBraceToken);
  // Depth of open blocks relative to where we started (the '{').
  int nesting = 1;
  do {
    UChar nextChar = m_input.peekWithoutReplacement(0);

    // url's are special as their error recovery doesn't match blocks.
    // url(([{) is a single <bad-url-token>, while moo(([{) needs }])) to
    // close the component value. We need enough tokenization logic to be
    // able to identify urls.
    // Technically we should handle our non-standard unicode range tokens
    // but probably no one will encounter these cases.
    if (isCSSIdentCharacter(nextChar) || nextChar == '\\') {
      // Case-insensitive, escape-aware check whether this ident spells
      // "url"; on any mismatch the helper consumes the rest of the ident
      // and we restart the outer loop.
      if (!skipIdentCharacters('u') || !skipIdentCharacters('r') ||
          !skipIdentCharacters('l'))
        continue;

      if (m_input.peekWithoutReplacement(0) != '(') {
        skipIdentCharacters();
        continue;
      }

      // NOTE(review): the '(' is never advanced past here, so
      // nextInputChar() below always observes '(' and the quote branch
      // appears unreachable -- confirm whether an m_input.advance() is
      // missing before the whitespace skip.
      skipWhitespaceAndComments();
      UChar innerChar = m_input.nextInputChar();
      if (innerChar == '"' || innerChar == '\'') {
        // url( followed by a quote tokenizes as an ordinary function, so
        // it participates in block matching like any other '('.
        nesting++;
        m_blockStack.append(LeftParenthesisToken);
        continue;
      }
      // Otherwise error-recover as a <bad-url-token>: consume through the
      // matching ')' regardless of any brackets in between.
      while (innerChar != ')' && innerChar != '\0') {
        m_input.advance();
        if (innerChar == '\\') {
          // "\<newline>" consumes just the newline; anything else is a
          // real escape sequence.
          if (m_input.peekWithoutReplacement(0) == '\n')
            m_input.advance();
          else
            consumeEscape();
        }
        innerChar = m_input.nextInputChar();
      }
      m_input.advance();
      continue;
    }

    m_input.advance();

    switch (nextChar) {
      case '/':
        // Comment bodies must not contribute brackets.
        if (m_input.peekWithoutReplacement(0) == '*') {
          m_input.advance();
          consumeUntilCommentEndFound();
        }
        break;
      case '"':
      case '\'': {
        // Skip a string literal: ends at the matching quote or, per CSS
        // error recovery, at an unescaped newline.
        while (true) {
          UChar cc = m_input.peekWithoutReplacement(0);
          m_input.advance(1);
          if (cc == nextChar || isNewLine(cc))
            break;
          if (cc == '\\') {
            // This is to handle escapes delimited by newlines
            consumeEscape();
            continue;
          }
          // '\0' from peekWithoutReplacement plus '\0' from
          // nextInputChar() means true end of input, not a raw NUL.
          if (cc == '\0' && m_input.nextInputChar() == '\0')
            return;  // EOF
        }
        break;
      }
      case '<':
        // We need to handle this so <!--url( makes an url token
        if (m_input.peekWithoutReplacement(0) == '!' &&
            m_input.peekWithoutReplacement(1) == '-' &&
            m_input.peekWithoutReplacement(2) == '-') {
          m_input.advance(3);
        }
        break;
      case '#':
      case '@':
        // Hash/at-keyword names must be skipped atomically so an escaped
        // bracket inside them is not counted.
        skipIdentCharacters();
        break;
      case '(':
        nesting++;
        m_blockStack.append(LeftParenthesisToken);
        break;
      case '[':
        nesting++;
        m_blockStack.append(LeftBracketToken);
        break;
      case '{':
        nesting++;
        m_blockStack.append(LeftBraceToken);
        break;
      case ')':
        // Mismatched closers are ignored, matching normal block handling.
        if (nesting && m_blockStack.back() == LeftParenthesisToken) {
          nesting--;
          m_blockStack.pop_back();
        }
        break;
      case ']':
        if (nesting && m_blockStack.back() == LeftBracketToken) {
          nesting--;
          m_blockStack.pop_back();
        }
        break;
      case '}':
        if (nesting && m_blockStack.back() == LeftBraceToken) {
          nesting--;
          m_blockStack.pop_back();
        }
        break;
      case '\0':
        // nextInputChar() replaces raw NULs, so '\0' here means real EOF;
        // otherwise treat the replacement character as an ident char.
        if (m_input.nextInputChar() == '\0')
          return;
        skipIdentCharacters();
        break;
    }
  } while (nesting);
  // Drop the '{' token that was already emitted for this skipped block.
  m_tokens.pop_back();
}
+ |
+void CSSTokenizer::skipComments() { |
+ while (m_input.peekWithoutReplacement(0) == '/' && |
+ m_input.peekWithoutReplacement(1) == '*') { |
+ m_input.advance(2); |
+ consumeUntilCommentEndFound(); |
+ } |
+} |
+ |
+void CSSTokenizer::skipWhitespaceAndComments() { |
+ while (true) { |
+ m_input.advanceUntilNonWhitespace(); |
+ if (m_input.peekWithoutReplacement(0) == '/' && |
+ m_input.peekWithoutReplacement(1) == '*') { |
+ m_input.advance(2); |
+ consumeUntilCommentEndFound(); |
+ } else { |
+ break; |
+ } |
+ } |
+} |
+ |
+void CSSTokenizer::yieldComments(CSSParserObserver& observer) { |
+ while (m_input.peekWithoutReplacement(0) == '/' && |
+ m_input.peekWithoutReplacement(1) == '*') { |
+ size_t startOffset = m_input.offset(); |
+ m_input.advance(2); |
+ consumeUntilCommentEndFound(); |
+ observer.observeComment(startOffset, m_input.offset()); |
+ } |
+} |
+ |
} // namespace blink |