Chromium Code Reviews| Index: third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp |
| diff --git a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp |
| index 26de5664a0ae380942d1bbbf6d78b3d8d6c6a11b..9e1842beb9e8d6ba5c7d6c021e4d3de002bd84cd 100644 |
| --- a/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp |
| +++ b/third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp |
| @@ -9,70 +9,55 @@ namespace blink { |
| } |
| #include "core/css/parser/CSSParserIdioms.h" |
| -#include "core/css/parser/CSSParserObserverWrapper.h" |
| +#include "core/css/parser/CSSParserObserver.h" |
| #include "core/css/parser/CSSParserTokenRange.h" |
| #include "core/html/parser/HTMLParserIdioms.h" |
| #include "wtf/text/CharacterNames.h" |
| namespace blink { |
| -CSSTokenizer::CSSTokenizer(const String& string) : m_input(string) { |
| - // According to the spec, we should perform preprocessing here. |
| - // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing |
| - // |
| - // However, we can skip this step since: |
| - // * We're using HTML spaces (which accept \r and \f as a valid white space) |
| - // * Do not count white spaces |
| - // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement |
| - // characters |
| - |
| - if (string.isEmpty()) |
| - return; |
| +// We handle input preprocessing substitutions during tokenization: |
| +// * We also accept \r and \f as white space |
| +// * CSSTokenizerInputStream::nextInputChar() replaces NULLs with replacement |
| +// characters |
| + |
| +CSSTokenizer::CSSTokenizer(const String& string, size_t startOffset) |
| + : m_input(string) { |
| + m_input.advance(startOffset); |
|
Charlie Harrison
2017/01/09 21:35:07
It looks like advance() does not change the underlying…
|
| +} |
| + |
| +CSSParserTokenRange CSSTokenizer::tokenRange() { |
| + if (m_finishedTokenizing) |
| + return m_tokens; |
| - // To avoid resizing we err on the side of reserving too much space. |
| - // Most strings we tokenize have about 3.5 to 5 characters per token. |
| - m_tokens.reserveInitialCapacity(string.length() / 3); |
|
Charlie Harrison
2017/01/09 21:35:07
w000t!
|
| + // Try to avoid resizing the Vector by reserving space. |
| + m_tokens.reserveInitialCapacity(m_input.length() / 3); |
| while (true) { |
| CSSParserToken token = nextToken(); |
| if (token.type() == CommentToken) |
| continue; |
| if (token.type() == EOFToken) |
| - return; |
| + break; |
| m_tokens.push_back(token); |
| } |
| + m_finishedTokenizing = true; |
| + return m_tokens; |
| } |
| -CSSTokenizer::CSSTokenizer(const String& string, |
| - CSSParserObserverWrapper& wrapper) |
| - : m_input(string) { |
| - if (string.isEmpty()) |
| +void CSSTokenizer::tokenizeSingle() { |
|
Charlie Harrison
2017/01/09 21:35:07
Maybe tokenizeSingle could return something useful
|
| + if (m_finishedTokenizing) |
| return; |
| - |
| - unsigned offset = 0; |
| while (true) { |
| CSSParserToken token = nextToken(); |
| + if (token.type() == CommentToken) |
| + continue; |
| if (token.type() == EOFToken) |
| - break; |
| - if (token.type() == CommentToken) { |
| - wrapper.addComment(offset, m_input.offset(), m_tokens.size()); |
| - } else { |
| + m_finishedTokenizing = true; |
| + else |
| m_tokens.push_back(token); |
| - wrapper.addToken(offset); |
| - } |
| - offset = m_input.offset(); |
| + return; |
| } |
| - |
| - wrapper.addToken(offset); |
| - wrapper.finalizeConstruction(m_tokens.begin()); |
| -} |
| - |
| -CSSParserTokenRange CSSTokenizer::tokenRange() { |
| - return m_tokens; |
| -} |
| - |
| -unsigned CSSTokenizer::tokenCount() { |
| - return m_tokens.size(); |
| } |
| static bool isNewLine(UChar cc) { |
| @@ -681,4 +666,196 @@ StringView CSSTokenizer::registerString(const String& string) { |
| return string; |
| } |
| +bool isCSSIdentCharacter(UChar cc) { |
| + return isASCIIAlphanumeric(cc) || cc == '-' || cc == '_' || cc > 128; |
| +} |
| + |
| +// If the next character is an ASCII insensitive match for cc, skips that. |
| +// Otherwise, skips the remaining ident characters. |
| +bool CSSTokenizer::skipIdentCharacters(UChar cc) { |
| + UChar nextChar = consume(); |
| + if (isASCIIAlphaCaselessEqual(nextChar, cc)) |
|
Charlie Harrison
2017/01/09 21:35:07
Hm, this function has a compiler hint that the branch…
|
| + return true; |
| + if (nextChar == '\\') { |
| + if (m_input.peekWithoutReplacement(0) == '\n') { |
| + m_input.advance(); |
| + return false; |
| + } |
| + if (isASCIIAlphaCaselessEqual(consumeEscape(), cc)) |
| + return true; |
| + } |
| + skipIdentCharacters(); |
| + return false; |
| +} |
| + |
| +void CSSTokenizer::skipIdentCharacters() { |
| + while (true) { |
| + UChar nextChar = m_input.peekWithoutReplacement(0); |
| + if (nextChar == '\\') { |
| + if (m_input.peekWithoutReplacement(1) == '\n') { |
| + m_input.advance(2); |
| + return; |
| + } |
| + m_input.advance(); |
| + consumeEscape(); |
| + } |
| + if (!isCSSIdentCharacter(nextChar)) |
| + return; |
| + m_input.advance(); |
| + } |
| +} |
| + |
| +void CSSTokenizer::skipToBlockEnd() { |
| + DCHECK(!m_blockStack.isEmpty()); |
| + DCHECK_EQ(m_blockStack.back(), LeftBraceToken); |
| + int nesting = 1; |
| + do { |
| + UChar nextChar = m_input.peekWithoutReplacement(0); |
| + |
| + // URLs are special as their error recovery doesn't match blocks. |
| + // url(([{) is a single <bad-url-token>, while moo(([{) needs }])) to |
| + // close the component value. We need enough tokenization logic to be |
| + // able to identify urls. |
| + // Technically we should handle our non-standard unicode range tokens |
| + // but probably no one will encounter these cases. |
| + if (isCSSIdentCharacter(nextChar) || nextChar == '\\') { |
| + if (!skipIdentCharacters('u') || !skipIdentCharacters('r') || |
| + !skipIdentCharacters('l')) |
| + continue; |
| + |
| + if (m_input.peekWithoutReplacement(0) != '(') { |
| + skipIdentCharacters(); |
| + continue; |
| + } |
| + |
| + skipWhitespaceAndComments(); |
| + UChar innerChar = m_input.nextInputChar(); |
| + if (innerChar == '"' || innerChar == '\'') { |
| + nesting++; |
| + m_blockStack.append(LeftParenthesisToken); |
| + continue; |
| + } |
| + while (innerChar != ')' && innerChar != '\0') { |
| + m_input.advance(); |
| + if (innerChar == '\\') { |
| + if (m_input.peekWithoutReplacement(0) == '\n') |
| + m_input.advance(); |
| + else |
| + consumeEscape(); |
| + } |
| + innerChar = m_input.nextInputChar(); |
| + } |
| + m_input.advance(); |
| + continue; |
| + } |
| + |
| + m_input.advance(); |
| + |
| + switch (nextChar) { |
| + case '/': |
| + if (m_input.peekWithoutReplacement(0) == '*') { |
| + m_input.advance(); |
| + consumeUntilCommentEndFound(); |
| + } |
| + break; |
| + case '"': |
| + case '\'': { |
| + while (true) { |
| + UChar cc = m_input.peekWithoutReplacement(0); |
| + m_input.advance(1); |
| + if (cc == nextChar || isNewLine(cc)) |
| + break; |
| + if (cc == '\\') { |
| + // This is to handle escapes delimited by newlines |
| + consumeEscape(); |
| + continue; |
| + } |
| + if (cc == '\0' && m_input.nextInputChar() == '\0') |
| + return; // EOF |
| + } |
| + break; |
| + } |
| + case '<': |
| + // We need to handle this so <!--url( makes an url token |
| + if (m_input.peekWithoutReplacement(0) == '!' && |
| + m_input.peekWithoutReplacement(1) == '-' && |
| + m_input.peekWithoutReplacement(2) == '-') { |
| + m_input.advance(3); |
| + } |
| + break; |
| + case '#': |
| + case '@': |
| + skipIdentCharacters(); |
| + break; |
| + case '(': |
| + nesting++; |
| + m_blockStack.append(LeftParenthesisToken); |
| + break; |
| + case '[': |
| + nesting++; |
| + m_blockStack.append(LeftBracketToken); |
| + break; |
| + case '{': |
| + nesting++; |
| + m_blockStack.append(LeftBraceToken); |
| + break; |
| + case ')': |
| + if (nesting && m_blockStack.back() == LeftParenthesisToken) { |
| + nesting--; |
| + m_blockStack.pop_back(); |
| + } |
| + break; |
| + case ']': |
| + if (nesting && m_blockStack.back() == LeftBracketToken) { |
| + nesting--; |
| + m_blockStack.pop_back(); |
| + } |
| + break; |
| + case '}': |
| + if (nesting && m_blockStack.back() == LeftBraceToken) { |
| + nesting--; |
| + m_blockStack.pop_back(); |
| + } |
| + break; |
| + case '\0': |
| + if (m_input.nextInputChar() == '\0') |
| + return; |
| + skipIdentCharacters(); |
| + break; |
| + } |
| + } while (nesting); |
| + m_tokens.pop_back(); |
| +} |
| + |
| +void CSSTokenizer::skipComments() { |
| + while (m_input.peekWithoutReplacement(0) == '/' && |
| + m_input.peekWithoutReplacement(1) == '*') { |
| + m_input.advance(2); |
| + consumeUntilCommentEndFound(); |
| + } |
| +} |
| + |
| +void CSSTokenizer::skipWhitespaceAndComments() { |
| + while (true) { |
| + m_input.advanceUntilNonWhitespace(); |
| + if (m_input.peekWithoutReplacement(0) == '/' && |
| + m_input.peekWithoutReplacement(1) == '*') { |
| + m_input.advance(2); |
| + consumeUntilCommentEndFound(); |
| + } else { |
| + break; |
| + } |
| + } |
| +} |
| + |
| +void CSSTokenizer::yieldComments(CSSParserObserver& observer) { |
| + while (m_input.peekWithoutReplacement(0) == '/' && |
| + m_input.peekWithoutReplacement(1) == '*') { |
| + size_t startOffset = m_input.offset(); |
| + m_input.advance(2); |
| + consumeUntilCommentEndFound(); |
| + observer.observeComment(startOffset, m_input.offset()); |
| + } |
| +} |
| + |
| } // namespace blink |