Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(44)

Side by Side Diff: third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp

Issue 2503683003: [WIP] Streaming CSS parser (Closed)
Patch Set: rebase Created 3 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "core/css/parser/CSSTokenizer.h" 5 #include "core/css/parser/CSSTokenizer.h"
6 6
7 namespace blink { 7 namespace blink {
8 #include "core/CSSTokenizerCodepoints.cpp" 8 #include "core/CSSTokenizerCodepoints.cpp"
9 } 9 }
10 10
11 #include "core/css/parser/CSSParserIdioms.h" 11 #include "core/css/parser/CSSParserIdioms.h"
12 #include "core/css/parser/CSSParserObserverWrapper.h" 12 #include "core/css/parser/CSSParserObserver.h"
13 #include "core/css/parser/CSSParserTokenRange.h" 13 #include "core/css/parser/CSSParserTokenRange.h"
14 #include "core/html/parser/HTMLParserIdioms.h" 14 #include "core/html/parser/HTMLParserIdioms.h"
15 #include "wtf/text/CharacterNames.h" 15 #include "wtf/text/CharacterNames.h"
16 16
17 namespace blink { 17 namespace blink {
18 18
19 CSSTokenizer::CSSTokenizer(const String& string) : m_input(string) { 19 // We handle input preprocessing substitutions during tokenization:
20 // According to the spec, we should perform preprocessing here. 20 // * We also accept \r and \f as white space
21 // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing 21 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs with replacement
22 // 22 // characters
23 // However, we can skip this step since:
24 // * We're using HTML spaces (which accept \r and \f as a valid white space)
25 // * Do not count white spaces
26 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement
27 // characters
28 23
29 if (string.isEmpty()) 24 CSSTokenizer::CSSTokenizer(const String& string, size_t startOffset)
30 return; 25 : m_input(string) {
26 m_input.advance(startOffset);
Charlie Harrison 2017/01/09 21:35:07 It looks like advance() does not change the underl
27 }
31 28
32 // To avoid resizing we err on the side of reserving too much space. 29 CSSParserTokenRange CSSTokenizer::tokenRange() {
33 // Most strings we tokenize have about 3.5 to 5 characters per token. 30 if (m_finishedTokenizing)
34 m_tokens.reserveInitialCapacity(string.length() / 3); 31 return m_tokens;
Charlie Harrison 2017/01/09 21:35:07 w000t!
32
33 // Try to avoid resizing the Vector by reserving space.
34 m_tokens.reserveInitialCapacity(m_input.length() / 3);
35 35
36 while (true) { 36 while (true) {
37 CSSParserToken token = nextToken(); 37 CSSParserToken token = nextToken();
38 if (token.type() == CommentToken) 38 if (token.type() == CommentToken)
39 continue; 39 continue;
40 if (token.type() == EOFToken) 40 if (token.type() == EOFToken)
41 return; 41 break;
42 m_tokens.push_back(token); 42 m_tokens.push_back(token);
43 } 43 }
44 m_finishedTokenizing = true;
45 return m_tokens;
46 }
47
48 void CSSTokenizer::tokenizeSingle() {
Charlie Harrison 2017/01/09 21:35:07 Maybe tokenizeSingle could return something useful
49 if (m_finishedTokenizing)
50 return;
51 while (true) {
52 CSSParserToken token = nextToken();
53 if (token.type() == CommentToken)
54 continue;
55 if (token.type() == EOFToken)
56 m_finishedTokenizing = true;
57 else
58 m_tokens.push_back(token);
59 return;
60 }
44 } 61 }
45 62
46 CSSTokenizer::CSSTokenizer(const String& string,
47 CSSParserObserverWrapper& wrapper)
48 : m_input(string) {
49 if (string.isEmpty())
50 return;
51
52 unsigned offset = 0;
53 while (true) {
54 CSSParserToken token = nextToken();
55 if (token.type() == EOFToken)
56 break;
57 if (token.type() == CommentToken) {
58 wrapper.addComment(offset, m_input.offset(), m_tokens.size());
59 } else {
60 m_tokens.push_back(token);
61 wrapper.addToken(offset);
62 }
63 offset = m_input.offset();
64 }
65
66 wrapper.addToken(offset);
67 wrapper.finalizeConstruction(m_tokens.begin());
68 }
69
70 CSSParserTokenRange CSSTokenizer::tokenRange() {
71 return m_tokens;
72 }
73
74 unsigned CSSTokenizer::tokenCount() {
75 return m_tokens.size();
76 }
77
78 static bool isNewLine(UChar cc) { 63 static bool isNewLine(UChar cc) {
79 // We check \r and \f here, since we have no preprocessing stage 64 // We check \r and \f here, since we have no preprocessing stage
80 return (cc == '\r' || cc == '\n' || cc == '\f'); 65 return (cc == '\r' || cc == '\n' || cc == '\f');
81 } 66 }
82 67
83 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape 68 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape
84 static bool twoCharsAreValidEscape(UChar first, UChar second) { 69 static bool twoCharsAreValidEscape(UChar first, UChar second) {
85 return first == '\\' && !isNewLine(second); 70 return first == '\\' && !isNewLine(second);
86 } 71 }
87 72
(...skipping 586 matching lines...) Expand 10 before | Expand all | Expand 10 after
674 bool areIdentifier = nextCharsAreIdentifier(first); 659 bool areIdentifier = nextCharsAreIdentifier(first);
675 reconsume(first); 660 reconsume(first);
676 return areIdentifier; 661 return areIdentifier;
677 } 662 }
678 663
679 StringView CSSTokenizer::registerString(const String& string) { 664 StringView CSSTokenizer::registerString(const String& string) {
680 m_stringPool.push_back(string); 665 m_stringPool.push_back(string);
681 return string; 666 return string;
682 } 667 }
683 668
669 bool isCSSIdentCharacter(UChar cc) {
670 return isASCIIAlphanumeric(cc) || cc == '-' || cc == '_' || cc > 128;
671 }
672
673 // If the next character is an ASCII case-insensitive match for cc, skips that.
674 // Otherwise, skips the remaining ident characters.
675 bool CSSTokenizer::skipIdentCharacters(UChar cc) {
676 UChar nextChar = consume();
677 if (isASCIIAlphaCaselessEqual(nextChar, cc))
Charlie Harrison 2017/01/09 21:35:07 Hm, this function has a compiler hint that the bra
678 return true;
679 if (nextChar == '\\') {
680 if (m_input.peekWithoutReplacement(0) == '\n') {
681 m_input.advance();
682 return false;
683 }
684 if (isASCIIAlphaCaselessEqual(consumeEscape(), cc))
685 return true;
686 }
687 skipIdentCharacters();
688 return false;
689 }
690
691 void CSSTokenizer::skipIdentCharacters() {
692 while (true) {
693 UChar nextChar = m_input.peekWithoutReplacement(0);
694 if (nextChar == '\\') {
695 if (m_input.peekWithoutReplacement(1) == '\n') {
696 m_input.advance(2);
697 return;
698 }
699 m_input.advance();
700 consumeEscape();
701 }
702 if (!isCSSIdentCharacter(nextChar))
703 return;
704 m_input.advance();
705 }
706 }
707
708 void CSSTokenizer::skipToBlockEnd() {
709 DCHECK(!m_blockStack.isEmpty());
710 DCHECK_EQ(m_blockStack.back(), LeftBraceToken);
711 int nesting = 1;
712 do {
713 UChar nextChar = m_input.peekWithoutReplacement(0);
714
715 // urls are special as their error recovery doesn't match blocks.
716 // url(([{) is a single <bad-url-token>, while moo(([{) needs }])) to
717 // close the component value. We need enough tokenization logic to be
718 // able to identify urls.
719 // Technically we should handle our non-standard unicode range tokens
720 // but probably no one will encounter these cases.
721 if (isCSSIdentCharacter(nextChar) || nextChar == '\\') {
722 if (!skipIdentCharacters('u') || !skipIdentCharacters('r') ||
723 !skipIdentCharacters('l'))
724 continue;
725
726 if (m_input.peekWithoutReplacement(0) != '(') {
727 skipIdentCharacters();
728 continue;
729 }
730
731 skipWhitespaceAndComments();
732 UChar innerChar = m_input.nextInputChar();
733 if (innerChar == '"' || innerChar == '\'') {
734 nesting++;
735 m_blockStack.append(LeftParenthesisToken);
736 continue;
737 }
738 while (innerChar != ')' && innerChar != '\0') {
739 m_input.advance();
740 if (innerChar == '\\') {
741 if (m_input.peekWithoutReplacement(0) == '\n')
742 m_input.advance();
743 else
744 consumeEscape();
745 }
746 innerChar = m_input.nextInputChar();
747 }
748 m_input.advance();
749 continue;
750 }
751
752 m_input.advance();
753
754 switch (nextChar) {
755 case '/':
756 if (m_input.peekWithoutReplacement(0) == '*') {
757 m_input.advance();
758 consumeUntilCommentEndFound();
759 }
760 break;
761 case '"':
762 case '\'': {
763 while (true) {
764 UChar cc = m_input.peekWithoutReplacement(0);
765 m_input.advance(1);
766 if (cc == nextChar || isNewLine(cc))
767 break;
768 if (cc == '\\') {
769 // This is to handle escapes delimited by newlines
770 consumeEscape();
771 continue;
772 }
773 if (cc == '\0' && m_input.nextInputChar() == '\0')
774 return; // EOF
775 }
776 break;
777 }
778 case '<':
779 // We need to handle this so <!--url( makes an url token
780 if (m_input.peekWithoutReplacement(0) == '!' &&
781 m_input.peekWithoutReplacement(1) == '-' &&
782 m_input.peekWithoutReplacement(2) == '-') {
783 m_input.advance(3);
784 }
785 break;
786 case '#':
787 case '@':
788 skipIdentCharacters();
789 break;
790 case '(':
791 nesting++;
792 m_blockStack.append(LeftParenthesisToken);
793 break;
794 case '[':
795 nesting++;
796 m_blockStack.append(LeftBracketToken);
797 break;
798 case '{':
799 nesting++;
800 m_blockStack.append(LeftBraceToken);
801 break;
802 case ')':
803 if (nesting && m_blockStack.back() == LeftParenthesisToken) {
804 nesting--;
805 m_blockStack.pop_back();
806 }
807 break;
808 case ']':
809 if (nesting && m_blockStack.back() == LeftBracketToken) {
810 nesting--;
811 m_blockStack.pop_back();
812 }
813 break;
814 case '}':
815 if (nesting && m_blockStack.back() == LeftBraceToken) {
816 nesting--;
817 m_blockStack.pop_back();
818 }
819 break;
820 case '\0':
821 if (m_input.nextInputChar() == '\0')
822 return;
823 skipIdentCharacters();
824 break;
825 }
826 } while (nesting);
827 m_tokens.pop_back();
828 }
829
830 void CSSTokenizer::skipComments() {
831 while (m_input.peekWithoutReplacement(0) == '/' &&
832 m_input.peekWithoutReplacement(1) == '*') {
833 m_input.advance(2);
834 consumeUntilCommentEndFound();
835 }
836 }
837
838 void CSSTokenizer::skipWhitespaceAndComments() {
839 while (true) {
840 m_input.advanceUntilNonWhitespace();
841 if (m_input.peekWithoutReplacement(0) == '/' &&
842 m_input.peekWithoutReplacement(1) == '*') {
843 m_input.advance(2);
844 consumeUntilCommentEndFound();
845 } else {
846 break;
847 }
848 }
849 }
850
851 void CSSTokenizer::yieldComments(CSSParserObserver& observer) {
852 while (m_input.peekWithoutReplacement(0) == '/' &&
853 m_input.peekWithoutReplacement(1) == '*') {
854 size_t startOffset = m_input.offset();
855 m_input.advance(2);
856 consumeUntilCommentEndFound();
857 observer.observeComment(startOffset, m_input.offset());
858 }
859 }
860
684 } // namespace blink 861 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698