OLD | NEW |
---|---|
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "core/css/parser/CSSTokenizer.h" | 5 #include "core/css/parser/CSSTokenizer.h" |
6 | 6 |
7 namespace blink { | 7 namespace blink { |
8 #include "core/CSSTokenizerCodepoints.cpp" | 8 #include "core/CSSTokenizerCodepoints.cpp" |
9 } | 9 } |
10 | 10 |
11 #include "core/css/parser/CSSParserIdioms.h" | 11 #include "core/css/parser/CSSParserIdioms.h" |
12 #include "core/css/parser/CSSParserObserverWrapper.h" | 12 #include "core/css/parser/CSSParserObserver.h" |
13 #include "core/css/parser/CSSParserTokenRange.h" | 13 #include "core/css/parser/CSSParserTokenRange.h" |
14 #include "core/html/parser/HTMLParserIdioms.h" | 14 #include "core/html/parser/HTMLParserIdioms.h" |
15 #include "wtf/text/CharacterNames.h" | 15 #include "wtf/text/CharacterNames.h" |
16 | 16 |
17 namespace blink { | 17 namespace blink { |
18 | 18 |
19 CSSTokenizer::CSSTokenizer(const String& string) : m_input(string) { | 19 // We handle input preprocessing substitutions during tokenization: |
20 // According to the spec, we should perform preprocessing here. | 20 // * We also accept \r and \f as white space |
21 // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing | 21 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement |
22 // | 22 // characters |
23 // However, we can skip this step since: | |
24 // * We're using HTML spaces (which accept \r and \f as a valid white space) | |
25 // * Do not count white spaces | |
26 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement | |
27 // characters | |
28 | 23 |
29 if (string.isEmpty()) | 24 CSSTokenizer::CSSTokenizer(const String& string, size_t startOffset) |
30 return; | 25 : m_input(string) { |
26 m_input.advance(startOffset); | |
Charlie Harrison
2017/01/09 21:35:07
It looks like advance() does not change the underl
| |
27 } | |
31 | 28 |
32 // To avoid resizing we err on the side of reserving too much space. | 29 CSSParserTokenRange CSSTokenizer::tokenRange() { |
33 // Most strings we tokenize have about 3.5 to 5 characters per token. | 30 if (m_finishedTokenizing) |
34 m_tokens.reserveInitialCapacity(string.length() / 3); | 31 return m_tokens; |
Charlie Harrison
2017/01/09 21:35:07
w000t!
| |
32 | |
33 // Try to avoid resizing the Vector by reserving space. | |
34 m_tokens.reserveInitialCapacity(m_input.length() / 3); | |
35 | 35 |
36 while (true) { | 36 while (true) { |
37 CSSParserToken token = nextToken(); | 37 CSSParserToken token = nextToken(); |
38 if (token.type() == CommentToken) | 38 if (token.type() == CommentToken) |
39 continue; | 39 continue; |
40 if (token.type() == EOFToken) | 40 if (token.type() == EOFToken) |
41 return; | 41 break; |
42 m_tokens.push_back(token); | 42 m_tokens.push_back(token); |
43 } | 43 } |
44 m_finishedTokenizing = true; | |
45 return m_tokens; | |
46 } | |
47 | |
48 void CSSTokenizer::tokenizeSingle() { | |
Charlie Harrison
2017/01/09 21:35:07
Maybe tokenizeSingle could return something useful
| |
49 if (m_finishedTokenizing) | |
50 return; | |
51 while (true) { | |
52 CSSParserToken token = nextToken(); | |
53 if (token.type() == CommentToken) | |
54 continue; | |
55 if (token.type() == EOFToken) | |
56 m_finishedTokenizing = true; | |
57 else | |
58 m_tokens.push_back(token); | |
59 return; | |
60 } | |
44 } | 61 } |
45 | 62 |
46 CSSTokenizer::CSSTokenizer(const String& string, | |
47 CSSParserObserverWrapper& wrapper) | |
48 : m_input(string) { | |
49 if (string.isEmpty()) | |
50 return; | |
51 | |
52 unsigned offset = 0; | |
53 while (true) { | |
54 CSSParserToken token = nextToken(); | |
55 if (token.type() == EOFToken) | |
56 break; | |
57 if (token.type() == CommentToken) { | |
58 wrapper.addComment(offset, m_input.offset(), m_tokens.size()); | |
59 } else { | |
60 m_tokens.push_back(token); | |
61 wrapper.addToken(offset); | |
62 } | |
63 offset = m_input.offset(); | |
64 } | |
65 | |
66 wrapper.addToken(offset); | |
67 wrapper.finalizeConstruction(m_tokens.begin()); | |
68 } | |
69 | |
70 CSSParserTokenRange CSSTokenizer::tokenRange() { | |
71 return m_tokens; | |
72 } | |
73 | |
74 unsigned CSSTokenizer::tokenCount() { | |
75 return m_tokens.size(); | |
76 } | |
77 | |
78 static bool isNewLine(UChar cc) { | 63 static bool isNewLine(UChar cc) { |
79 // We check \r and \f here, since we have no preprocessing stage | 64 // We check \r and \f here, since we have no preprocessing stage |
80 return (cc == '\r' || cc == '\n' || cc == '\f'); | 65 return (cc == '\r' || cc == '\n' || cc == '\f'); |
81 } | 66 } |
82 | 67 |
83 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape | 68 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape |
84 static bool twoCharsAreValidEscape(UChar first, UChar second) { | 69 static bool twoCharsAreValidEscape(UChar first, UChar second) { |
85 return first == '\\' && !isNewLine(second); | 70 return first == '\\' && !isNewLine(second); |
86 } | 71 } |
87 | 72 |
(...skipping 586 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
674 bool areIdentifier = nextCharsAreIdentifier(first); | 659 bool areIdentifier = nextCharsAreIdentifier(first); |
675 reconsume(first); | 660 reconsume(first); |
676 return areIdentifier; | 661 return areIdentifier; |
677 } | 662 } |
678 | 663 |
679 StringView CSSTokenizer::registerString(const String& string) { | 664 StringView CSSTokenizer::registerString(const String& string) { |
680 m_stringPool.push_back(string); | 665 m_stringPool.push_back(string); |
681 return string; | 666 return string; |
682 } | 667 } |
683 | 668 |
669 bool isCSSIdentCharacter(UChar cc) { | |
670 return isASCIIAlphanumeric(cc) || cc == '-' || cc == '_' || cc > 128; | |
671 } | |
672 | |
673 // If the next character is an ASCII insensitive match for cc, skips that. | |
674 // Otherwise, skips the remaining ident characters. | |
675 bool CSSTokenizer::skipIdentCharacters(UChar cc) { | |
676 UChar nextChar = consume(); | |
677 if (isASCIIAlphaCaselessEqual(nextChar, cc)) | |
Charlie Harrison
2017/01/09 21:35:07
Hm, this function has a compiler hint that the bra
| |
678 return true; | |
679 if (nextChar == '\\') { | |
680 if (m_input.peekWithoutReplacement(0) == '\n') { | |
681 m_input.advance(); | |
682 return false; | |
683 } | |
684 if (isASCIIAlphaCaselessEqual(consumeEscape(), cc)) | |
685 return true; | |
686 } | |
687 skipIdentCharacters(); | |
688 return false; | |
689 } | |
690 | |
691 void CSSTokenizer::skipIdentCharacters() { | |
692 while (true) { | |
693 UChar nextChar = m_input.peekWithoutReplacement(0); | |
694 if (nextChar == '\\') { | |
695 if (m_input.peekWithoutReplacement(1) == '\n') { | |
696 m_input.advance(2); | |
697 return; | |
698 } | |
699 m_input.advance(); | |
700 consumeEscape(); | |
701 } | |
702 if (!isCSSIdentCharacter(nextChar)) | |
703 return; | |
704 m_input.advance(); | |
705 } | |
706 } | |
707 | |
708 void CSSTokenizer::skipToBlockEnd() { | |
709 DCHECK(!m_blockStack.isEmpty()); | |
710 DCHECK_EQ(m_blockStack.back(), LeftBraceToken); | |
711 int nesting = 1; | |
712 do { | |
713 UChar nextChar = m_input.peekWithoutReplacement(0); | |
714 | |
715 // url's are special as their error recovery doesn't match blocks. | |
716 // url(([{) is a single <bad-url-token>, while moo(([{) needs }])) to | |
717 // close the component value. We need enough tokenization logic to be | |
718 // able to identify urls. | |
719 // Technically we should handle our non-standard unicode range tokens | |
720 // but probably no one will encounter these cases. | |
721 if (isCSSIdentCharacter(nextChar) || nextChar == '\\') { | |
722 if (!skipIdentCharacters('u') || !skipIdentCharacters('r') || | |
723 !skipIdentCharacters('l')) | |
724 continue; | |
725 | |
726 if (m_input.peekWithoutReplacement(0) != '(') { | |
727 skipIdentCharacters(); | |
728 continue; | |
729 } | |
730 | |
731 skipWhitespaceAndComments(); | |
732 UChar innerChar = m_input.nextInputChar(); | |
733 if (innerChar == '"' || innerChar == '\'') { | |
734 nesting++; | |
735 m_blockStack.append(LeftParenthesisToken); | |
736 continue; | |
737 } | |
738 while (innerChar != ')' && innerChar != '\0') { | |
739 m_input.advance(); | |
740 if (innerChar == '\\') { | |
741 if (m_input.peekWithoutReplacement(0) == '\n') | |
742 m_input.advance(); | |
743 else | |
744 consumeEscape(); | |
745 } | |
746 innerChar = m_input.nextInputChar(); | |
747 } | |
748 m_input.advance(); | |
749 continue; | |
750 } | |
751 | |
752 m_input.advance(); | |
753 | |
754 switch (nextChar) { | |
755 case '/': | |
756 if (m_input.peekWithoutReplacement(0) == '*') { | |
757 m_input.advance(); | |
758 consumeUntilCommentEndFound(); | |
759 } | |
760 break; | |
761 case '"': | |
762 case '\'': { | |
763 while (true) { | |
764 UChar cc = m_input.peekWithoutReplacement(0); | |
765 m_input.advance(1); | |
766 if (cc == nextChar || isNewLine(cc)) | |
767 break; | |
768 if (cc == '\\') { | |
769 // This is to handle escapes delimited by newlines | |
770 consumeEscape(); | |
771 continue; | |
772 } | |
773 if (cc == '\0' && m_input.nextInputChar() == '\0') | |
774 return; // EOF | |
775 } | |
776 break; | |
777 } | |
778 case '<': | |
779 // We need to handle this so <!--url( makes an url token | |
780 if (m_input.peekWithoutReplacement(0) == '!' && | |
781 m_input.peekWithoutReplacement(1) == '-' && | |
782 m_input.peekWithoutReplacement(2) == '-') { | |
783 m_input.advance(3); | |
784 } | |
785 break; | |
786 case '#': | |
787 case '@': | |
788 skipIdentCharacters(); | |
789 break; | |
790 case '(': | |
791 nesting++; | |
792 m_blockStack.append(LeftParenthesisToken); | |
793 break; | |
794 case '[': | |
795 nesting++; | |
796 m_blockStack.append(LeftBracketToken); | |
797 break; | |
798 case '{': | |
799 nesting++; | |
800 m_blockStack.append(LeftBraceToken); | |
801 break; | |
802 case ')': | |
803 if (nesting && m_blockStack.back() == LeftParenthesisToken) { | |
804 nesting--; | |
805 m_blockStack.pop_back(); | |
806 } | |
807 break; | |
808 case ']': | |
809 if (nesting && m_blockStack.back() == LeftBracketToken) { | |
810 nesting--; | |
811 m_blockStack.pop_back(); | |
812 } | |
813 break; | |
814 case '}': | |
815 if (nesting && m_blockStack.back() == LeftBraceToken) { | |
816 nesting--; | |
817 m_blockStack.pop_back(); | |
818 } | |
819 break; | |
820 case '\0': | |
821 if (m_input.nextInputChar() == '\0') | |
822 return; | |
823 skipIdentCharacters(); | |
824 break; | |
825 } | |
826 } while (nesting); | |
827 m_tokens.pop_back(); | |
828 } | |
829 | |
830 void CSSTokenizer::skipComments() { | |
831 while (m_input.peekWithoutReplacement(0) == '/' && | |
832 m_input.peekWithoutReplacement(1) == '*') { | |
833 m_input.advance(2); | |
834 consumeUntilCommentEndFound(); | |
835 } | |
836 } | |
837 | |
838 void CSSTokenizer::skipWhitespaceAndComments() { | |
839 while (true) { | |
840 m_input.advanceUntilNonWhitespace(); | |
841 if (m_input.peekWithoutReplacement(0) == '/' && | |
842 m_input.peekWithoutReplacement(1) == '*') { | |
843 m_input.advance(2); | |
844 consumeUntilCommentEndFound(); | |
845 } else { | |
846 break; | |
847 } | |
848 } | |
849 } | |
850 | |
851 void CSSTokenizer::yieldComments(CSSParserObserver& observer) { | |
852 while (m_input.peekWithoutReplacement(0) == '/' && | |
853 m_input.peekWithoutReplacement(1) == '*') { | |
854 size_t startOffset = m_input.offset(); | |
855 m_input.advance(2); | |
856 consumeUntilCommentEndFound(); | |
857 observer.observeComment(startOffset, m_input.offset()); | |
858 } | |
859 } | |
860 | |
684 } // namespace blink | 861 } // namespace blink |
OLD | NEW |