Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "core/css/parser/CSSTokenizer.h" | 5 #include "core/css/parser/CSSTokenizer.h" |
| 6 | 6 |
| 7 namespace blink { | 7 namespace blink { |
| 8 #include "core/CSSTokenizerCodepoints.cpp" | 8 #include "core/CSSTokenizerCodepoints.cpp" |
| 9 } | 9 } |
| 10 | 10 |
| 11 #include "core/css/parser/CSSParserIdioms.h" | 11 #include "core/css/parser/CSSParserIdioms.h" |
| 12 #include "core/css/parser/CSSParserObserverWrapper.h" | 12 #include "core/css/parser/CSSParserObserver.h" |
| 13 #include "core/css/parser/CSSParserTokenRange.h" | 13 #include "core/css/parser/CSSParserTokenRange.h" |
| 14 #include "core/html/parser/HTMLParserIdioms.h" | 14 #include "core/html/parser/HTMLParserIdioms.h" |
| 15 #include "wtf/text/CharacterNames.h" | 15 #include "wtf/text/CharacterNames.h" |
| 16 | 16 |
| 17 namespace blink { | 17 namespace blink { |
| 18 | 18 |
| 19 CSSTokenizer::CSSTokenizer(const String& string) : m_input(string) { | 19 // We handle input preprocessing substitutions during tokenization: |
| 20 // According to the spec, we should perform preprocessing here. | 20 // * We also accept \r and \f as white space |
| 21 // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing | 21 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement |
| 22 // | 22 // characters |
| 23 // However, we can skip this step since: | |
| 24 // * We're using HTML spaces (which accept \r and \f as a valid white space) | |
| 25 // * Do not count white spaces | |
| 26 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement | |
| 27 // characters | |
| 28 | 23 |
| 29 if (string.isEmpty()) | 24 CSSTokenizer::CSSTokenizer(const String& string, size_t startOffset) |
| 30 return; | 25 : m_input(string) { |
| 26 m_input.advance(startOffset); | |
|
Charlie Harrison
2017/01/09 21:35:07
It looks like advance() does not change the underl
| |
| 27 } | |
| 31 | 28 |
| 32 // To avoid resizing we err on the side of reserving too much space. | 29 CSSParserTokenRange CSSTokenizer::tokenRange() { |
| 33 // Most strings we tokenize have about 3.5 to 5 characters per token. | 30 if (m_finishedTokenizing) |
| 34 m_tokens.reserveInitialCapacity(string.length() / 3); | 31 return m_tokens; |
|
Charlie Harrison
2017/01/09 21:35:07
w000t!
| |
| 32 | |
| 33 // Try to avoid resizing the Vector by reserving space. | |
| 34 m_tokens.reserveInitialCapacity(m_input.length() / 3); | |
| 35 | 35 |
| 36 while (true) { | 36 while (true) { |
| 37 CSSParserToken token = nextToken(); | 37 CSSParserToken token = nextToken(); |
| 38 if (token.type() == CommentToken) | 38 if (token.type() == CommentToken) |
| 39 continue; | 39 continue; |
| 40 if (token.type() == EOFToken) | 40 if (token.type() == EOFToken) |
| 41 return; | 41 break; |
| 42 m_tokens.push_back(token); | 42 m_tokens.push_back(token); |
| 43 } | 43 } |
| 44 m_finishedTokenizing = true; | |
| 45 return m_tokens; | |
| 46 } | |
| 47 | |
| 48 void CSSTokenizer::tokenizeSingle() { | |
|
Charlie Harrison
2017/01/09 21:35:07
Maybe tokenizeSingle could return something useful
| |
| 49 if (m_finishedTokenizing) | |
| 50 return; | |
| 51 while (true) { | |
| 52 CSSParserToken token = nextToken(); | |
| 53 if (token.type() == CommentToken) | |
| 54 continue; | |
| 55 if (token.type() == EOFToken) | |
| 56 m_finishedTokenizing = true; | |
| 57 else | |
| 58 m_tokens.push_back(token); | |
| 59 return; | |
| 60 } | |
| 44 } | 61 } |
| 45 | 62 |
| 46 CSSTokenizer::CSSTokenizer(const String& string, | |
| 47 CSSParserObserverWrapper& wrapper) | |
| 48 : m_input(string) { | |
| 49 if (string.isEmpty()) | |
| 50 return; | |
| 51 | |
| 52 unsigned offset = 0; | |
| 53 while (true) { | |
| 54 CSSParserToken token = nextToken(); | |
| 55 if (token.type() == EOFToken) | |
| 56 break; | |
| 57 if (token.type() == CommentToken) { | |
| 58 wrapper.addComment(offset, m_input.offset(), m_tokens.size()); | |
| 59 } else { | |
| 60 m_tokens.push_back(token); | |
| 61 wrapper.addToken(offset); | |
| 62 } | |
| 63 offset = m_input.offset(); | |
| 64 } | |
| 65 | |
| 66 wrapper.addToken(offset); | |
| 67 wrapper.finalizeConstruction(m_tokens.begin()); | |
| 68 } | |
| 69 | |
| 70 CSSParserTokenRange CSSTokenizer::tokenRange() { | |
| 71 return m_tokens; | |
| 72 } | |
| 73 | |
| 74 unsigned CSSTokenizer::tokenCount() { | |
| 75 return m_tokens.size(); | |
| 76 } | |
| 77 | |
| 78 static bool isNewLine(UChar cc) { | 63 static bool isNewLine(UChar cc) { |
| 79 // We check \r and \f here, since we have no preprocessing stage | 64 // We check \r and \f here, since we have no preprocessing stage |
| 80 return (cc == '\r' || cc == '\n' || cc == '\f'); | 65 return (cc == '\r' || cc == '\n' || cc == '\f'); |
| 81 } | 66 } |
| 82 | 67 |
| 83 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape | 68 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape |
| 84 static bool twoCharsAreValidEscape(UChar first, UChar second) { | 69 static bool twoCharsAreValidEscape(UChar first, UChar second) { |
| 85 return first == '\\' && !isNewLine(second); | 70 return first == '\\' && !isNewLine(second); |
| 86 } | 71 } |
| 87 | 72 |
| (...skipping 586 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 674 bool areIdentifier = nextCharsAreIdentifier(first); | 659 bool areIdentifier = nextCharsAreIdentifier(first); |
| 675 reconsume(first); | 660 reconsume(first); |
| 676 return areIdentifier; | 661 return areIdentifier; |
| 677 } | 662 } |
| 678 | 663 |
| 679 StringView CSSTokenizer::registerString(const String& string) { | 664 StringView CSSTokenizer::registerString(const String& string) { |
| 680 m_stringPool.push_back(string); | 665 m_stringPool.push_back(string); |
| 681 return string; | 666 return string; |
| 682 } | 667 } |
| 683 | 668 |
| 669 bool isCSSIdentCharacter(UChar cc) { | |
| 670 return isASCIIAlphanumeric(cc) || cc == '-' || cc == '_' || cc > 128; | |
| 671 } | |
| 672 | |
| 673 // If the next character is an ASCII insensitive match for cc, skips that. | |
| 674 // Otherwise, skips the remaining ident characters. | |
| 675 bool CSSTokenizer::skipIdentCharacters(UChar cc) { | |
| 676 UChar nextChar = consume(); | |
| 677 if (isASCIIAlphaCaselessEqual(nextChar, cc)) | |
|
Charlie Harrison
2017/01/09 21:35:07
Hm, this function has a compiler hint that the bra
| |
| 678 return true; | |
| 679 if (nextChar == '\\') { | |
| 680 if (m_input.peekWithoutReplacement(0) == '\n') { | |
| 681 m_input.advance(); | |
| 682 return false; | |
| 683 } | |
| 684 if (isASCIIAlphaCaselessEqual(consumeEscape(), cc)) | |
| 685 return true; | |
| 686 } | |
| 687 skipIdentCharacters(); | |
| 688 return false; | |
| 689 } | |
| 690 | |
| 691 void CSSTokenizer::skipIdentCharacters() { | |
| 692 while (true) { | |
| 693 UChar nextChar = m_input.peekWithoutReplacement(0); | |
| 694 if (nextChar == '\\') { | |
| 695 if (m_input.peekWithoutReplacement(1) == '\n') { | |
| 696 m_input.advance(2); | |
| 697 return; | |
| 698 } | |
| 699 m_input.advance(); | |
| 700 consumeEscape(); | |
| 701 } | |
| 702 if (!isCSSIdentCharacter(nextChar)) | |
| 703 return; | |
| 704 m_input.advance(); | |
| 705 } | |
| 706 } | |
| 707 | |
| 708 void CSSTokenizer::skipToBlockEnd() { | |
| 709 DCHECK(!m_blockStack.isEmpty()); | |
| 710 DCHECK_EQ(m_blockStack.back(), LeftBraceToken); | |
| 711 int nesting = 1; | |
| 712 do { | |
| 713 UChar nextChar = m_input.peekWithoutReplacement(0); | |
| 714 | |
| 715 // url's are special as their error recovery doesn't match blocks. | |
| 716 // url(([{) is a single <bad-url-token>, while moo(([{) needs }])) to | |
| 717 // close the component value. We need enough tokenization logic to be | |
| 718 // able to identify urls. | |
| 719 // Technically we should handle our non-standard unicode range tokens | |
| 720 // but probably no one will encounter these cases. | |
| 721 if (isCSSIdentCharacter(nextChar) || nextChar == '\\') { | |
| 722 if (!skipIdentCharacters('u') || !skipIdentCharacters('r') || | |
| 723 !skipIdentCharacters('l')) | |
| 724 continue; | |
| 725 | |
| 726 if (m_input.peekWithoutReplacement(0) != '(') { | |
| 727 skipIdentCharacters(); | |
| 728 continue; | |
| 729 } | |
| 730 | |
| 731 skipWhitespaceAndComments(); | |
| 732 UChar innerChar = m_input.nextInputChar(); | |
| 733 if (innerChar == '"' || innerChar == '\'') { | |
| 734 nesting++; | |
| 735 m_blockStack.append(LeftParenthesisToken); | |
| 736 continue; | |
| 737 } | |
| 738 while (innerChar != ')' && innerChar != '\0') { | |
| 739 m_input.advance(); | |
| 740 if (innerChar == '\\') { | |
| 741 if (m_input.peekWithoutReplacement(0) == '\n') | |
| 742 m_input.advance(); | |
| 743 else | |
| 744 consumeEscape(); | |
| 745 } | |
| 746 innerChar = m_input.nextInputChar(); | |
| 747 } | |
| 748 m_input.advance(); | |
| 749 continue; | |
| 750 } | |
| 751 | |
| 752 m_input.advance(); | |
| 753 | |
| 754 switch (nextChar) { | |
| 755 case '/': | |
| 756 if (m_input.peekWithoutReplacement(0) == '*') { | |
| 757 m_input.advance(); | |
| 758 consumeUntilCommentEndFound(); | |
| 759 } | |
| 760 break; | |
| 761 case '"': | |
| 762 case '\'': { | |
| 763 while (true) { | |
| 764 UChar cc = m_input.peekWithoutReplacement(0); | |
| 765 m_input.advance(1); | |
| 766 if (cc == nextChar || isNewLine(cc)) | |
| 767 break; | |
| 768 if (cc == '\\') { | |
| 769 // This is to handle escapes delimited by newlines | |
| 770 consumeEscape(); | |
| 771 continue; | |
| 772 } | |
| 773 if (cc == '\0' && m_input.nextInputChar() == '\0') | |
| 774 return; // EOF | |
| 775 } | |
| 776 break; | |
| 777 } | |
| 778 case '<': | |
| 779 // We need to handle this so <!--url( makes an url token | |
| 780 if (m_input.peekWithoutReplacement(0) == '!' && | |
| 781 m_input.peekWithoutReplacement(1) == '-' && | |
| 782 m_input.peekWithoutReplacement(2) == '-') { | |
| 783 m_input.advance(3); | |
| 784 } | |
| 785 break; | |
| 786 case '#': | |
| 787 case '@': | |
| 788 skipIdentCharacters(); | |
| 789 break; | |
| 790 case '(': | |
| 791 nesting++; | |
| 792 m_blockStack.append(LeftParenthesisToken); | |
| 793 break; | |
| 794 case '[': | |
| 795 nesting++; | |
| 796 m_blockStack.append(LeftBracketToken); | |
| 797 break; | |
| 798 case '{': | |
| 799 nesting++; | |
| 800 m_blockStack.append(LeftBraceToken); | |
| 801 break; | |
| 802 case ')': | |
| 803 if (nesting && m_blockStack.back() == LeftParenthesisToken) { | |
| 804 nesting--; | |
| 805 m_blockStack.pop_back(); | |
| 806 } | |
| 807 break; | |
| 808 case ']': | |
| 809 if (nesting && m_blockStack.back() == LeftBracketToken) { | |
| 810 nesting--; | |
| 811 m_blockStack.pop_back(); | |
| 812 } | |
| 813 break; | |
| 814 case '}': | |
| 815 if (nesting && m_blockStack.back() == LeftBraceToken) { | |
| 816 nesting--; | |
| 817 m_blockStack.pop_back(); | |
| 818 } | |
| 819 break; | |
| 820 case '\0': | |
| 821 if (m_input.nextInputChar() == '\0') | |
| 822 return; | |
| 823 skipIdentCharacters(); | |
| 824 break; | |
| 825 } | |
| 826 } while (nesting); | |
| 827 m_tokens.pop_back(); | |
| 828 } | |
| 829 | |
| 830 void CSSTokenizer::skipComments() { | |
| 831 while (m_input.peekWithoutReplacement(0) == '/' && | |
| 832 m_input.peekWithoutReplacement(1) == '*') { | |
| 833 m_input.advance(2); | |
| 834 consumeUntilCommentEndFound(); | |
| 835 } | |
| 836 } | |
| 837 | |
| 838 void CSSTokenizer::skipWhitespaceAndComments() { | |
| 839 while (true) { | |
| 840 m_input.advanceUntilNonWhitespace(); | |
| 841 if (m_input.peekWithoutReplacement(0) == '/' && | |
| 842 m_input.peekWithoutReplacement(1) == '*') { | |
| 843 m_input.advance(2); | |
| 844 consumeUntilCommentEndFound(); | |
| 845 } else { | |
| 846 break; | |
| 847 } | |
| 848 } | |
| 849 } | |
| 850 | |
| 851 void CSSTokenizer::yieldComments(CSSParserObserver& observer) { | |
| 852 while (m_input.peekWithoutReplacement(0) == '/' && | |
| 853 m_input.peekWithoutReplacement(1) == '*') { | |
| 854 size_t startOffset = m_input.offset(); | |
| 855 m_input.advance(2); | |
| 856 consumeUntilCommentEndFound(); | |
| 857 observer.observeComment(startOffset, m_input.offset()); | |
| 858 } | |
| 859 } | |
| 860 | |
| 684 } // namespace blink | 861 } // namespace blink |
| OLD | NEW |