third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp - Issue 2503683003: [WIP] Streaming CSS parser

Side by Side Diff: third_party/WebKit/Source/core/css/parser/CSSTokenizer.cpp

Issue 2503683003: [WIP] Streaming CSS parser (Closed)

Patch Set: rebase Created 3 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« third_party/WebKit/Source/core/css/parser/CSSSelectorParser.cpp ('K') | « third_party/WebKit/Source/core/css/parser/CSSTokenizer.h ('k') | third_party/WebKit/Source/core/inspector/InspectorStyleSheet.cpp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2014 The Chromium Authors. All rights reserved.	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "core/css/parser/CSSTokenizer.h"	5 #include "core/css/parser/CSSTokenizer.h"

6	6

7 namespace blink {	7 namespace blink {

8 #include "core/CSSTokenizerCodepoints.cpp"	8 #include "core/CSSTokenizerCodepoints.cpp"

9 }	9 }

10	10

11 #include "core/css/parser/CSSParserIdioms.h"	11 #include "core/css/parser/CSSParserIdioms.h"

12 #include "core/css/parser/CSSParserObserverWrapper.h"	12 #include "core/css/parser/CSSParserObserver.h"

13 #include "core/css/parser/CSSParserTokenRange.h"	13 #include "core/css/parser/CSSParserTokenRange.h"

14 #include "core/html/parser/HTMLParserIdioms.h"	14 #include "core/html/parser/HTMLParserIdioms.h"

15 #include "wtf/text/CharacterNames.h"	15 #include "wtf/text/CharacterNames.h"

16	16

17 namespace blink {	17 namespace blink {

18	18

19 CSSTokenizer::CSSTokenizer(const String& string) : m_input(string) {	19 // We handle input preprocessing substitutions during tokenization:

20 // According to the spec, we should perform preprocessing here.	20 // * We also accept \r and \f as white space

21 // See: http://dev.w3.org/csswg/css-syntax/#input-preprocessing	21 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement

22 //	22 // characters

23 // However, we can skip this step since:

24 // * We're using HTML spaces (which accept \r and \f as a valid white space)

25 // * Do not count white spaces

26 // * CSSTokenizerInputStream::nextInputChar() replaces NULLs for replacement

27 // characters

28	23

29 if (string.isEmpty())	24 CSSTokenizer::CSSTokenizer(const String& string, size_t startOffset)

30 return;	25 : m_input(string) {

	26 m_input.advance(startOffset);
	Charlie Harrison 2017/01/09 21:35:07 It looks like advance() does not change the underlying length. Should I be worried at the reserveInitialCapacity() call below?
	27 }

31	28

32 // To avoid resizing we err on the side of reserving too much space.	29 CSSParserTokenRange CSSTokenizer::tokenRange() {

33 // Most strings we tokenize have about 3.5 to 5 characters per token.	30 if (m_finishedTokenizing)

34 m_tokens.reserveInitialCapacity(string.length() / 3);	31 return m_tokens;
Charlie Harrison 2017/01/09 21:35:07 w000t!
	32

	33 // Try to avoid resizing the Vector by reserving space.

	34 m_tokens.reserveInitialCapacity(m_input.length() / 3);

35	35

36 while (true) {	36 while (true) {

37 CSSParserToken token = nextToken();	37 CSSParserToken token = nextToken();

38 if (token.type() == CommentToken)	38 if (token.type() == CommentToken)

39 continue;	39 continue;

40 if (token.type() == EOFToken)	40 if (token.type() == EOFToken)

41 return;	41 break;

42 m_tokens.push_back(token);	42 m_tokens.push_back(token);

43 }	43 }

	44 m_finishedTokenizing = true;

	45 return m_tokens;

	46 }

	47

	48 void CSSTokenizer::tokenizeSingle() {
	Charlie Harrison 2017/01/09 21:35:07 Maybe tokenizeSingle could return something useful, like whether we're finished tokenizing, or maybe the last token tokenized?
	49 if (m_finishedTokenizing)

	50 return;

	51 while (true) {

	52 CSSParserToken token = nextToken();

	53 if (token.type() == CommentToken)

	54 continue;

	55 if (token.type() == EOFToken)

	56 m_finishedTokenizing = true;

	57 else

	58 m_tokens.push_back(token);

	59 return;

	60 }

44 }	61 }

45	62

46 CSSTokenizer::CSSTokenizer(const String& string,

47 CSSParserObserverWrapper& wrapper)

48 : m_input(string) {

49 if (string.isEmpty())

50 return;

51

52 unsigned offset = 0;

53 while (true) {

54 CSSParserToken token = nextToken();

55 if (token.type() == EOFToken)

56 break;

57 if (token.type() == CommentToken) {

58 wrapper.addComment(offset, m_input.offset(), m_tokens.size());

59 } else {

60 m_tokens.push_back(token);

61 wrapper.addToken(offset);

62 }

63 offset = m_input.offset();

64 }

65

66 wrapper.addToken(offset);

67 wrapper.finalizeConstruction(m_tokens.begin());

68 }

69

70 CSSParserTokenRange CSSTokenizer::tokenRange() {

71 return m_tokens;

72 }

73

74 unsigned CSSTokenizer::tokenCount() {

75 return m_tokens.size();

76 }

77

78 static bool isNewLine(UChar cc) {	63 static bool isNewLine(UChar cc) {

79 // We check \r and \f here, since we have no preprocessing stage	64 // We check \r and \f here, since we have no preprocessing stage

80 return (cc == '\r' \|\| cc == '\n' \|\| cc == '\f');	65 return (cc == '\r' \|\| cc == '\n' \|\| cc == '\f');

81 }	66 }

82	67

83 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape	68 // http://dev.w3.org/csswg/css-syntax/#check-if-two-code-points-are-a-valid-escape

84 static bool twoCharsAreValidEscape(UChar first, UChar second) {	69 static bool twoCharsAreValidEscape(UChar first, UChar second) {

85 return first == '\\' && !isNewLine(second);	70 return first == '\\' && !isNewLine(second);

86 }	71 }

87	72

(...skipping 586 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
674 bool areIdentifier = nextCharsAreIdentifier(first);	659 bool areIdentifier = nextCharsAreIdentifier(first);

675 reconsume(first);	660 reconsume(first);

676 return areIdentifier;	661 return areIdentifier;

677 }	662 }

678	663

679 StringView CSSTokenizer::registerString(const String& string) {	664 StringView CSSTokenizer::registerString(const String& string) {

680 m_stringPool.push_back(string);	665 m_stringPool.push_back(string);

681 return string;	666 return string;

682 }	667 }

683	668

	669 bool isCSSIdentCharacter(UChar cc) {

	670 return isASCIIAlphanumeric(cc) \|\| cc == '-' \|\| cc == '_' \|\| cc > 128;

	671 }

	672

	673 // If the next character is an ASCII insensitive match for cc, skips that.

	674 // Otherwise, skips the remaining ident characters.

	675 bool CSSTokenizer::skipIdentCharacters(UChar cc) {

	676 UChar nextChar = consume();

	677 if (isASCIIAlphaCaselessEqual(nextChar, cc))
	Charlie Harrison 2017/01/09 21:35:07 Hm, this function has a compiler hint that the branch is likely to be true. I'm worried that might lead to poor branch prediction. WDYT?
	678 return true;

	679 if (nextChar == '\\') {

	680 if (m_input.peekWithoutReplacement(0) == '\n') {

	681 m_input.advance();

	682 return false;

	683 }

	684 if (isASCIIAlphaCaselessEqual(consumeEscape(), cc))

	685 return true;

	686 }

	687 skipIdentCharacters();

	688 return false;

	689 }

	690

	691 void CSSTokenizer::skipIdentCharacters() {

	692 while (true) {

	693 UChar nextChar = m_input.peekWithoutReplacement(0);

	694 if (nextChar == '\\') {

	695 if (m_input.peekWithoutReplacement(1) == '\n') {

	696 m_input.advance(2);

	697 return;

	698 }

	699 m_input.advance();

	700 consumeEscape();

	701 }

	702 if (!isCSSIdentCharacter(nextChar))

	703 return;

	704 m_input.advance();

	705 }

	706 }

	707

	708 void CSSTokenizer::skipToBlockEnd() {

	709 DCHECK(!m_blockStack.isEmpty());

	710 DCHECK_EQ(m_blockStack.back(), LeftBraceToken);

	711 int nesting = 1;

	712 do {

	713 UChar nextChar = m_input.peekWithoutReplacement(0);

	714

	715 // url's are special as their error recovery doesn't match blocks.

	716 // url(([{) is a single <bad-url-token>, while moo(([{) needs }])) to

	717 // close the component value. We need enough tokenization logic to be

	718 // able to identify urls.

	719 // Technically we should handle our non-standard unicode range tokens

	720 // but probably no one will encounter these cases.

	721 if (isCSSIdentCharacter(nextChar) \|\| nextChar == '\\') {

	722 if (!skipIdentCharacters('u') \|\| !skipIdentCharacters('r') \|\|

	723 !skipIdentCharacters('l'))

	724 continue;

	725

	726 if (m_input.peekWithoutReplacement(0) != '(') {

	727 skipIdentCharacters();

	728 continue;

	729 }

	730

	731 skipWhitespaceAndComments();

	732 UChar innerChar = m_input.nextInputChar();

	733 if (innerChar == '"' \|\| innerChar == '\'') {

	734 nesting++;

	735 m_blockStack.append(LeftParenthesisToken);

	736 continue;

	737 }

	738 while (innerChar != ')' && innerChar != '\0') {

	739 m_input.advance();

	740 if (innerChar == '\\') {

	741 if (m_input.peekWithoutReplacement(0) == '\n')

	742 m_input.advance();

	743 else

	744 consumeEscape();

	745 }

	746 innerChar = m_input.nextInputChar();

	747 }

	748 m_input.advance();

	749 continue;

	750 }

	751

	752 m_input.advance();

	753

	754 switch (nextChar) {

	755 case '/':

	756 if (m_input.peekWithoutReplacement(0) == '*') {

	757 m_input.advance();

	758 consumeUntilCommentEndFound();

	759 }

	760 break;

	761 case '"':

	762 case '\'': {

	763 while (true) {

	764 UChar cc = m_input.peekWithoutReplacement(0);

	765 m_input.advance(1);

	766 if (cc == nextChar \|\| isNewLine(cc))

	767 break;

	768 if (cc == '\\') {

	769 // This is to handle escapes delimited by newlines

	770 consumeEscape();

	771 continue;

	772 }

	773 if (cc == '\0' && m_input.nextInputChar() == '\0')

	774 return; // EOF

	775 }

	776 break;

	777 }

	778 case '<':

	779 // We need to handle this so <!--url( makes an url token

	780 if (m_input.peekWithoutReplacement(0) == '!' &&

	781 m_input.peekWithoutReplacement(1) == '-' &&

	782 m_input.peekWithoutReplacement(2) == '-') {

	783 m_input.advance(3);

	784 }

	785 break;

	786 case '#':

	787 case '@':

	788 skipIdentCharacters();

	789 break;

	790 case '(':

	791 nesting++;

	792 m_blockStack.append(LeftParenthesisToken);

	793 break;

	794 case '[':

	795 nesting++;

	796 m_blockStack.append(LeftBracketToken);

	797 break;

	798 case '{':

	799 nesting++;

	800 m_blockStack.append(LeftBraceToken);

	801 break;

	802 case ')':

	803 if (nesting && m_blockStack.back() == LeftParenthesisToken) {

	804 nesting--;

	805 m_blockStack.pop_back();

	806 }

	807 break;

	808 case ']':

	809 if (nesting && m_blockStack.back() == LeftBracketToken) {

	810 nesting--;

	811 m_blockStack.pop_back();

	812 }

	813 break;

	814 case '}':

	815 if (nesting && m_blockStack.back() == LeftBraceToken) {

	816 nesting--;

	817 m_blockStack.pop_back();

	818 }

	819 break;

	820 case '\0':

	821 if (m_input.nextInputChar() == '\0')

	822 return;

	823 skipIdentCharacters();

	824 break;

	825 }

	826 } while (nesting);

	827 m_tokens.pop_back();

	828 }

	829

	830 void CSSTokenizer::skipComments() {

	831 while (m_input.peekWithoutReplacement(0) == '/' &&

	832 m_input.peekWithoutReplacement(1) == '*') {

	833 m_input.advance(2);

	834 consumeUntilCommentEndFound();

	835 }

	836 }

	837

	838 void CSSTokenizer::skipWhitespaceAndComments() {

	839 while (true) {

	840 m_input.advanceUntilNonWhitespace();

	841 if (m_input.peekWithoutReplacement(0) == '/' &&

	842 m_input.peekWithoutReplacement(1) == '*') {

	843 m_input.advance(2);

	844 consumeUntilCommentEndFound();

	845 } else {

	846 break;

	847 }

	848 }

	849 }

	850

	851 void CSSTokenizer::yieldComments(CSSParserObserver& observer) {

	852 while (m_input.peekWithoutReplacement(0) == '/' &&

	853 m_input.peekWithoutReplacement(1) == '*') {

	854 size_t startOffset = m_input.offset();

	855 m_input.advance(2);

	856 consumeUntilCommentEndFound();

	857 observer.observeComment(startOffset, m_input.offset());

	858 }

	859 }

	860

684 } // namespace blink	861 } // namespace blink

OLD	NEW