Source/core/css/parser/NewCSSTokenizer.cpp - Issue 123053002: Add very basic CSS3 Syntax compatible tokenizer

Side by Side Diff: Source/core/css/parser/NewCSSTokenizer.cpp

Issue 123053002: Add very basic CSS3 Syntax compatible tokenizer Base URL: svn://svn.chromium.org/blink/trunk

Patch Set: Add CSSToken file Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 /*

	2 * Copyright (C) 2013 Google Inc. All rights reserved.

	3 *

	4 * Redistribution and use in source and binary forms, with or without

	5 * modification, are permitted provided that the following conditions are

	6 * met:

	7 *

	8 * * Redistributions of source code must retain the above copyright

	9 * notice, this list of conditions and the following disclaimer.

	10 * * Redistributions in binary form must reproduce the above

	11 * copyright notice, this list of conditions and the following disclaimer

	12 * in the documentation and/or other materials provided with the

	13 * distribution.

	14 * * Neither the name of Google Inc. nor the names of its

	15 * contributors may be used to endorse or promote products derived from

	16 * this software without specific prior written permission.

	17 *

	18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

	19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

	20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

	21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

	22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

	23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

	24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

	25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

	26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

	27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

	28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	29 */

	30

	#include "config.h"
	#include "core/css/parser/NewCSSTokenizer.h"

	#include "core/css/parser/CSSParserIdioms.h"
	#include "platform/text/SegmentedString.h"
	#include "wtf/TemporaryChange.h"
	#include "wtf/text/StringBuilder.h"
	#include "wtf/unicode/CharacterNames.h"

	38

	39 namespace WebCore {

	40

	41 CSSInputStream::CSSInputStream(String input)

	42 : m_offset(0)

	43 , m_string(input)

	44 {

	45 m_string.append(kEndOfFileMarker);
	abarth-chromium 2014/01/01 18:47:51 Hum... String::append is monstrously slow... Hum... String::append is monstrously slow...
	46 }

	47

	48 UChar CSSInputStream::currentInputChar()

	49 {

	50 ASSERT(m_offset < m_string.length());

	51 return m_string[m_offset];

	52 }

	53

	54 UChar CSSInputStream::nextInputChar()

	55 {

	56 return m_string[m_offset + 1];

	57 }

	58

	59 UChar CSSInputStream::peek2()

	60 {

	61 return m_string[m_offset + 2];

	62 }

	63

	64 UChar CSSInputStream::peek3()

	65 {

	66 return m_string[m_offset + 3];

	67 }

	68

	69 void CSSInputStream::advance()

	70 {

	71 m_offset++;

	72 }

	73

	74 void CSSInputStream::pushBack(UChar cc)

	75 {

	76 m_offset--;

	77 ASSERT(currentInputChar() == cc);

	78 }

	79

	80 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point

	81 static bool isNameStart(UChar c)

	82 {

	83 if (isASCIIAlpha(c))

	84 return true;

	85 if (c == '_')

	86 return true;

	87 return !isASCII(c);

	88 }

	89

	90 // http://www.w3.org/TR/css-syntax-3/#name-code-point

	91 static bool isNameChar(UChar c)

	92 {

	93 return isNameStart(c) \|\| isASCIIDigit(c) \|\| c == '-';

	94 }

	95

	96 NewCSSTokenizer::NewCSSTokenizer()

	97 {

	98 }

	99

	100 void NewCSSTokenizer::reconsume(UChar c)

	101 {

	102 m_input->pushBack(c);

	103 }

	104

	105 UChar NewCSSTokenizer::consume()

	106 {

	107 UChar current = m_input->currentInputChar();

	108 m_input->advance();

	109 return current;

	110 }

	111

	112 CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input)

	113 {

	114 // Unlike the HTMLTokenizer, the CSS Syntax spec is written

	115 // as a stateless, (fixed-size) look-ahead tokenizer.

	116 // We could move to the stateful model and instead create

	117 // states for all the "next 3 codepoints are X" cases.

	118 // State-machine tokenizers are easier to write to handle

	119 // incremental tokenization of partial sources.

	120 // However, for now we follow the spec exactly.

	121 m_input = &input;

	122 UChar cc = consume();

	123

	124 if (isCSSSpace(cc)) {
	abarth-chromium 2014/01/01 18:47:51 I bet it's faster to implement this if-cascade usi I bet it's faster to implement this if-cascade using a lookup table.
	125 // CSS Tokenization is currently lossy, but we could record

	126 // the exact whitespace instead of discarding it here.

	127 consumeUntilNotWhitespace();

	128 return CSSToken(WhitespaceToken);

	129 }

	130 if (cc == '\"' \|\| cc == '\'')

	131 return consumeStringTokenUntil(cc);

	132 if (cc == '#') {

	133 if (nextCharIsName() \|\| nextTwoCharsAreValidEscape()) {

	134 HashTokenType hashType = UnrestrictedHashToken;

	135 if (nextCharsAreIdentifier())

	136 hashType = IdHashToken;

	137 return CSSToken(HashToken, consumeName(), hashType);

	138 }

	139 return CSSToken(DelimToken, cc);

	140 }

	141 if (cc == '$') {

	142 if (consumeIfNext('='))

	143 return CSSToken(SuffixMatchToken);

	144 return CSSToken(DelimToken, cc);

	145 }

	146 if (cc == '(')

	147 return CSSToken(LeftParenToken);

	148 if (cc == ')')

	149 return CSSToken(RightParenToken);

	150 if (cc == '*') {

	151 if (consumeIfNext('='))

	152 return CSSToken(SubstringMatchToken);

	153 return CSSToken(DelimToken, cc);

	154 }

	155 if (cc == '+' \|\| cc == '.') {

	156 if (nextCharsAreNumber()) {

	157 reconsume(cc);

	158 return consumeNumericToken();

	159 }

	160 return CSSToken(DelimToken, cc);

	161 }

	162 if (cc == ',')

	163 return CSSToken(CommaToken);

	164 if (cc == '-') {

	165 if (nextCharsAreNumber()) {

	166 reconsume(cc);

	167 return consumeNumericToken();

	168 }

	169 if (nextCharsAreIdentifier()) {

	170 reconsume(cc);

	171 return consumeIdentLikeToken();

	172 }

	173 if (consumeIfNext("->"))

	174 return CSSToken(CDCToken);

	175 return CSSToken(DelimToken, cc);

	176 }

	177 if (cc == '/') {

	178 if (consumeIfNext('*')) {

	179 consumeThroughCommentEndOrUntilEOF();

	180 return nextToken(*m_input);

	181 }

	182 return CSSToken(DelimToken, cc);

	183 }

	184 if (cc == ':')

	185 return CSSToken(ColonToken);

	186 if (cc == ';')

	187 return CSSToken(SemicolonToken);

	188 if (cc == '<') {

	189 if (consumeIfNext("!--"))

	190 return CSSToken(CDOToken);

	191 return CSSToken(DelimToken, cc);

	192 }

	193 if (cc == '@') {

	194 if (nextCharsAreIdentifier())

	195 return CSSToken(AtKeywordToken, consumeName());

	196 return CSSToken(DelimToken, cc);

	197 }

	198 if (cc == '[')

	199 return CSSToken(LeftBracketToken);

	200 if (cc == '\\') {

	201 if (nextIsValidEscape()) {

	202 reconsume(cc);

	203 return consumeIdentLikeToken();

	204 }

	205 return CSSToken(DelimToken, cc);

	206 }

	207 if (cc == ']')

	208 return CSSToken(RightBracketToken);

	209 if (cc == '^') {

	210 if (consumeIfNext('='))

	211 return CSSToken(PrefixMatchToken);

	212 return CSSToken(DelimToken, cc);

	213 }

	214 if (cc == '{')

	215 return CSSToken(LeftBraceToken);

	216 if (cc == '{')

	217 return CSSToken(RightBraceToken);

	218 if (isASCIIDigit(cc))

	219 return consumeNumericToken();

	220 // if (cc == 'U' \|\| cc == 'u') {

	221 // // U+0055 LATIN CAPITAL LETTER U (U)

	222 // // U+0075 LATIN SMALL LETTER U (u)

	223 // // If the next 2 input code points are U+002B PLUS SIGN (+) followed by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. N ote: don’t consume both of them. Consume a unicode-range token and return it.

	224 // // Otherwise, reconsume the current input code point, consume an iden t-like token, and return it.

	225 // reconsume(cc);

	226 // return consumeIdentLikeToken();

	227 // }

	228 if (isNameStart(cc)) {

	229 reconsume(cc);

	230 return consumeIdentLikeToken();

	231 }

	232 if (cc == '\|') {

	233 if (consumeIfNext('='))

	234 return CSSToken(DashMatchToken);

	235 if (consumeIfNext('\|'))

	236 return CSSToken(ColumnToken);

	237 return CSSToken(DelimToken, cc);

	238 }

	239 if (cc == '~') {

	240 if (consumeIfNext('='))

	241 return CSSToken(IncludeMatchToken);

	242 return CSSToken(DelimToken, cc);

	243 }

	244 if (cc == kEndOfFileMarker)

	245 return CSSToken(EOFToken);

	246 return CSSToken(DelimToken, cc);

	247 }

	248

	249 CSSToken NewCSSTokenizer::consumeNumber()

	250 {

	251 ASSERT(nextCharsAreNumber());

	252 String repr;

	253 NumericValueType type = IntegerValueType;

	254 double value = 0;

	255

	256 // FIXME: Needs implementation.

	257 // http://dev.w3.org/csswg/css-syntax/#consume-a-number0

	258 return CSSToken(NumberToken, repr, value, type);

	259 }

	260

	261 CSSToken NewCSSTokenizer::consumeNumericToken()

	262 {

	263 CSSToken token = consumeNumber();

	264 if (nextCharsAreIdentifier())

	265 token.convertToDimensionWithUnit(consumeName());

	266 else if (consumeIfNext("%"))

	267 token.convertToPercentage();

	268 return token;

	269 }

	270

	271 CSSToken NewCSSTokenizer::consumeIdentLikeToken()

	272 {

	273 String name = consumeName();

	274 if (consumeIfNext('(')) {

	275 if (equalIgnoringCase(name, "url"))

	276 return consumeURLToken();

	277 return CSSToken(FunctionToken, name);

	278 }

	279 return CSSToken(IdentToken, name);

	280 }

	281

	282 CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)

	283 {

	284 // FIXME: Implement.

	285 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token

	286 return CSSToken(BadStringToken);

	287 }

	288

	289 CSSToken NewCSSTokenizer::consumeURLToken()

	290 {

	291 return CSSToken(BadURLToken);

	292 }

	293

	294 void NewCSSTokenizer::consumeUntilNotWhitespace()

	295 {

	296

	297 }

	298

	299 void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF()

	300 {

	301

	302 }

	303

	304 bool NewCSSTokenizer::consumeIfNext(UChar)

	305 {

	306 return false;

	307 }

	308

	309 bool NewCSSTokenizer::consumeIfNext(String)

	310 {

	311 return false;

	312 }

	313

	314 String NewCSSTokenizer::consumeName()

	315 {

	316 // FIXME: This is written to match the spec

	317 // but could be much more efficient.

	318 String result("");

	319 while (true) {

	320 if (isNameChar(m_input->currentInputChar())) {

	321 result.append(consume());
	abarth-chromium 2014/01/01 18:47:51 Please use StringBuilder rather than String. Stri Please use StringBuilder rather than String. String::append is so bad we should remove it from the API.
	322 continue;

	323 }

	324 if (nextTwoCharsAreValidEscape()) {

	325 consume(); // SPEC BUG: Emailed Tab.

	326 result.append(consumeEscape());

	327 continue;

	328 }

	329 return result;

	330 }

	331 }

	332

	333 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point

	334 UChar NewCSSTokenizer::consumeEscape()

	335 {

	336 UChar cc = consume();

	337 ASSERT(cc != '\n');

	338 if (isASCIIHexDigit(cc)) {

	339 unsigned consumedHexDigits = 1;

	340 String hexChars;
	abarth-chromium 2014/01/01 18:47:51 StringBuilder StringBuilder
	341 do {

	342 hexChars.append(cc);

	343 cc = consume();

	344 consumedHexDigits++;

	345 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc));
	abarth-chromium 2014/01/01 18:47:51 You can reserve capacity 6 in the StringBuilder to You can reserve capacity 6 in the StringBuilder to avoid reallocs.
	346 bool ok = false;

	347 UChar codePoint = hexChars.toUIntStrict(&ok, 16);
	abarth-chromium 2014/01/01 18:47:51 Oh, actually, you don't need to malloc at all in t Oh, actually, you don't need to malloc at all in this function. Just make a UChar[] on the stack and then call the underlying conversion function instead of mallocing up a string.
	348 if (!ok)

	349 return WTF::Unicode::replacementCharacter;

	350 return codePoint;

	351 }

	352 if (cc == kEndOfFileMarker)

	353 return WTF::Unicode::replacementCharacter;

	354 return cc;

	355 }

	356

	357 bool NewCSSTokenizer::nextIsValidEscape()

	358 {

	359 return false;

	360 }

	361

	362 bool NewCSSTokenizer::nextCharIsName()

	363 {

	364 return false;

	365 }

	366

	367 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap eare-a-valid-escapestarts-with-a-valid-escape

	368 bool NewCSSTokenizer::nextTwoCharsAreValidEscape()

	369 {

	370 UChar firstChar = m_input->nextInputChar();

	371 UChar secondChar = m_input->peek2();

	372 if (firstChar != '\\')

	373 return false;

	374 if (secondChar == '\n' \|\| secondChar == kEndOfFileMarker)

	375 return false;

	376 return true;

	377 }

	378

	379 bool NewCSSTokenizer::nextCharsAreNumber()

	380 {

	381 return false;

	382 }

	383

	384 bool NewCSSTokenizer::nextCharsAreIdentifier()

	385 {

	386 return false;

	387 }

	388

	389 } // namespace WebCore

OLD	NEW

« Source/core/css/parser/NewCSSTokenizer.h ('K') | « Source/core/css/parser/NewCSSTokenizer.h ('k') | Source/core/css/parser/NewCSSTokenizerTest.cpp » ('j') | no next file with comments »