Source/core/css/parser/MediaQueryTokenizer.cpp - Issue 171383002: A thread-safe Media Query Parser

Side by Side Diff: Source/core/css/parser/MediaQueryTokenizer.cpp

Issue 171383002: A thread-safe Media Query Parser (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master

Patch Set: Fixed gcc compile issues and debug asserts Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2014 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "config.h"

	6 #include "core/css/parser/MediaQueryTokenizer.h"

	7

	8 #include "core/css/parser/MediaQueryInputStream.h"

	9 #include "core/html/parser/HTMLParserIdioms.h"

	10 #include "wtf/unicode/CharacterNames.h"

	11

	12 namespace WebCore {

	13

	// Number of entries in the ASCII dispatch table. It must cover every code
	// point for which isASCII() is true (0x00..0x7F), i.e. 128 entries, because
	// nextToken() indexes the table with any ASCII code point, including U+007F.
	const unsigned codePointsNumber = SCHAR_MAX + 1;

	15

	16 class MediaQueryTokenizer::CodePoints {

	17 public:

	18 MediaQueryTokenizer::CodePoint codePoints[codePointsNumber];

	19

	20 CodePoints()

	21 {

	22 memset(codePoints, 0, codePointsNumber);

	23 codePoints['\n'] = &MediaQueryTokenizer::whiteSpace;

	24 codePoints['\r'] = &MediaQueryTokenizer::whiteSpace;

	25 codePoints['\t'] = &MediaQueryTokenizer::whiteSpace;

	26 codePoints[' '] = &MediaQueryTokenizer::whiteSpace;

	27 codePoints['\f'] = &MediaQueryTokenizer::whiteSpace;

	28 codePoints['('] = &MediaQueryTokenizer::leftParenthesis;

	29 codePoints[')'] = &MediaQueryTokenizer::rightParenthesis;

	30 codePoints['+'] = &MediaQueryTokenizer::plusOrFullStop;

	31 codePoints['.'] = &MediaQueryTokenizer::plusOrFullStop;

	32 codePoints[','] = &MediaQueryTokenizer::comma;

	33 codePoints['-'] = &MediaQueryTokenizer::hyphenMinus;

	34 codePoints['/'] = &MediaQueryTokenizer::solidus;

	35 codePoints[':'] = &MediaQueryTokenizer::colon;

	36 codePoints[';'] = &MediaQueryTokenizer::semiColon;

	37 codePoints['\\'] = &MediaQueryTokenizer::reverseSolidus;

	38 for (unsigned char digit = '0'; digit <= '9'; ++digit)

	39 codePoints[digit] = &MediaQueryTokenizer::asciiDigit;

	40 for (unsigned char alpha = 'a'; alpha <= 'z'; ++alpha)

	41 codePoints[alpha] = &MediaQueryTokenizer::nameStart;

	42 for (unsigned char alpha = 'A'; alpha <= 'Z'; ++alpha)

	43 codePoints[alpha] = &MediaQueryTokenizer::nameStart;

	44 codePoints['_'] = &MediaQueryTokenizer::nameStart;

	45 codePoints[kEndOfFileMarker] = &MediaQueryTokenizer::endOfFile;

	46 }

	47 };

	48

	49 MediaQueryTokenizer::CodePoints* MediaQueryTokenizer::codePoints()

	50 {

	51 static CodePoints codePoints;

	52 return &codePoints;

	53 }

	54

	55 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point

	56 static bool isNameStart(UChar c)

	57 {

	58 if (isASCIIAlpha(c))

	59 return true;

	60 if (c == '_')

	61 return true;

	62 return !isASCII(c);

	63 }

	64

	65 // http://www.w3.org/TR/css-syntax-3/#name-code-point

	66 static bool isNameChar(UChar c)

	67 {

	68 return isNameStart(c) \|\| isASCIIDigit(c) \|\| c == '-';

	69 }

	70

	71 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap e

	72 static bool twoCharsAreValidEscape(UChar first, UChar second)

	73 {

	74 return ((first == '\\') && (second != '\n') && (second != kEndOfFileMarker)) ;

	75 }

	76

	77 MediaQueryTokenizer::MediaQueryTokenizer()

	78 {

	79 }

	80

	81 void MediaQueryTokenizer::reconsume(UChar c)

	82 {

	83 m_input->pushBack(c);

	84 }

	85

	86 UChar MediaQueryTokenizer::consume()

	87 {

	88 UChar current = m_input->currentInputChar();

	89 m_input->advance();

	90 return current;

	91 }

	92

	93 void MediaQueryTokenizer::consume(unsigned offset)

	94 {

	95 m_input->advance(offset);

	96 }

	97

	98 MediaQueryToken MediaQueryTokenizer::whiteSpace(UChar cc)

	99 {

	100 // CSS Tokenization is currently lossy, but we could record

	101 // the exact whitespace instead of discarding it here.

	102 consumeUntilNonWhitespace();

	103 return MediaQueryToken(WhitespaceToken);

	104 }

	105

	106 MediaQueryToken MediaQueryTokenizer::leftParenthesis(UChar cc)

	107 {

	108 return MediaQueryToken(LeftParenthesisToken);

	109 }

	110

	111 MediaQueryToken MediaQueryTokenizer::rightParenthesis(UChar cc)

	112 {

	113 return MediaQueryToken(RightParenthesisToken);

	114 }

	115

	116 MediaQueryToken MediaQueryTokenizer::plusOrFullStop(UChar cc)

	117 {

	118 if (nextCharsAreNumber()) {

	119 reconsume(cc);

	120 return consumeNumericToken();

	121 }

	122 return MediaQueryToken(DelimiterToken, cc);

	123 }

	124

	125 MediaQueryToken MediaQueryTokenizer::comma(UChar cc)

	126 {

	127 return MediaQueryToken(CommaToken);

	128 }

	129

	130 MediaQueryToken MediaQueryTokenizer::hyphenMinus(UChar cc)

	131 {

	132 if (nextCharsAreNumber()) {

	133 reconsume(cc);

	134 return consumeNumericToken();

	135 }

	136 if (nextCharsAreIdentifier()) {

	137 reconsume(cc);

	138 return consumeIdentLikeToken();

	139 }

	140 return MediaQueryToken(DelimiterToken, cc);

	141 }

	142

	143 MediaQueryToken MediaQueryTokenizer::solidus(UChar cc)

	144 {

	145 return MediaQueryToken(DelimiterToken, cc);

	146 }

	147

	148 MediaQueryToken MediaQueryTokenizer::colon(UChar cc)

	149 {

	150 return MediaQueryToken(ColonToken);

	151 }

	152

	153 MediaQueryToken MediaQueryTokenizer::semiColon(UChar cc)

	154 {

	155 return MediaQueryToken(SemicolonToken);

	156 }

	157

	158 MediaQueryToken MediaQueryTokenizer::reverseSolidus(UChar cc)

	159 {

	160 if (twoCharsAreValidEscape(cc, m_input->currentInputChar())) {

	161 reconsume(cc);

	162 return consumeIdentLikeToken();

	163 }

	164 return MediaQueryToken(DelimiterToken, cc);

	165 }

	166

	167 MediaQueryToken MediaQueryTokenizer::asciiDigit(UChar cc)

	168 {

	169 reconsume(cc);

	170 return consumeNumericToken();

	171 }

	172

	173 MediaQueryToken MediaQueryTokenizer::nameStart(UChar cc)

	174 {

	175 reconsume(cc);

	176 return consumeIdentLikeToken();

	177 }

	178

	179 MediaQueryToken MediaQueryTokenizer::endOfFile(UChar cc)

	180 {

	181 return MediaQueryToken(EOFToken);

	182 }

	183

	184 void MediaQueryTokenizer::tokenize(String string, Vector<MediaQueryToken>& outTo kens)

	185 {

	186 MediaQueryTokenizer tokenizer;

	187 // According to the spec, we should perform preprocessing here.

	188 // See: http://www.w3.org/TR/css-syntax-3/#input-preprocessing

	189 //

	190 // However, we can skip this step since:

	191 // * We're using HTML spaces (which accept \r and \f as a valid white space)

	192 // * Do not count white spaces

	193 // * consumeEscape replaces NULLs for replacement characters

	194

	195 MediaQueryInputStream input(string);

	196 while (true) {

	197 outTokens.append(tokenizer.nextToken(input));

	198 if (outTokens.last().type() == EOFToken)

	199 return;

	200 }

	201 }

	202

	203 MediaQueryToken MediaQueryTokenizer::nextToken(MediaQueryInputStream& input)

	204 {

	205 // Unlike the HTMLTokenizer, the CSS Syntax spec is written

	206 // as a stateless, (fixed-size) look-ahead tokenizer.

	207 // We could move to the stateful model and instead create

	208 // states for all the "next 3 codepoints are X" cases.

	209 // State-machine tokenizers are easier to write to handle

	210 // incremental tokenization of partial sources.

	211 // However, for now we follow the spec exactly.

	212 m_input = &input;

	213 UChar cc = consume();

	214 CodePoint codePointFunc = 0;

	215

	216 if (isASCII(cc)) {

	217 ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);

	218 codePointFunc = codePoints()->codePoints[cc];

	219 } else {

	220 codePointFunc = &MediaQueryTokenizer::nameStart;

	221 }

	222

	223 if (codePointFunc)

	224 return ((this)->*(codePointFunc))(cc);

	225

	226 return MediaQueryToken(DelimiterToken, cc);

	227 }

	228

	229 // This method merges the following spec sections for efficiency

	230 // http://www.w3.org/TR/css3-syntax/#consume-a-number

	231 // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number

	232 MediaQueryToken MediaQueryTokenizer::consumeNumber()
	eseidel 2014/03/13 17:58:45 This is a really long function and it might make s This is a really long function and it might make sense to split it into helpers, even if they're local statics.
	233 {

	234 ASSERT(nextCharsAreNumber());

	235 NumericValueType type = IntegerValueType;

	236 double value = 0;

	237 int sign = 1;

	238 unsigned peekOffset = 0;

	239 int exponentSign = 1;

	240 unsigned exponentStartPos = 0;

	241 unsigned exponentEndPos = 0;

	242 unsigned fractionStartPos = 0;

	243 unsigned fractionEndPos = 0;

	244 unsigned long long integerPart;

	245 double fractionPart;

	246 unsigned fractionDigits;

	247 unsigned long long exponentPart;

	248 if (m_input->currentInputChar() == '+') {

	249 ++peekOffset;

	250 } else if (m_input->peek(peekOffset) == '-') {

	251 sign = -1;

	252 ++peekOffset;

	253 }

	254 unsigned intStartPos = peekOffset;

	255 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset);

	256 unsigned intEndPos = peekOffset;

	257 if (m_input->peek(peekOffset) == '.' && isASCIIDigit(m_input->peek(++peekOff set))) {

	258 fractionStartPos = peekOffset - 1;

	259 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset);

	260 fractionEndPos = peekOffset;

	261 }

	262 if ((m_input->peek(peekOffset) == 'E' \|\| m_input->peek(peekOffset) == 'e')) {

	263 int peekOffsetBeforeExponent = peekOffset;

	264 ++peekOffset;

	265 if (m_input->peek(peekOffset) == '+') {

	266 ++peekOffset;

	267 } else if (m_input->peek(peekOffset) =='-') {

	268 exponentSign = -1;

	269 ++peekOffset;

	270 }

	271 exponentStartPos = peekOffset;

	272 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset);

	273 exponentEndPos = peekOffset;

	274 if (exponentEndPos == exponentStartPos)

	275 peekOffset = peekOffsetBeforeExponent;

	276 }

	277 integerPart = m_input->getUInt(intStartPos, intEndPos);

	278 fractionDigits = fractionEndPos - fractionStartPos;

	279 unsigned floatingFractionEndPos = fractionEndPos;

	280 fractionPart = m_input->getDouble(fractionStartPos, floatingFractionEndPos);

	281 exponentPart = m_input->getUInt(exponentStartPos, exponentEndPos);

	282 double exponent = pow(10, (float)exponentSign * (double)exponentPart);

	283 value = (double)sign * ((double)integerPart + fractionPart) * exponent;

	284

	285 m_input->advance(peekOffset);

	286 if (fractionDigits > 0)

	287 type = NumberValueType;

	288

	289 return MediaQueryToken(NumberToken, value, type);

	290 }

	291

	292 // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token

	293 MediaQueryToken MediaQueryTokenizer::consumeNumericToken()

	294 {

	295 MediaQueryToken token = consumeNumber();

	296 if (nextCharsAreIdentifier())

	297 token.convertToDimensionWithUnit(consumeName());

	298 else if (consumeIfNext('%'))

	299 token.convertToPercentage();

	300 return token;

	301 }

	302

	303 // http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token

	304 MediaQueryToken MediaQueryTokenizer::consumeIdentLikeToken()

	305 {

	306 String name = consumeName();

	307 if (consumeIfNext('('))

	308 return MediaQueryToken(FunctionToken, name);

	309 return MediaQueryToken(IdentToken, name);

	310 }

	311

	312 void MediaQueryTokenizer::consumeUntilNonWhitespace()

	313 {

	314 // Using HTML space here rather than CSS space since we don't do preprocessi ng

	315 while (isHTMLSpace<UChar>(m_input->currentInputChar()))

	316 consume();

	317 }

	318

	319 bool MediaQueryTokenizer::consumeIfNext(UChar character)

	320 {

	321 if (m_input->currentInputChar() == character) {

	322 consume();

	323 return true;

	324 }

	325 return false;

	326 }

	327

	328 // http://www.w3.org/TR/css3-syntax/#consume-a-name

	329 String MediaQueryTokenizer::consumeName()

	330 {

	331 // FIXME: Is this as efficient as it can be?

	332 // The possibility of escape chars mandates a copy AFAICT.

	333 Vector<UChar> result;

	334 while (true) {

	335 if (isNameChar(m_input->currentInputChar())) {

	336 result.append(consume());

	337 continue;

	338 }

	339 if (nextTwoCharsAreValidEscape()) {

	340 // "consume()" fixes a spec bug.

	341 // The first code point should be consumed before consuming the esca ped code point.

	342 consume();

	343 result.append(consumeEscape());

	344 continue;

	345 }

	346 return String(result);

	347 }

	348 }

	349

	350 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point

	351 UChar MediaQueryTokenizer::consumeEscape()

	352 {

	353 UChar cc = consume();

	354 ASSERT(cc != '\n');

	355 if (isASCIIHexDigit(cc)) {

	356 unsigned consumedHexDigits = 1;

	357 String hexChars;

	358 do {

	359 hexChars.append(cc);

	360 cc = consume();

	361 consumedHexDigits++;

	362 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc));

	363 bool ok = false;

	364 UChar codePoint = hexChars.toUIntStrict(&ok, 16);

	365 if (!ok)

	366 return WTF::Unicode::replacementCharacter;

	367 return codePoint;

	368 }

	369

	370 // Replaces NULLs with replacement characters, since we do not perform prepr ocessing

	371 if (cc == kEndOfFileMarker)

	372 return WTF::Unicode::replacementCharacter;

	373 return cc;

	374 }

	375

	376 bool MediaQueryTokenizer::nextTwoCharsAreValidEscape()

	377 {

	378 if (m_input->leftChars() < 2)

	379 return false;

	380 return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2));

	381 }

	382

	383 // http://www.w3.org/TR/css3-syntax/#starts-with-a-number

	384 bool MediaQueryTokenizer::nextCharsAreNumber()

	385 {

	386 UChar first = m_input->currentInputChar();

	387 UChar second = m_input->peek(1);

	388 if (isASCIIDigit(first))

	389 return true;

	390 if (first == '+' \|\| first == '-')

	391 return ((isASCIIDigit(second)) \|\| (second == '.' && isASCIIDigit(m_input ->peek(2))));

	392 if (first =='.')

	393 return (isASCIIDigit(second));

	394 return false;

	395 }

	396

	397 // http://www.w3.org/TR/css3-syntax/#would-start-an-identifier

	398 bool MediaQueryTokenizer::nextCharsAreIdentifier()

	399 {

	400 UChar firstChar = m_input->currentInputChar();
	eseidel 2014/03/13 17:58:45 Is m_input ever null? Can we make it a reference? Is m_input ever null? Can we make it a reference?
	401 if (isNameStart(firstChar) \|\| nextTwoCharsAreValidEscape())

	402 return true;

	403

	404 if (firstChar == '-') {

	405 if (isNameStart(m_input->peek(1)))

	406 return true;

	407 return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2));

	408 }

	409

	410 return false;

	411 }

	412

	413 } // namespace WebCore

OLD	NEW