Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(61)

Side by Side Diff: Source/core/css/parser/NewCSSTokenizer.cpp

Issue 171383002: A thread-safe Media Query Parser (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Created 6 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (C) 2013 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "config.h"
32 #include "core/css/parser/NewCSSTokenizer.h"
33
34 #include "core/css/parser/CSSInputStream.h"
35 #include "core/css/parser/CSSParserIdioms.h"
36 #include "platform/text/SegmentedString.h"
37 #include "wtf/TemporaryChange.h"
38 #include "wtf/unicode/CharacterNames.h"
39
40 namespace WebCore {
41
42 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point
43 static bool isNameStart(UChar c)
44 {
45 if (isASCIIAlpha(c))
46 return true;
47 if (c == '_')
48 return true;
49 return !isASCII(c);
50 }
51
52 // http://www.w3.org/TR/css-syntax-3/#name-code-point
53 static bool isNameChar(UChar c)
54 {
55 return isNameStart(c) || isASCIIDigit(c) || c == '-';
56 }
57
58 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap e
59 static bool twoCharsAreValidEscape(UChar first, UChar second)
60 {
61 return ((first == '\\') && (second != '\n') && (second != kEndOfFileMarker)) ;
62 }
63
64 NewCSSTokenizer::NewCSSTokenizer()
65 {
66 }
67
68 void NewCSSTokenizer::reconsume(UChar c)
69 {
70 m_input->pushBack(c);
71 }
72
73 UChar NewCSSTokenizer::consume()
74 {
75 UChar current = m_input->currentInputChar();
76 m_input->advance();
77 return current;
78 }
79
80 void NewCSSTokenizer::tokenize(String string, Vector<CSSToken>& outTokens)
81 {
82 NewCSSTokenizer tokenizer;
83 CSSInputStream input(string);
84 while (true) {
85 outTokens.append(tokenizer.nextToken(input));
86 if (outTokens.last().type() == EOFToken)
87 return;
88 }
89 }
90
91 CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input)
92 {
93 // Unlike the HTMLTokenizer, the CSS Syntax spec is written
94 // as a stateless, (fixed-size) look-ahead tokenizer.
95 // We could move to the stateful model and instead create
96 // states for all the "next 3 codepoints are X" cases.
97 // State-machine tokenizers are easier to write to handle
98 // incremental tokenization of partial sources.
99 // However, for now we follow the spec exactly.
100 m_input = &input;
101 UChar cc = consume();
102
103 if (isCSSSpace(cc)) {
104 // CSS Tokenization is currently lossy, but we could record
105 // the exact whitespace instead of discarding it here.
106 consumeUntilNotWhitespace();
107 return CSSToken(WhitespaceToken);
108 }
109 if (cc == '\"' || cc == '\'')
110 return consumeStringTokenUntil(cc);
111 if (cc == '#') {
112 if (nextCharIsNameChar() || nextTwoCharsAreValidEscape()) {
113 HashTokenType hashType = UnrestrictedHashToken;
114 if (nextCharsAreIdentifier())
115 hashType = IdHashToken;
116 return CSSToken(HashToken, consumeName(), hashType);
117 }
118 return CSSToken(DelimToken, cc);
119 }
120 if (cc == '$') {
121 if (consumeIfNext('='))
122 return CSSToken(SuffixMatchToken);
123 return CSSToken(DelimToken, cc);
124 }
125 if (cc == '(')
126 return CSSToken(LeftParenToken);
127 if (cc == ')')
128 return CSSToken(RightParenToken);
129 if (cc == '*') {
130 if (consumeIfNext('='))
131 return CSSToken(SubstringMatchToken);
132 return CSSToken(DelimToken, cc);
133 }
134 if (cc == '+' || cc == '.') {
135 if (nextCharsAreNumber()) {
136 reconsume(cc);
137 return consumeNumericToken();
138 }
139 return CSSToken(DelimToken, cc);
140 }
141 if (cc == ',')
142 return CSSToken(CommaToken);
143 if (cc == '-') {
144 if (nextCharsAreNumber()) {
145 reconsume(cc);
146 return consumeNumericToken();
147 }
148 if (nextCharsAreIdentifier()) {
149 reconsume(cc);
150 return consumeIdentLikeToken();
151 }
152 if (consumeIfNext("->"))
153 return CSSToken(CDCToken);
154 return CSSToken(DelimToken, cc);
155 }
156 if (cc == '/') {
157 if (consumeIfNext('*')) {
158 consumeThroughCommentEndOrUntilEOF();
159 return nextToken(*m_input);
160 }
161 return CSSToken(DelimToken, cc);
162 }
163 if (cc == ':')
164 return CSSToken(ColonToken);
165 if (cc == ';')
166 return CSSToken(SemicolonToken);
167 if (cc == '<') {
168 if (consumeIfNext("!--"))
169 return CSSToken(CDOToken);
170 return CSSToken(DelimToken, cc);
171 }
172 if (cc == '@') {
173 if (nextCharsAreIdentifier())
174 return CSSToken(AtKeywordToken, consumeName());
175 return CSSToken(DelimToken, cc);
176 }
177 if (cc == '[')
178 return CSSToken(LeftBracketToken);
179 if (cc == '\\') {
180 if (twoCharsAreValidEscape(cc, m_input->currentInputChar())) {
181 reconsume(cc);
182 return consumeIdentLikeToken();
183 }
184 return CSSToken(DelimToken, cc);
185 }
186 if (cc == ']')
187 return CSSToken(RightBracketToken);
188 if (cc == '^') {
189 if (consumeIfNext('='))
190 return CSSToken(PrefixMatchToken);
191 return CSSToken(DelimToken, cc);
192 }
193 if (cc == '{')
194 return CSSToken(LeftBraceToken);
195 if (cc == '{')
196 return CSSToken(RightBraceToken);
197 if (isASCIIDigit(cc)) {
198 // "reconsume" here is not according to spec, but required AFAICT.
199 // https://www.w3.org/Bugs/Public/show_bug.cgi?id=24661
200 reconsume(cc);
201 return consumeNumericToken();
202 }
203 // if (cc == 'U' || cc == 'u') {
204 // // U+0055 LATIN CAPITAL LETTER U (U)
205 // // U+0075 LATIN SMALL LETTER U (u)
206 // // If the next 2 input code points are U+002B PLUS SIGN (+) followed by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. N ote: don’t consume both of them. Consume a unicode-range token and return it.
207 // // Otherwise, reconsume the current input code point, consume an iden t-like token, and return it.
208 // reconsume(cc);
209 // return consumeIdentLikeToken();
210 // }
211 if (isNameStart(cc)) {
212 reconsume(cc);
213 return consumeIdentLikeToken();
214 }
215 if (cc == '|') {
216 if (consumeIfNext('='))
217 return CSSToken(DashMatchToken);
218 if (consumeIfNext('|'))
219 return CSSToken(ColumnToken);
220 return CSSToken(DelimToken, cc);
221 }
222 if (cc == '~') {
223 if (consumeIfNext('='))
224 return CSSToken(IncludeMatchToken);
225 return CSSToken(DelimToken, cc);
226 }
227 if (cc == kEndOfFileMarker)
228 return CSSToken(EOFToken);
229 return CSSToken(DelimToken, cc);
230 }
231
232 // This method merges the following spec sections for efficiency
233 // http://www.w3.org/TR/css3-syntax/#consume-a-number
234 // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number
235 CSSToken NewCSSTokenizer::consumeNumber()
236 {
237 ASSERT(nextCharsAreNumber());
238 // FIXME - repr should get the value as a string, even though I'm not sure i t's useful
239 String repr;
240 NumericValueType type = IntegerValueType;
241 double value = 0;
242 int sign = 1;
243 unsigned peekOffset = 0;
244 int exponentSign = 1;
245 unsigned exponentStartPos = 0;
246 unsigned exponentEndPos = 0;
247 unsigned fractionStartPos = 0;
248 unsigned fractionEndPos = 0;
249 unsigned integerPart;
250 unsigned fractionPart;
251 unsigned fractionDigits;
252 unsigned exponentPart;
253 if (m_input->currentInputChar() == '+') {
254 ++peekOffset;
255 } else if (m_input->peek(peekOffset) == '-') {
256 sign = -1;
257 ++peekOffset;
258 }
259 unsigned intStartPos = peekOffset;
260 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset);
261 unsigned intEndPos = peekOffset;
262 if (m_input->peek(peekOffset) == '.' && isASCIIDigit(m_input->peek(++peekOff set))) {
263 fractionStartPos = peekOffset;
264 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset);
265 fractionEndPos = peekOffset;
266 }
267 if ((m_input->peek(peekOffset) == 'E' || m_input->peek(peekOffset) == 'e')) {
268 ++peekOffset;
269 if (m_input->peek(peekOffset) == '+') {
270 ++peekOffset;
271 } else if (m_input->peek(peekOffset) =='-') {
272 exponentSign = -1;
273 ++peekOffset;
274 }
275 exponentStartPos = peekOffset;
276 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset);
277 exponentEndPos = peekOffset;
278 }
279 integerPart = m_input->getUnsignedInt(intStartPos, intEndPos);
280 fractionPart = m_input->getUnsignedInt(fractionStartPos, fractionEndPos);
281 fractionDigits = fractionEndPos - fractionStartPos;
282 exponentPart = m_input->getUnsignedInt(exponentStartPos, exponentEndPos);
283 value = sign * (integerPart + fractionPart * pow(10, -1 * fractionDigits)) * pow(10, exponentSign * exponentPart);
284
285 m_input->advance(peekOffset);
286 // FIXME - Always returning an Integer type. Need to look at fractions, etc.
287
288 return CSSToken(NumberToken, repr, value, type);
289 }
290
291 // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token
292 CSSToken NewCSSTokenizer::consumeNumericToken()
293 {
294 CSSToken token = consumeNumber();
295 if (nextCharsAreIdentifier())
296 token.convertToDimensionWithUnit(consumeName());
297 else if (consumeIfNext('%'))
298 token.convertToPercentage();
299 return token;
300 }
301
302 // http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token
303 CSSToken NewCSSTokenizer::consumeIdentLikeToken()
304 {
305 String name = consumeName();
306 if (consumeIfNext('(')) {
307 if (equalIgnoringCase(name, "url"))
308 return consumeURLToken();
309 return CSSToken(FunctionToken, name);
310 }
311 return CSSToken(IdentToken, name);
312 }
313
314 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
315 CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
316 {
317 // FIXME: Implement.
318 return CSSToken(BadStringToken);
319 }
320
321 // http://www.w3.org/TR/css3-syntax/#consume-a-url-token
322 CSSToken NewCSSTokenizer::consumeURLToken()
323 {
324 // FIXME: Implement.
325 return CSSToken(BadURLToken);
326 }
327
328 void NewCSSTokenizer::consumeUntilNotWhitespace()
329 {
330 while (m_input->currentInputChar() == '\t' || m_input->currentInputChar() == ' ' || m_input->currentInputChar() == '\n')
331 consume();
332 }
333
334 void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF()
335 {
336 // FIXME: Implement.
337 }
338
339 bool NewCSSTokenizer::consumeIfNext(UChar character)
340 {
341 return (m_input->currentInputChar() == character);
342 }
343
344 bool NewCSSTokenizer::consumeIfNext(String str)
345 {
346 for (unsigned i = 0; i < str.length(); ++i) {
347 if (str[i] != m_input->peek(i))
348 return false;
349 }
350 return true;
351 }
352
353 // http://www.w3.org/TR/css3-syntax/#consume-a-name
354 String NewCSSTokenizer::consumeName()
355 {
356 // FIXME: This is written to match the spec
357 // but could be much more efficient.
358 String result("");
359 while (true) {
360 if (isNameChar(m_input->currentInputChar())) {
361 result.append(consume());
362 continue;
363 }
364 if (nextTwoCharsAreValidEscape()) {
365 consume(); // SPEC BUG: Emailed Tab.
366 result.append(consumeEscape());
367 continue;
368 }
369 return result;
370 }
371 }
372
373 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
374 UChar NewCSSTokenizer::consumeEscape()
375 {
376 UChar cc = consume();
377 ASSERT(cc != '\n');
378 if (isASCIIHexDigit(cc)) {
379 unsigned consumedHexDigits = 1;
380 String hexChars;
381 do {
382 hexChars.append(cc);
383 cc = consume();
384 consumedHexDigits++;
385 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc));
386 bool ok = false;
387 UChar codePoint = hexChars.toUIntStrict(&ok, 16);
388 if (!ok)
389 return WTF::Unicode::replacementCharacter;
390 return codePoint;
391 }
392 if (cc == kEndOfFileMarker)
393 return WTF::Unicode::replacementCharacter;
394 return cc;
395 }
396
397 bool NewCSSTokenizer::nextCharIsNameChar()
398 {
399 return isNameChar(m_input->currentInputChar());
400 }
401
402 bool NewCSSTokenizer::nextTwoCharsAreValidEscape()
403 {
404 return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2));
405 }
406
407 // http://www.w3.org/TR/css3-syntax/#starts-with-a-number
408 bool NewCSSTokenizer::nextCharsAreNumber()
409 {
410 UChar first = m_input->currentInputChar();
411 UChar second = m_input->peek(1);
412 if (isASCIIDigit(first))
413 return true;
414 if (first == '+' || first == '-')
415 return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input ->peek(2))));
416 if (first =='.')
417 return (isASCIIDigit(second));
418 return false;
419 }
420
421 // http://www.w3.org/TR/css3-syntax/#would-start-an-identifier
422 bool NewCSSTokenizer::nextCharsAreIdentifier()
423 {
424 UChar firstChar = m_input->currentInputChar();
425 if (isNameStart(firstChar) || nextTwoCharsAreValidEscape())
426 return true;
427
428 if (firstChar == '-') {
429 if (isNameStart(m_input->peek(1)))
430 return true;
431 return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2));
432 }
433
434 return false;
435 }
436
437 } // namespace WebCore
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698