OLD | NEW |
| (Empty) |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "config.h" | |
6 #include "core/css/parser/MediaQueryTokenizer.h" | |
7 | |
8 #include "core/css/parser/MediaQueryInputStream.h" | |
9 #include "core/html/parser/HTMLParserIdioms.h" | |
10 #include "wtf/unicode/CharacterNames.h" | |
11 | |
12 namespace WebCore { | |
13 | |
// Number of slots in the ASCII dispatch table. ASCII spans [0, 127], so the
// table needs SCHAR_MAX + 1 entries; with only SCHAR_MAX entries U+007F
// (DELETE) would index one past the end of the array.
const unsigned codePointsNumber = SCHAR_MAX + 1;
15 | |
16 class MediaQueryTokenizer::CodePoints { | |
17 public: | |
18 MediaQueryTokenizer::CodePoint codePoints[codePointsNumber]; | |
19 | |
20 // FIXME: Move the codePoint array to be a static one, generated by build sc
ripts | |
21 CodePoints() | |
22 { | |
23 memset(codePoints, 0, codePointsNumber); | |
24 codePoints['\n'] = &MediaQueryTokenizer::whiteSpace; | |
25 codePoints['\r'] = &MediaQueryTokenizer::whiteSpace; | |
26 codePoints['\t'] = &MediaQueryTokenizer::whiteSpace; | |
27 codePoints[' '] = &MediaQueryTokenizer::whiteSpace; | |
28 codePoints['\f'] = &MediaQueryTokenizer::whiteSpace; | |
29 codePoints['('] = &MediaQueryTokenizer::leftParenthesis; | |
30 codePoints[')'] = &MediaQueryTokenizer::rightParenthesis; | |
31 codePoints['+'] = &MediaQueryTokenizer::plusOrFullStop; | |
32 codePoints['.'] = &MediaQueryTokenizer::plusOrFullStop; | |
33 codePoints[','] = &MediaQueryTokenizer::comma; | |
34 codePoints['-'] = &MediaQueryTokenizer::hyphenMinus; | |
35 codePoints['/'] = &MediaQueryTokenizer::solidus; | |
36 codePoints[':'] = &MediaQueryTokenizer::colon; | |
37 codePoints[';'] = &MediaQueryTokenizer::semiColon; | |
38 codePoints['\\'] = &MediaQueryTokenizer::reverseSolidus; | |
39 for (unsigned char digit = '0'; digit <= '9'; ++digit) | |
40 codePoints[digit] = &MediaQueryTokenizer::asciiDigit; | |
41 for (unsigned char alpha = 'a'; alpha <= 'z'; ++alpha) | |
42 codePoints[alpha] = &MediaQueryTokenizer::nameStart; | |
43 for (unsigned char alpha = 'A'; alpha <= 'Z'; ++alpha) | |
44 codePoints[alpha] = &MediaQueryTokenizer::nameStart; | |
45 codePoints['_'] = &MediaQueryTokenizer::nameStart; | |
46 codePoints[kEndOfFileMarker] = &MediaQueryTokenizer::endOfFile; | |
47 } | |
48 }; | |
49 | |
50 MediaQueryTokenizer::CodePoints* MediaQueryTokenizer::codePoints() | |
51 { | |
52 static CodePoints codePoints; | |
53 return &codePoints; | |
54 } | |
55 | |
56 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point | |
57 static bool isNameStart(UChar c) | |
58 { | |
59 if (isASCIIAlpha(c)) | |
60 return true; | |
61 if (c == '_') | |
62 return true; | |
63 return !isASCII(c); | |
64 } | |
65 | |
66 // http://www.w3.org/TR/css-syntax-3/#name-code-point | |
67 static bool isNameChar(UChar c) | |
68 { | |
69 return isNameStart(c) || isASCIIDigit(c) || c == '-'; | |
70 } | |
71 | |
72 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap
e | |
73 static bool twoCharsAreValidEscape(UChar first, UChar second) | |
74 { | |
75 return ((first == '\\') && (second != '\n') && (second != kEndOfFileMarker))
; | |
76 } | |
77 | |
78 MediaQueryTokenizer::MediaQueryTokenizer(MediaQueryInputStream& inputStream) | |
79 : m_input(inputStream) | |
80 { | |
81 } | |
82 | |
83 void MediaQueryTokenizer::reconsume(UChar c) | |
84 { | |
85 m_input.pushBack(c); | |
86 } | |
87 | |
88 UChar MediaQueryTokenizer::consume() | |
89 { | |
90 UChar current = m_input.currentInputChar(); | |
91 m_input.advance(); | |
92 return current; | |
93 } | |
94 | |
95 void MediaQueryTokenizer::consume(unsigned offset) | |
96 { | |
97 m_input.advance(offset); | |
98 } | |
99 | |
100 MediaQueryToken MediaQueryTokenizer::whiteSpace(UChar cc) | |
101 { | |
102 // CSS Tokenization is currently lossy, but we could record | |
103 // the exact whitespace instead of discarding it here. | |
104 consumeUntilNonWhitespace(); | |
105 return MediaQueryToken(WhitespaceToken); | |
106 } | |
107 | |
108 MediaQueryToken MediaQueryTokenizer::leftParenthesis(UChar cc) | |
109 { | |
110 return MediaQueryToken(LeftParenthesisToken); | |
111 } | |
112 | |
113 MediaQueryToken MediaQueryTokenizer::rightParenthesis(UChar cc) | |
114 { | |
115 return MediaQueryToken(RightParenthesisToken); | |
116 } | |
117 | |
118 MediaQueryToken MediaQueryTokenizer::plusOrFullStop(UChar cc) | |
119 { | |
120 if (nextCharsAreNumber()) { | |
121 reconsume(cc); | |
122 return consumeNumericToken(); | |
123 } | |
124 return MediaQueryToken(DelimiterToken, cc); | |
125 } | |
126 | |
127 MediaQueryToken MediaQueryTokenizer::comma(UChar cc) | |
128 { | |
129 return MediaQueryToken(CommaToken); | |
130 } | |
131 | |
132 MediaQueryToken MediaQueryTokenizer::hyphenMinus(UChar cc) | |
133 { | |
134 if (nextCharsAreNumber()) { | |
135 reconsume(cc); | |
136 return consumeNumericToken(); | |
137 } | |
138 if (nextCharsAreIdentifier()) { | |
139 reconsume(cc); | |
140 return consumeIdentLikeToken(); | |
141 } | |
142 return MediaQueryToken(DelimiterToken, cc); | |
143 } | |
144 | |
145 MediaQueryToken MediaQueryTokenizer::solidus(UChar cc) | |
146 { | |
147 return MediaQueryToken(DelimiterToken, cc); | |
148 } | |
149 | |
150 MediaQueryToken MediaQueryTokenizer::colon(UChar cc) | |
151 { | |
152 return MediaQueryToken(ColonToken); | |
153 } | |
154 | |
155 MediaQueryToken MediaQueryTokenizer::semiColon(UChar cc) | |
156 { | |
157 return MediaQueryToken(SemicolonToken); | |
158 } | |
159 | |
160 MediaQueryToken MediaQueryTokenizer::reverseSolidus(UChar cc) | |
161 { | |
162 if (twoCharsAreValidEscape(cc, m_input.currentInputChar())) { | |
163 reconsume(cc); | |
164 return consumeIdentLikeToken(); | |
165 } | |
166 return MediaQueryToken(DelimiterToken, cc); | |
167 } | |
168 | |
169 MediaQueryToken MediaQueryTokenizer::asciiDigit(UChar cc) | |
170 { | |
171 reconsume(cc); | |
172 return consumeNumericToken(); | |
173 } | |
174 | |
175 MediaQueryToken MediaQueryTokenizer::nameStart(UChar cc) | |
176 { | |
177 reconsume(cc); | |
178 return consumeIdentLikeToken(); | |
179 } | |
180 | |
181 MediaQueryToken MediaQueryTokenizer::endOfFile(UChar cc) | |
182 { | |
183 return MediaQueryToken(EOFToken); | |
184 } | |
185 | |
186 void MediaQueryTokenizer::tokenize(String string, Vector<MediaQueryToken>& outTo
kens) | |
187 { | |
188 // According to the spec, we should perform preprocessing here. | |
189 // See: http://www.w3.org/TR/css-syntax-3/#input-preprocessing | |
190 // | |
191 // However, we can skip this step since: | |
192 // * We're using HTML spaces (which accept \r and \f as a valid white space) | |
193 // * Do not count white spaces | |
194 // * consumeEscape replaces NULLs for replacement characters | |
195 | |
196 MediaQueryInputStream input(string); | |
197 MediaQueryTokenizer tokenizer(input); | |
198 while (true) { | |
199 outTokens.append(tokenizer.nextToken()); | |
200 if (outTokens.last().type() == EOFToken) | |
201 return; | |
202 } | |
203 } | |
204 | |
205 MediaQueryToken MediaQueryTokenizer::nextToken() | |
206 { | |
207 // Unlike the HTMLTokenizer, the CSS Syntax spec is written | |
208 // as a stateless, (fixed-size) look-ahead tokenizer. | |
209 // We could move to the stateful model and instead create | |
210 // states for all the "next 3 codepoints are X" cases. | |
211 // State-machine tokenizers are easier to write to handle | |
212 // incremental tokenization of partial sources. | |
213 // However, for now we follow the spec exactly. | |
214 UChar cc = consume(); | |
215 CodePoint codePointFunc = 0; | |
216 | |
217 if (isASCII(cc)) { | |
218 ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber); | |
219 codePointFunc = codePoints()->codePoints[cc]; | |
220 } else { | |
221 codePointFunc = &MediaQueryTokenizer::nameStart; | |
222 } | |
223 | |
224 if (codePointFunc) | |
225 return ((this)->*(codePointFunc))(cc); | |
226 | |
227 return MediaQueryToken(DelimiterToken, cc); | |
228 } | |
229 | |
230 static int getSign(MediaQueryInputStream& input, unsigned& offset) | |
231 { | |
232 int sign = 1; | |
233 if (input.currentInputChar() == '+') { | |
234 ++offset; | |
235 } else if (input.peek(offset) == '-') { | |
236 sign = -1; | |
237 ++offset; | |
238 } | |
239 return sign; | |
240 } | |
241 | |
242 static unsigned long long getInteger(MediaQueryInputStream& input, unsigned& off
set) | |
243 { | |
244 unsigned intStartPos = offset; | |
245 offset = input.skipWhilePredicate<isASCIIDigit>(offset); | |
246 unsigned intEndPos = offset; | |
247 return input.getUInt(intStartPos, intEndPos); | |
248 } | |
249 | |
250 static double getFraction(MediaQueryInputStream& input, unsigned& offset, unsign
ed& digitsNumber) | |
251 { | |
252 unsigned fractionStartPos = 0; | |
253 unsigned fractionEndPos = 0; | |
254 if (input.peek(offset) == '.' && isASCIIDigit(input.peek(++offset))) { | |
255 fractionStartPos = offset - 1; | |
256 offset = input.skipWhilePredicate<isASCIIDigit>(offset); | |
257 fractionEndPos = offset; | |
258 } | |
259 digitsNumber = fractionEndPos- fractionStartPos; | |
260 return input.getDouble(fractionStartPos, fractionEndPos); | |
261 } | |
262 | |
263 static unsigned long long getExponent(MediaQueryInputStream& input, unsigned& of
fset, int sign) | |
264 { | |
265 unsigned exponentStartPos = 0; | |
266 unsigned exponentEndPos = 0; | |
267 if ((input.peek(offset) == 'E' || input.peek(offset) == 'e')) { | |
268 int offsetBeforeExponent = offset; | |
269 ++offset; | |
270 if (input.peek(offset) == '+') { | |
271 ++offset; | |
272 } else if (input.peek(offset) =='-') { | |
273 sign = -1; | |
274 ++offset; | |
275 } | |
276 exponentStartPos = offset; | |
277 offset = input.skipWhilePredicate<isASCIIDigit>(offset); | |
278 exponentEndPos = offset; | |
279 if (exponentEndPos == exponentStartPos) | |
280 offset = offsetBeforeExponent; | |
281 } | |
282 return input.getUInt(exponentStartPos, exponentEndPos); | |
283 } | |
284 | |
285 // This method merges the following spec sections for efficiency | |
286 // http://www.w3.org/TR/css3-syntax/#consume-a-number | |
287 // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number | |
288 MediaQueryToken MediaQueryTokenizer::consumeNumber() | |
289 { | |
290 ASSERT(nextCharsAreNumber()); | |
291 NumericValueType type = IntegerValueType; | |
292 double value = 0; | |
293 unsigned offset = 0; | |
294 int exponentSign = 1; | |
295 unsigned fractionDigits; | |
296 int sign = getSign(m_input, offset); | |
297 unsigned long long integerPart = getInteger(m_input, offset); | |
298 double fractionPart = getFraction(m_input, offset, fractionDigits); | |
299 unsigned long long exponentPart = getExponent(m_input, offset, exponentSign)
; | |
300 double exponent = pow(10, (float)exponentSign * (double)exponentPart); | |
301 value = (double)sign * ((double)integerPart + fractionPart) * exponent; | |
302 | |
303 m_input.advance(offset); | |
304 if (fractionDigits > 0) | |
305 type = NumberValueType; | |
306 | |
307 return MediaQueryToken(NumberToken, value, type); | |
308 } | |
309 | |
310 // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token | |
311 MediaQueryToken MediaQueryTokenizer::consumeNumericToken() | |
312 { | |
313 MediaQueryToken token = consumeNumber(); | |
314 if (nextCharsAreIdentifier()) | |
315 token.convertToDimensionWithUnit(consumeName()); | |
316 else if (consumeIfNext('%')) | |
317 token.convertToPercentage(); | |
318 return token; | |
319 } | |
320 | |
321 // http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token | |
322 MediaQueryToken MediaQueryTokenizer::consumeIdentLikeToken() | |
323 { | |
324 String name = consumeName(); | |
325 if (consumeIfNext('(')) | |
326 return MediaQueryToken(FunctionToken, name); | |
327 return MediaQueryToken(IdentToken, name); | |
328 } | |
329 | |
330 void MediaQueryTokenizer::consumeUntilNonWhitespace() | |
331 { | |
332 // Using HTML space here rather than CSS space since we don't do preprocessi
ng | |
333 while (isHTMLSpace<UChar>(m_input.currentInputChar())) | |
334 consume(); | |
335 } | |
336 | |
337 bool MediaQueryTokenizer::consumeIfNext(UChar character) | |
338 { | |
339 if (m_input.currentInputChar() == character) { | |
340 consume(); | |
341 return true; | |
342 } | |
343 return false; | |
344 } | |
345 | |
346 // http://www.w3.org/TR/css3-syntax/#consume-a-name | |
347 String MediaQueryTokenizer::consumeName() | |
348 { | |
349 // FIXME: Is this as efficient as it can be? | |
350 // The possibility of escape chars mandates a copy AFAICT. | |
351 Vector<UChar> result; | |
352 while (true) { | |
353 if (isNameChar(m_input.currentInputChar())) { | |
354 result.append(consume()); | |
355 continue; | |
356 } | |
357 if (nextTwoCharsAreValidEscape()) { | |
358 // "consume()" fixes a spec bug. | |
359 // The first code point should be consumed before consuming the esca
ped code point. | |
360 consume(); | |
361 result.append(consumeEscape()); | |
362 continue; | |
363 } | |
364 return String(result); | |
365 } | |
366 } | |
367 | |
368 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point | |
369 UChar MediaQueryTokenizer::consumeEscape() | |
370 { | |
371 UChar cc = consume(); | |
372 ASSERT(cc != '\n'); | |
373 if (isASCIIHexDigit(cc)) { | |
374 unsigned consumedHexDigits = 1; | |
375 String hexChars; | |
376 do { | |
377 hexChars.append(cc); | |
378 cc = consume(); | |
379 consumedHexDigits++; | |
380 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc)); | |
381 bool ok = false; | |
382 UChar codePoint = hexChars.toUIntStrict(&ok, 16); | |
383 if (!ok) | |
384 return WTF::Unicode::replacementCharacter; | |
385 return codePoint; | |
386 } | |
387 | |
388 // Replaces NULLs with replacement characters, since we do not perform prepr
ocessing | |
389 if (cc == kEndOfFileMarker) | |
390 return WTF::Unicode::replacementCharacter; | |
391 return cc; | |
392 } | |
393 | |
394 bool MediaQueryTokenizer::nextTwoCharsAreValidEscape() | |
395 { | |
396 if (m_input.leftChars() < 2) | |
397 return false; | |
398 return twoCharsAreValidEscape(m_input.peek(1), m_input.peek(2)); | |
399 } | |
400 | |
401 // http://www.w3.org/TR/css3-syntax/#starts-with-a-number | |
402 bool MediaQueryTokenizer::nextCharsAreNumber() | |
403 { | |
404 UChar first = m_input.currentInputChar(); | |
405 UChar second = m_input.peek(1); | |
406 if (isASCIIDigit(first)) | |
407 return true; | |
408 if (first == '+' || first == '-') | |
409 return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input
.peek(2)))); | |
410 if (first =='.') | |
411 return (isASCIIDigit(second)); | |
412 return false; | |
413 } | |
414 | |
415 // http://www.w3.org/TR/css3-syntax/#would-start-an-identifier | |
416 bool MediaQueryTokenizer::nextCharsAreIdentifier() | |
417 { | |
418 UChar firstChar = m_input.currentInputChar(); | |
419 if (isNameStart(firstChar) || nextTwoCharsAreValidEscape()) | |
420 return true; | |
421 | |
422 if (firstChar == '-') { | |
423 if (isNameStart(m_input.peek(1))) | |
424 return true; | |
425 return nextTwoCharsAreValidEscape(); | |
426 } | |
427 | |
428 return false; | |
429 } | |
430 | |
431 } // namespace WebCore | |
OLD | NEW |