OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 #include "config.h" | |
6 #include "core/css/parser/MediaQueryTokenizer.h" | |
7 | |
8 #include "core/css/parser/MediaQueryInputStream.h" | |
9 #include "core/html/parser/HTMLParserIdioms.h" | |
10 #include "wtf/unicode/CharacterNames.h" | |
11 | |
12 namespace WebCore { | |
13 | |
// Number of entries in the ASCII dispatch table. ASCII code points span
// 0..SCHAR_MAX inclusive, so the table needs SCHAR_MAX + 1 slots; with only
// SCHAR_MAX slots, code point 0x7F would index one past the end.
const unsigned codePointsNumber = SCHAR_MAX + 1;
15 | |
16 class MediaQueryTokenizer::CodePoints { | |
17 public: | |
18 MediaQueryTokenizer::CodePoint codePoints[codePointsNumber]; | |
19 | |
20 CodePoints() | |
21 { | |
22 memset(codePoints, 0, codePointsNumber); | |
23 codePoints['\n'] = &MediaQueryTokenizer::whiteSpace; | |
24 codePoints['\r'] = &MediaQueryTokenizer::whiteSpace; | |
25 codePoints['\t'] = &MediaQueryTokenizer::whiteSpace; | |
26 codePoints[' '] = &MediaQueryTokenizer::whiteSpace; | |
27 codePoints['\f'] = &MediaQueryTokenizer::whiteSpace; | |
28 codePoints['('] = &MediaQueryTokenizer::leftParenthesis; | |
29 codePoints[')'] = &MediaQueryTokenizer::rightParenthesis; | |
30 codePoints['+'] = &MediaQueryTokenizer::plusOrFullStop; | |
31 codePoints['.'] = &MediaQueryTokenizer::plusOrFullStop; | |
32 codePoints[','] = &MediaQueryTokenizer::comma; | |
33 codePoints['-'] = &MediaQueryTokenizer::hyphenMinus; | |
34 codePoints['/'] = &MediaQueryTokenizer::solidus; | |
35 codePoints[':'] = &MediaQueryTokenizer::colon; | |
36 codePoints[';'] = &MediaQueryTokenizer::semiColon; | |
37 codePoints['\\'] = &MediaQueryTokenizer::reverseSolidus; | |
38 for (unsigned char digit = '0'; digit <= '9'; ++digit) | |
39 codePoints[digit] = &MediaQueryTokenizer::asciiDigit; | |
40 for (unsigned char alpha = 'a'; alpha <= 'z'; ++alpha) | |
41 codePoints[alpha] = &MediaQueryTokenizer::nameStart; | |
42 for (unsigned char alpha = 'A'; alpha <= 'Z'; ++alpha) | |
43 codePoints[alpha] = &MediaQueryTokenizer::nameStart; | |
44 codePoints['_'] = &MediaQueryTokenizer::nameStart; | |
45 codePoints[kEndOfFileMarker] = &MediaQueryTokenizer::endOfFile; | |
46 } | |
47 }; | |
48 | |
49 MediaQueryTokenizer::CodePoints* MediaQueryTokenizer::codePoints() | |
50 { | |
51 static CodePoints codePoints; | |
52 return &codePoints; | |
53 } | |
54 | |
55 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point | |
56 static bool isNameStart(UChar c) | |
57 { | |
58 if (isASCIIAlpha(c)) | |
59 return true; | |
60 if (c == '_') | |
61 return true; | |
62 return !isASCII(c); | |
63 } | |
64 | |
65 // http://www.w3.org/TR/css-syntax-3/#name-code-point | |
66 static bool isNameChar(UChar c) | |
67 { | |
68 return isNameStart(c) || isASCIIDigit(c) || c == '-'; | |
69 } | |
70 | |
71 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap e | |
72 static bool twoCharsAreValidEscape(UChar first, UChar second) | |
73 { | |
74 return ((first == '\\') && (second != '\n') && (second != kEndOfFileMarker)) ; | |
75 } | |
76 | |
77 MediaQueryTokenizer::MediaQueryTokenizer() | |
78 { | |
79 } | |
80 | |
81 void MediaQueryTokenizer::reconsume(UChar c) | |
82 { | |
83 m_input->pushBack(c); | |
84 } | |
85 | |
86 UChar MediaQueryTokenizer::consume() | |
87 { | |
88 UChar current = m_input->currentInputChar(); | |
89 m_input->advance(); | |
90 return current; | |
91 } | |
92 | |
93 void MediaQueryTokenizer::consume(unsigned offset) | |
94 { | |
95 m_input->advance(offset); | |
96 } | |
97 | |
98 MediaQueryToken MediaQueryTokenizer::whiteSpace(UChar cc) | |
99 { | |
100 // CSS Tokenization is currently lossy, but we could record | |
101 // the exact whitespace instead of discarding it here. | |
102 consumeUntilNonWhitespace(); | |
103 return MediaQueryToken(WhitespaceToken); | |
104 } | |
105 | |
106 MediaQueryToken MediaQueryTokenizer::leftParenthesis(UChar cc) | |
107 { | |
108 return MediaQueryToken(LeftParenthesisToken); | |
109 } | |
110 | |
111 MediaQueryToken MediaQueryTokenizer::rightParenthesis(UChar cc) | |
112 { | |
113 return MediaQueryToken(RightParenthesisToken); | |
114 } | |
115 | |
116 MediaQueryToken MediaQueryTokenizer::plusOrFullStop(UChar cc) | |
117 { | |
118 if (nextCharsAreNumber()) { | |
119 reconsume(cc); | |
120 return consumeNumericToken(); | |
121 } | |
122 return MediaQueryToken(DelimiterToken, cc); | |
123 } | |
124 | |
125 MediaQueryToken MediaQueryTokenizer::comma(UChar cc) | |
126 { | |
127 return MediaQueryToken(CommaToken); | |
128 } | |
129 | |
130 MediaQueryToken MediaQueryTokenizer::hyphenMinus(UChar cc) | |
131 { | |
132 if (nextCharsAreNumber()) { | |
133 reconsume(cc); | |
134 return consumeNumericToken(); | |
135 } | |
136 if (nextCharsAreIdentifier()) { | |
137 reconsume(cc); | |
138 return consumeIdentLikeToken(); | |
139 } | |
140 return MediaQueryToken(DelimiterToken, cc); | |
141 } | |
142 | |
143 MediaQueryToken MediaQueryTokenizer::solidus(UChar cc) | |
144 { | |
145 return MediaQueryToken(DelimiterToken, cc); | |
146 } | |
147 | |
148 MediaQueryToken MediaQueryTokenizer::colon(UChar cc) | |
149 { | |
150 return MediaQueryToken(ColonToken); | |
151 } | |
152 | |
153 MediaQueryToken MediaQueryTokenizer::semiColon(UChar cc) | |
154 { | |
155 return MediaQueryToken(SemicolonToken); | |
156 } | |
157 | |
158 MediaQueryToken MediaQueryTokenizer::reverseSolidus(UChar cc) | |
159 { | |
160 if (twoCharsAreValidEscape(cc, m_input->currentInputChar())) { | |
161 reconsume(cc); | |
162 return consumeIdentLikeToken(); | |
163 } | |
164 return MediaQueryToken(DelimiterToken, cc); | |
165 } | |
166 | |
167 MediaQueryToken MediaQueryTokenizer::asciiDigit(UChar cc) | |
168 { | |
169 reconsume(cc); | |
170 return consumeNumericToken(); | |
171 } | |
172 | |
173 MediaQueryToken MediaQueryTokenizer::nameStart(UChar cc) | |
174 { | |
175 reconsume(cc); | |
176 return consumeIdentLikeToken(); | |
177 } | |
178 | |
179 MediaQueryToken MediaQueryTokenizer::endOfFile(UChar cc) | |
180 { | |
181 return MediaQueryToken(EOFToken); | |
182 } | |
183 | |
184 void MediaQueryTokenizer::tokenize(String string, Vector<MediaQueryToken>& outTo kens) | |
185 { | |
186 MediaQueryTokenizer tokenizer; | |
187 // According to the spec, we should perform preprocessing here. | |
188 // See: http://www.w3.org/TR/css-syntax-3/#input-preprocessing | |
189 // | |
190 // However, we can skip this step since: | |
191 // * We're using HTML spaces (which accept \r and \f as a valid white space) | |
192 // * Do not count white spaces | |
193 // * consumeEscape replaces NULLs for replacement characters | |
194 | |
195 MediaQueryInputStream input(string); | |
196 while (true) { | |
197 outTokens.append(tokenizer.nextToken(input)); | |
198 if (outTokens.last().type() == EOFToken) | |
199 return; | |
200 } | |
201 } | |
202 | |
203 MediaQueryToken MediaQueryTokenizer::nextToken(MediaQueryInputStream& input) | |
204 { | |
205 // Unlike the HTMLTokenizer, the CSS Syntax spec is written | |
206 // as a stateless, (fixed-size) look-ahead tokenizer. | |
207 // We could move to the stateful model and instead create | |
208 // states for all the "next 3 codepoints are X" cases. | |
209 // State-machine tokenizers are easier to write to handle | |
210 // incremental tokenization of partial sources. | |
211 // However, for now we follow the spec exactly. | |
212 m_input = &input; | |
213 UChar cc = consume(); | |
214 CodePoint codePointFunc = 0; | |
215 | |
216 if (isASCII(cc)) { | |
217 ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber); | |
218 codePointFunc = codePoints()->codePoints[cc]; | |
219 } else { | |
220 codePointFunc = &MediaQueryTokenizer::nameStart; | |
221 } | |
222 | |
223 if (codePointFunc) | |
224 return ((this)->*(codePointFunc))(cc); | |
225 | |
226 return MediaQueryToken(DelimiterToken, cc); | |
227 } | |
228 | |
229 // This method merges the following spec sections for efficiency | |
230 // http://www.w3.org/TR/css3-syntax/#consume-a-number | |
231 // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number | |
232 MediaQueryToken MediaQueryTokenizer::consumeNumber() | |
eseidel
2014/03/13 17:58:45
This is a really long function and it might make s
| |
233 { | |
234 ASSERT(nextCharsAreNumber()); | |
235 NumericValueType type = IntegerValueType; | |
236 double value = 0; | |
237 int sign = 1; | |
238 unsigned peekOffset = 0; | |
239 int exponentSign = 1; | |
240 unsigned exponentStartPos = 0; | |
241 unsigned exponentEndPos = 0; | |
242 unsigned fractionStartPos = 0; | |
243 unsigned fractionEndPos = 0; | |
244 unsigned long long integerPart; | |
245 double fractionPart; | |
246 unsigned fractionDigits; | |
247 unsigned long long exponentPart; | |
248 if (m_input->currentInputChar() == '+') { | |
249 ++peekOffset; | |
250 } else if (m_input->peek(peekOffset) == '-') { | |
251 sign = -1; | |
252 ++peekOffset; | |
253 } | |
254 unsigned intStartPos = peekOffset; | |
255 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset); | |
256 unsigned intEndPos = peekOffset; | |
257 if (m_input->peek(peekOffset) == '.' && isASCIIDigit(m_input->peek(++peekOff set))) { | |
258 fractionStartPos = peekOffset - 1; | |
259 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset); | |
260 fractionEndPos = peekOffset; | |
261 } | |
262 if ((m_input->peek(peekOffset) == 'E' || m_input->peek(peekOffset) == 'e')) { | |
263 int peekOffsetBeforeExponent = peekOffset; | |
264 ++peekOffset; | |
265 if (m_input->peek(peekOffset) == '+') { | |
266 ++peekOffset; | |
267 } else if (m_input->peek(peekOffset) =='-') { | |
268 exponentSign = -1; | |
269 ++peekOffset; | |
270 } | |
271 exponentStartPos = peekOffset; | |
272 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset); | |
273 exponentEndPos = peekOffset; | |
274 if (exponentEndPos == exponentStartPos) | |
275 peekOffset = peekOffsetBeforeExponent; | |
276 } | |
277 integerPart = m_input->getUInt(intStartPos, intEndPos); | |
278 fractionDigits = fractionEndPos - fractionStartPos; | |
279 unsigned floatingFractionEndPos = fractionEndPos; | |
280 fractionPart = m_input->getDouble(fractionStartPos, floatingFractionEndPos); | |
281 exponentPart = m_input->getUInt(exponentStartPos, exponentEndPos); | |
282 double exponent = pow(10, (float)exponentSign * (double)exponentPart); | |
283 value = (double)sign * ((double)integerPart + fractionPart) * exponent; | |
284 | |
285 m_input->advance(peekOffset); | |
286 if (fractionDigits > 0) | |
287 type = NumberValueType; | |
288 | |
289 return MediaQueryToken(NumberToken, value, type); | |
290 } | |
291 | |
292 // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token | |
293 MediaQueryToken MediaQueryTokenizer::consumeNumericToken() | |
294 { | |
295 MediaQueryToken token = consumeNumber(); | |
296 if (nextCharsAreIdentifier()) | |
297 token.convertToDimensionWithUnit(consumeName()); | |
298 else if (consumeIfNext('%')) | |
299 token.convertToPercentage(); | |
300 return token; | |
301 } | |
302 | |
303 // http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token | |
304 MediaQueryToken MediaQueryTokenizer::consumeIdentLikeToken() | |
305 { | |
306 String name = consumeName(); | |
307 if (consumeIfNext('(')) | |
308 return MediaQueryToken(FunctionToken, name); | |
309 return MediaQueryToken(IdentToken, name); | |
310 } | |
311 | |
312 void MediaQueryTokenizer::consumeUntilNonWhitespace() | |
313 { | |
314 // Using HTML space here rather than CSS space since we don't do preprocessi ng | |
315 while (isHTMLSpace<UChar>(m_input->currentInputChar())) | |
316 consume(); | |
317 } | |
318 | |
319 bool MediaQueryTokenizer::consumeIfNext(UChar character) | |
320 { | |
321 if (m_input->currentInputChar() == character) { | |
322 consume(); | |
323 return true; | |
324 } | |
325 return false; | |
326 } | |
327 | |
328 // http://www.w3.org/TR/css3-syntax/#consume-a-name | |
329 String MediaQueryTokenizer::consumeName() | |
330 { | |
331 // FIXME: Is this as efficient as it can be? | |
332 // The possibility of escape chars mandates a copy AFAICT. | |
333 Vector<UChar> result; | |
334 while (true) { | |
335 if (isNameChar(m_input->currentInputChar())) { | |
336 result.append(consume()); | |
337 continue; | |
338 } | |
339 if (nextTwoCharsAreValidEscape()) { | |
340 // "consume()" fixes a spec bug. | |
341 // The first code point should be consumed before consuming the esca ped code point. | |
342 consume(); | |
343 result.append(consumeEscape()); | |
344 continue; | |
345 } | |
346 return String(result); | |
347 } | |
348 } | |
349 | |
350 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point | |
351 UChar MediaQueryTokenizer::consumeEscape() | |
352 { | |
353 UChar cc = consume(); | |
354 ASSERT(cc != '\n'); | |
355 if (isASCIIHexDigit(cc)) { | |
356 unsigned consumedHexDigits = 1; | |
357 String hexChars; | |
358 do { | |
359 hexChars.append(cc); | |
360 cc = consume(); | |
361 consumedHexDigits++; | |
362 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc)); | |
363 bool ok = false; | |
364 UChar codePoint = hexChars.toUIntStrict(&ok, 16); | |
365 if (!ok) | |
366 return WTF::Unicode::replacementCharacter; | |
367 return codePoint; | |
368 } | |
369 | |
370 // Replaces NULLs with replacement characters, since we do not perform prepr ocessing | |
371 if (cc == kEndOfFileMarker) | |
372 return WTF::Unicode::replacementCharacter; | |
373 return cc; | |
374 } | |
375 | |
376 bool MediaQueryTokenizer::nextTwoCharsAreValidEscape() | |
377 { | |
378 if (m_input->leftChars() < 2) | |
379 return false; | |
380 return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2)); | |
381 } | |
382 | |
383 // http://www.w3.org/TR/css3-syntax/#starts-with-a-number | |
384 bool MediaQueryTokenizer::nextCharsAreNumber() | |
385 { | |
386 UChar first = m_input->currentInputChar(); | |
387 UChar second = m_input->peek(1); | |
388 if (isASCIIDigit(first)) | |
389 return true; | |
390 if (first == '+' || first == '-') | |
391 return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input ->peek(2)))); | |
392 if (first =='.') | |
393 return (isASCIIDigit(second)); | |
394 return false; | |
395 } | |
396 | |
397 // http://www.w3.org/TR/css3-syntax/#would-start-an-identifier | |
398 bool MediaQueryTokenizer::nextCharsAreIdentifier() | |
399 { | |
400 UChar firstChar = m_input->currentInputChar(); | |
eseidel
2014/03/13 17:58:45
Is m_input ever null? Can we make it a reference?
| |
401 if (isNameStart(firstChar) || nextTwoCharsAreValidEscape()) | |
402 return true; | |
403 | |
404 if (firstChar == '-') { | |
405 if (isNameStart(m_input->peek(1))) | |
406 return true; | |
407 return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2)); | |
408 } | |
409 | |
410 return false; | |
411 } | |
412 | |
413 } // namespace WebCore | |
OLD | NEW |