OLD | NEW |
---|---|
(Empty) | |
1 /* | |
2 * Copyright (C) 2013 Google Inc. All rights reserved. | |
3 * | |
4 * Redistribution and use in source and binary forms, with or without | |
5 * modification, are permitted provided that the following conditions are | |
6 * met: | |
7 * | |
8 * * Redistributions of source code must retain the above copyright | |
9 * notice, this list of conditions and the following disclaimer. | |
10 * * Redistributions in binary form must reproduce the above | |
11 * copyright notice, this list of conditions and the following disclaimer | |
12 * in the documentation and/or other materials provided with the | |
13 * distribution. | |
14 * * Neither the name of Google Inc. nor the names of its | |
15 * contributors may be used to endorse or promote products derived from | |
16 * this software without specific prior written permission. | |
17 * | |
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 #include "config.h" | |
32 #include "core/css/parser/NewCSSTokenizer.h" | |
33 | |
34 #include "core/css/parser/CSSParserIdioms.h" | |
35 #include "platform/text/SegmentedString.h" | |
36 #include "wtf/TemporaryChange.h" | |
37 #include "wtf/unicode/CharacterNames.h" | |
38 | |
39 namespace WebCore { | |
40 | |
41 CSSInputStream::CSSInputStream(String input) | |
42 : m_offset(0) | |
43 , m_string(input) | |
44 { | |
45 m_string.append(kEndOfFileMarker); | |
abarth-chromium
2014/01/01 18:47:51
Hum... String::append is monstrously slow...
| |
46 } | |
47 | |
48 UChar CSSInputStream::currentInputChar() | |
49 { | |
50 ASSERT(m_offset < m_string.length()); | |
51 return m_string[m_offset]; | |
52 } | |
53 | |
54 UChar CSSInputStream::nextInputChar() | |
55 { | |
56 return m_string[m_offset + 1]; | |
57 } | |
58 | |
59 UChar CSSInputStream::peek2() | |
60 { | |
61 return m_string[m_offset + 2]; | |
62 } | |
63 | |
64 UChar CSSInputStream::peek3() | |
65 { | |
66 return m_string[m_offset + 3]; | |
67 } | |
68 | |
69 void CSSInputStream::advance() | |
70 { | |
71 m_offset++; | |
72 } | |
73 | |
74 void CSSInputStream::pushBack(UChar cc) | |
75 { | |
76 m_offset--; | |
77 ASSERT(currentInputChar() == cc); | |
78 } | |
79 | |
80 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point | |
81 static bool isNameStart(UChar c) | |
82 { | |
83 if (isASCIIAlpha(c)) | |
84 return true; | |
85 if (c == '_') | |
86 return true; | |
87 return !isASCII(c); | |
88 } | |
89 | |
90 // http://www.w3.org/TR/css-syntax-3/#name-code-point | |
91 static bool isNameChar(UChar c) | |
92 { | |
93 return isNameStart(c) || isASCIIDigit(c) || c == '-'; | |
94 } | |
95 | |
96 NewCSSTokenizer::NewCSSTokenizer() | |
97 { | |
98 } | |
99 | |
100 void NewCSSTokenizer::reconsume(UChar c) | |
101 { | |
102 m_input->pushBack(c); | |
103 } | |
104 | |
105 UChar NewCSSTokenizer::consume() | |
106 { | |
107 UChar current = m_input->currentInputChar(); | |
108 m_input->advance(); | |
109 return current; | |
110 } | |
111 | |
112 CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input) | |
113 { | |
114 // Unlike the HTMLTokenizer, the CSS Syntax spec is written | |
115 // as a stateless, (fixed-size) look-ahead tokenizer. | |
116 // We could move to the stateful model and instead create | |
117 // states for all the "next 3 codepoints are X" cases. | |
118 // State-machine tokenizers are easier to write to handle | |
119 // incremental tokenization of partial sources. | |
120 // However, for now we follow the spec exactly. | |
121 m_input = &input; | |
122 UChar cc = consume(); | |
123 | |
124 if (isCSSSpace(cc)) { | |
abarth-chromium
2014/01/01 18:47:51
I bet it's faster to implement this if-cascade usi
| |
125 // CSS Tokenization is currently lossy, but we could record | |
126 // the exact whitespace instead of discarding it here. | |
127 consumeUntilNotWhitespace(); | |
128 return CSSToken(WhitespaceToken); | |
129 } | |
130 if (cc == '\"' || cc == '\'') | |
131 return consumeStringTokenUntil(cc); | |
132 if (cc == '#') { | |
133 if (nextCharIsName() || nextTwoCharsAreValidEscape()) { | |
134 HashTokenType hashType = UnrestrictedHashToken; | |
135 if (nextCharsAreIdentifier()) | |
136 hashType = IdHashToken; | |
137 return CSSToken(HashToken, consumeName(), hashType); | |
138 } | |
139 return CSSToken(DelimToken, cc); | |
140 } | |
141 if (cc == '$') { | |
142 if (consumeIfNext('=')) | |
143 return CSSToken(SuffixMatchToken); | |
144 return CSSToken(DelimToken, cc); | |
145 } | |
146 if (cc == '(') | |
147 return CSSToken(LeftParenToken); | |
148 if (cc == ')') | |
149 return CSSToken(RightParenToken); | |
150 if (cc == '*') { | |
151 if (consumeIfNext('=')) | |
152 return CSSToken(SubstringMatchToken); | |
153 return CSSToken(DelimToken, cc); | |
154 } | |
155 if (cc == '+' || cc == '.') { | |
156 if (nextCharsAreNumber()) { | |
157 reconsume(cc); | |
158 return consumeNumericToken(); | |
159 } | |
160 return CSSToken(DelimToken, cc); | |
161 } | |
162 if (cc == ',') | |
163 return CSSToken(CommaToken); | |
164 if (cc == '-') { | |
165 if (nextCharsAreNumber()) { | |
166 reconsume(cc); | |
167 return consumeNumericToken(); | |
168 } | |
169 if (nextCharsAreIdentifier()) { | |
170 reconsume(cc); | |
171 return consumeIdentLikeToken(); | |
172 } | |
173 if (consumeIfNext("->")) | |
174 return CSSToken(CDCToken); | |
175 return CSSToken(DelimToken, cc); | |
176 } | |
177 if (cc == '/') { | |
178 if (consumeIfNext('*')) { | |
179 consumeThroughCommentEndOrUntilEOF(); | |
180 return nextToken(*m_input); | |
181 } | |
182 return CSSToken(DelimToken, cc); | |
183 } | |
184 if (cc == ':') | |
185 return CSSToken(ColonToken); | |
186 if (cc == ';') | |
187 return CSSToken(SemicolonToken); | |
188 if (cc == '<') { | |
189 if (consumeIfNext("!--")) | |
190 return CSSToken(CDOToken); | |
191 return CSSToken(DelimToken, cc); | |
192 } | |
193 if (cc == '@') { | |
194 if (nextCharsAreIdentifier()) | |
195 return CSSToken(AtKeywordToken, consumeName()); | |
196 return CSSToken(DelimToken, cc); | |
197 } | |
198 if (cc == '[') | |
199 return CSSToken(LeftBracketToken); | |
200 if (cc == '\\') { | |
201 if (nextIsValidEscape()) { | |
202 reconsume(cc); | |
203 return consumeIdentLikeToken(); | |
204 } | |
205 return CSSToken(DelimToken, cc); | |
206 } | |
207 if (cc == ']') | |
208 return CSSToken(RightBracketToken); | |
209 if (cc == '^') { | |
210 if (consumeIfNext('=')) | |
211 return CSSToken(PrefixMatchToken); | |
212 return CSSToken(DelimToken, cc); | |
213 } | |
214 if (cc == '{') | |
215 return CSSToken(LeftBraceToken); | |
216 if (cc == '{') | |
217 return CSSToken(RightBraceToken); | |
218 if (isASCIIDigit(cc)) | |
219 return consumeNumericToken(); | |
220 // if (cc == 'U' || cc == 'u') { | |
221 // // U+0055 LATIN CAPITAL LETTER U (U) | |
222 // // U+0075 LATIN SMALL LETTER U (u) | |
223 // // If the next 2 input code points are U+002B PLUS SIGN (+) followed by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. N ote: don’t consume both of them. Consume a unicode-range token and return it. | |
224 // // Otherwise, reconsume the current input code point, consume an iden t-like token, and return it. | |
225 // reconsume(cc); | |
226 // return consumeIdentLikeToken(); | |
227 // } | |
228 if (isNameStart(cc)) { | |
229 reconsume(cc); | |
230 return consumeIdentLikeToken(); | |
231 } | |
232 if (cc == '|') { | |
233 if (consumeIfNext('=')) | |
234 return CSSToken(DashMatchToken); | |
235 if (consumeIfNext('|')) | |
236 return CSSToken(ColumnToken); | |
237 return CSSToken(DelimToken, cc); | |
238 } | |
239 if (cc == '~') { | |
240 if (consumeIfNext('=')) | |
241 return CSSToken(IncludeMatchToken); | |
242 return CSSToken(DelimToken, cc); | |
243 } | |
244 if (cc == kEndOfFileMarker) | |
245 return CSSToken(EOFToken); | |
246 return CSSToken(DelimToken, cc); | |
247 } | |
248 | |
249 CSSToken NewCSSTokenizer::consumeNumber() | |
250 { | |
251 ASSERT(nextCharsAreNumber()); | |
252 String repr; | |
253 NumericValueType type = IntegerValueType; | |
254 double value = 0; | |
255 | |
256 // FIXME: Needs implementation. | |
257 // http://dev.w3.org/csswg/css-syntax/#consume-a-number0 | |
258 return CSSToken(NumberToken, repr, value, type); | |
259 } | |
260 | |
261 CSSToken NewCSSTokenizer::consumeNumericToken() | |
262 { | |
263 CSSToken token = consumeNumber(); | |
264 if (nextCharsAreIdentifier()) | |
265 token.convertToDimensionWithUnit(consumeName()); | |
266 else if (consumeIfNext("%")) | |
267 token.convertToPercentage(); | |
268 return token; | |
269 } | |
270 | |
271 CSSToken NewCSSTokenizer::consumeIdentLikeToken() | |
272 { | |
273 String name = consumeName(); | |
274 if (consumeIfNext('(')) { | |
275 if (equalIgnoringCase(name, "url")) | |
276 return consumeURLToken(); | |
277 return CSSToken(FunctionToken, name); | |
278 } | |
279 return CSSToken(IdentToken, name); | |
280 } | |
281 | |
282 CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint) | |
283 { | |
284 // FIXME: Implement. | |
285 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token | |
286 return CSSToken(BadStringToken); | |
287 } | |
288 | |
289 CSSToken NewCSSTokenizer::consumeURLToken() | |
290 { | |
291 return CSSToken(BadURLToken); | |
292 } | |
293 | |
294 void NewCSSTokenizer::consumeUntilNotWhitespace() | |
295 { | |
296 | |
297 } | |
298 | |
299 void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF() | |
300 { | |
301 | |
302 } | |
303 | |
304 bool NewCSSTokenizer::consumeIfNext(UChar) | |
305 { | |
306 return false; | |
307 } | |
308 | |
309 bool NewCSSTokenizer::consumeIfNext(String) | |
310 { | |
311 return false; | |
312 } | |
313 | |
314 String NewCSSTokenizer::consumeName() | |
315 { | |
316 // FIXME: This is written to match the spec | |
317 // but could be much more efficient. | |
318 String result(""); | |
319 while (true) { | |
320 if (isNameChar(m_input->currentInputChar())) { | |
321 result.append(consume()); | |
abarth-chromium
2014/01/01 18:47:51
Please use StringBuilder rather than String. Stri
| |
322 continue; | |
323 } | |
324 if (nextTwoCharsAreValidEscape()) { | |
325 consume(); // SPEC BUG: Emailed Tab. | |
326 result.append(consumeEscape()); | |
327 continue; | |
328 } | |
329 return result; | |
330 } | |
331 } | |
332 | |
333 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point | |
334 UChar NewCSSTokenizer::consumeEscape() | |
335 { | |
336 UChar cc = consume(); | |
337 ASSERT(cc != '\n'); | |
338 if (isASCIIHexDigit(cc)) { | |
339 unsigned consumedHexDigits = 1; | |
340 String hexChars; | |
abarth-chromium
2014/01/01 18:47:51
StringBuilder
| |
341 do { | |
342 hexChars.append(cc); | |
343 cc = consume(); | |
344 consumedHexDigits++; | |
345 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc)); | |
abarth-chromium
2014/01/01 18:47:51
You can reserve capacity 6 in the StringBuilder to
| |
346 bool ok = false; | |
347 UChar codePoint = hexChars.toUIntStrict(&ok, 16); | |
abarth-chromium
2014/01/01 18:47:51
Oh, actually, you don't need to malloc at all in t
| |
348 if (!ok) | |
349 return WTF::Unicode::replacementCharacter; | |
350 return codePoint; | |
351 } | |
352 if (cc == kEndOfFileMarker) | |
353 return WTF::Unicode::replacementCharacter; | |
354 return cc; | |
355 } | |
356 | |
357 bool NewCSSTokenizer::nextIsValidEscape() | |
358 { | |
359 return false; | |
360 } | |
361 | |
362 bool NewCSSTokenizer::nextCharIsName() | |
363 { | |
364 return false; | |
365 } | |
366 | |
367 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap eare-a-valid-escapestarts-with-a-valid-escape | |
368 bool NewCSSTokenizer::nextTwoCharsAreValidEscape() | |
369 { | |
370 UChar firstChar = m_input->nextInputChar(); | |
371 UChar secondChar = m_input->peek2(); | |
372 if (firstChar != '\\') | |
373 return false; | |
374 if (secondChar == '\n' || secondChar == kEndOfFileMarker) | |
375 return false; | |
376 return true; | |
377 } | |
378 | |
379 bool NewCSSTokenizer::nextCharsAreNumber() | |
380 { | |
381 return false; | |
382 } | |
383 | |
384 bool NewCSSTokenizer::nextCharsAreIdentifier() | |
385 { | |
386 return false; | |
387 } | |
388 | |
389 } // namespace WebCore | |
OLD | NEW |