Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(424)

Side by Side Diff: Source/core/css/parser/NewCSSTokenizer.cpp

Issue 123053002: Add very basic CSS3 Syntax compatible tokenizer Base URL: svn://svn.chromium.org/blink/trunk
Patch Set: Add CSSToken file Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 /*
2 * Copyright (C) 2013 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "config.h"
32 #include "core/css/parser/NewCSSTokenizer.h"
33
34 #include "core/css/parser/CSSParserIdioms.h"
35 #include "platform/text/SegmentedString.h"
36 #include "wtf/TemporaryChange.h"
37 #include "wtf/unicode/CharacterNames.h"
38
39 namespace WebCore {
40
41 CSSInputStream::CSSInputStream(String input)
42 : m_offset(0)
43 , m_string(input)
44 {
45 m_string.append(kEndOfFileMarker);
abarth-chromium 2014/01/01 18:47:51 Hum... String::append is monstrously slow...
46 }
47
48 UChar CSSInputStream::currentInputChar()
49 {
50 ASSERT(m_offset < m_string.length());
51 return m_string[m_offset];
52 }
53
54 UChar CSSInputStream::nextInputChar()
55 {
56 return m_string[m_offset + 1];
57 }
58
59 UChar CSSInputStream::peek2()
60 {
61 return m_string[m_offset + 2];
62 }
63
64 UChar CSSInputStream::peek3()
65 {
66 return m_string[m_offset + 3];
67 }
68
69 void CSSInputStream::advance()
70 {
71 m_offset++;
72 }
73
74 void CSSInputStream::pushBack(UChar cc)
75 {
76 m_offset--;
77 ASSERT(currentInputChar() == cc);
78 }
79
80 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point
81 static bool isNameStart(UChar c)
82 {
83 if (isASCIIAlpha(c))
84 return true;
85 if (c == '_')
86 return true;
87 return !isASCII(c);
88 }
89
90 // http://www.w3.org/TR/css-syntax-3/#name-code-point
91 static bool isNameChar(UChar c)
92 {
93 return isNameStart(c) || isASCIIDigit(c) || c == '-';
94 }
95
96 NewCSSTokenizer::NewCSSTokenizer()
97 {
98 }
99
100 void NewCSSTokenizer::reconsume(UChar c)
101 {
102 m_input->pushBack(c);
103 }
104
105 UChar NewCSSTokenizer::consume()
106 {
107 UChar current = m_input->currentInputChar();
108 m_input->advance();
109 return current;
110 }
111
112 CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input)
113 {
114 // Unlike the HTMLTokenizer, the CSS Syntax spec is written
115 // as a stateless, (fixed-size) look-ahead tokenizer.
116 // We could move to the stateful model and instead create
117 // states for all the "next 3 codepoints are X" cases.
118 // State-machine tokenizers are easier to write to handle
119 // incremental tokenization of partial sources.
120 // However, for now we follow the spec exactly.
121 m_input = &input;
122 UChar cc = consume();
123
124 if (isCSSSpace(cc)) {
abarth-chromium 2014/01/01 18:47:51 I bet it's faster to implement this if-cascade usi
125 // CSS Tokenization is currently lossy, but we could record
126 // the exact whitespace instead of discarding it here.
127 consumeUntilNotWhitespace();
128 return CSSToken(WhitespaceToken);
129 }
130 if (cc == '\"' || cc == '\'')
131 return consumeStringTokenUntil(cc);
132 if (cc == '#') {
133 if (nextCharIsName() || nextTwoCharsAreValidEscape()) {
134 HashTokenType hashType = UnrestrictedHashToken;
135 if (nextCharsAreIdentifier())
136 hashType = IdHashToken;
137 return CSSToken(HashToken, consumeName(), hashType);
138 }
139 return CSSToken(DelimToken, cc);
140 }
141 if (cc == '$') {
142 if (consumeIfNext('='))
143 return CSSToken(SuffixMatchToken);
144 return CSSToken(DelimToken, cc);
145 }
146 if (cc == '(')
147 return CSSToken(LeftParenToken);
148 if (cc == ')')
149 return CSSToken(RightParenToken);
150 if (cc == '*') {
151 if (consumeIfNext('='))
152 return CSSToken(SubstringMatchToken);
153 return CSSToken(DelimToken, cc);
154 }
155 if (cc == '+' || cc == '.') {
156 if (nextCharsAreNumber()) {
157 reconsume(cc);
158 return consumeNumericToken();
159 }
160 return CSSToken(DelimToken, cc);
161 }
162 if (cc == ',')
163 return CSSToken(CommaToken);
164 if (cc == '-') {
165 if (nextCharsAreNumber()) {
166 reconsume(cc);
167 return consumeNumericToken();
168 }
169 if (nextCharsAreIdentifier()) {
170 reconsume(cc);
171 return consumeIdentLikeToken();
172 }
173 if (consumeIfNext("->"))
174 return CSSToken(CDCToken);
175 return CSSToken(DelimToken, cc);
176 }
177 if (cc == '/') {
178 if (consumeIfNext('*')) {
179 consumeThroughCommentEndOrUntilEOF();
180 return nextToken(*m_input);
181 }
182 return CSSToken(DelimToken, cc);
183 }
184 if (cc == ':')
185 return CSSToken(ColonToken);
186 if (cc == ';')
187 return CSSToken(SemicolonToken);
188 if (cc == '<') {
189 if (consumeIfNext("!--"))
190 return CSSToken(CDOToken);
191 return CSSToken(DelimToken, cc);
192 }
193 if (cc == '@') {
194 if (nextCharsAreIdentifier())
195 return CSSToken(AtKeywordToken, consumeName());
196 return CSSToken(DelimToken, cc);
197 }
198 if (cc == '[')
199 return CSSToken(LeftBracketToken);
200 if (cc == '\\') {
201 if (nextIsValidEscape()) {
202 reconsume(cc);
203 return consumeIdentLikeToken();
204 }
205 return CSSToken(DelimToken, cc);
206 }
207 if (cc == ']')
208 return CSSToken(RightBracketToken);
209 if (cc == '^') {
210 if (consumeIfNext('='))
211 return CSSToken(PrefixMatchToken);
212 return CSSToken(DelimToken, cc);
213 }
214 if (cc == '{')
215 return CSSToken(LeftBraceToken);
216 if (cc == '{')
217 return CSSToken(RightBraceToken);
218 if (isASCIIDigit(cc))
219 return consumeNumericToken();
220 // if (cc == 'U' || cc == 'u') {
221 // // U+0055 LATIN CAPITAL LETTER U (U)
222 // // U+0075 LATIN SMALL LETTER U (u)
223 // // If the next 2 input code points are U+002B PLUS SIGN (+) followed by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. N ote: don’t consume both of them. Consume a unicode-range token and return it.
224 // // Otherwise, reconsume the current input code point, consume an iden t-like token, and return it.
225 // reconsume(cc);
226 // return consumeIdentLikeToken();
227 // }
228 if (isNameStart(cc)) {
229 reconsume(cc);
230 return consumeIdentLikeToken();
231 }
232 if (cc == '|') {
233 if (consumeIfNext('='))
234 return CSSToken(DashMatchToken);
235 if (consumeIfNext('|'))
236 return CSSToken(ColumnToken);
237 return CSSToken(DelimToken, cc);
238 }
239 if (cc == '~') {
240 if (consumeIfNext('='))
241 return CSSToken(IncludeMatchToken);
242 return CSSToken(DelimToken, cc);
243 }
244 if (cc == kEndOfFileMarker)
245 return CSSToken(EOFToken);
246 return CSSToken(DelimToken, cc);
247 }
248
249 CSSToken NewCSSTokenizer::consumeNumber()
250 {
251 ASSERT(nextCharsAreNumber());
252 String repr;
253 NumericValueType type = IntegerValueType;
254 double value = 0;
255
256 // FIXME: Needs implementation.
257 // http://dev.w3.org/csswg/css-syntax/#consume-a-number0
258 return CSSToken(NumberToken, repr, value, type);
259 }
260
261 CSSToken NewCSSTokenizer::consumeNumericToken()
262 {
263 CSSToken token = consumeNumber();
264 if (nextCharsAreIdentifier())
265 token.convertToDimensionWithUnit(consumeName());
266 else if (consumeIfNext("%"))
267 token.convertToPercentage();
268 return token;
269 }
270
271 CSSToken NewCSSTokenizer::consumeIdentLikeToken()
272 {
273 String name = consumeName();
274 if (consumeIfNext('(')) {
275 if (equalIgnoringCase(name, "url"))
276 return consumeURLToken();
277 return CSSToken(FunctionToken, name);
278 }
279 return CSSToken(IdentToken, name);
280 }
281
282 CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint)
283 {
284 // FIXME: Implement.
285 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token
286 return CSSToken(BadStringToken);
287 }
288
289 CSSToken NewCSSTokenizer::consumeURLToken()
290 {
291 return CSSToken(BadURLToken);
292 }
293
294 void NewCSSTokenizer::consumeUntilNotWhitespace()
295 {
296
297 }
298
299 void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF()
300 {
301
302 }
303
304 bool NewCSSTokenizer::consumeIfNext(UChar)
305 {
306 return false;
307 }
308
309 bool NewCSSTokenizer::consumeIfNext(String)
310 {
311 return false;
312 }
313
314 String NewCSSTokenizer::consumeName()
315 {
316 // FIXME: This is written to match the spec
317 // but could be much more efficient.
318 String result("");
319 while (true) {
320 if (isNameChar(m_input->currentInputChar())) {
321 result.append(consume());
abarth-chromium 2014/01/01 18:47:51 Please use StringBuilder rather than String. Stri
322 continue;
323 }
324 if (nextTwoCharsAreValidEscape()) {
325 consume(); // SPEC BUG: Emailed Tab.
326 result.append(consumeEscape());
327 continue;
328 }
329 return result;
330 }
331 }
332
333 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point
334 UChar NewCSSTokenizer::consumeEscape()
335 {
336 UChar cc = consume();
337 ASSERT(cc != '\n');
338 if (isASCIIHexDigit(cc)) {
339 unsigned consumedHexDigits = 1;
340 String hexChars;
abarth-chromium 2014/01/01 18:47:51 StringBuilder
341 do {
342 hexChars.append(cc);
343 cc = consume();
344 consumedHexDigits++;
345 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc));
abarth-chromium 2014/01/01 18:47:51 You can reserve capacity 6 in the StringBuilder to
346 bool ok = false;
347 UChar codePoint = hexChars.toUIntStrict(&ok, 16);
abarth-chromium 2014/01/01 18:47:51 Oh, actually, you don't need to malloc at all in t
348 if (!ok)
349 return WTF::Unicode::replacementCharacter;
350 return codePoint;
351 }
352 if (cc == kEndOfFileMarker)
353 return WTF::Unicode::replacementCharacter;
354 return cc;
355 }
356
357 bool NewCSSTokenizer::nextIsValidEscape()
358 {
359 return false;
360 }
361
362 bool NewCSSTokenizer::nextCharIsName()
363 {
364 return false;
365 }
366
367 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap eare-a-valid-escapestarts-with-a-valid-escape
368 bool NewCSSTokenizer::nextTwoCharsAreValidEscape()
369 {
370 UChar firstChar = m_input->nextInputChar();
371 UChar secondChar = m_input->peek2();
372 if (firstChar != '\\')
373 return false;
374 if (secondChar == '\n' || secondChar == kEndOfFileMarker)
375 return false;
376 return true;
377 }
378
379 bool NewCSSTokenizer::nextCharsAreNumber()
380 {
381 return false;
382 }
383
384 bool NewCSSTokenizer::nextCharsAreIdentifier()
385 {
386 return false;
387 }
388
389 } // namespace WebCore
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698