Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 /* | |
| 2 * Copyright (C) 2013 Google Inc. All rights reserved. | |
| 3 * | |
| 4 * Redistribution and use in source and binary forms, with or without | |
| 5 * modification, are permitted provided that the following conditions are | |
| 6 * met: | |
| 7 * | |
| 8 * * Redistributions of source code must retain the above copyright | |
| 9 * notice, this list of conditions and the following disclaimer. | |
| 10 * * Redistributions in binary form must reproduce the above | |
| 11 * copyright notice, this list of conditions and the following disclaimer | |
| 12 * in the documentation and/or other materials provided with the | |
| 13 * distribution. | |
| 14 * * Neither the name of Google Inc. nor the names of its | |
| 15 * contributors may be used to endorse or promote products derived from | |
| 16 * this software without specific prior written permission. | |
| 17 * | |
| 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
| 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
| 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 */ | |
| 30 | |
| 31 #include "config.h" | |
| 32 #include "core/css/parser/NewCSSTokenizer.h" | |
| 33 | |
| 34 #include "core/css/parser/CSSParserIdioms.h" | |
| 35 #include "platform/text/SegmentedString.h" | |
| 36 #include "wtf/TemporaryChange.h" | |
| 37 #include "wtf/unicode/CharacterNames.h" | |
| 38 | |
| 39 namespace WebCore { | |
| 40 | |
| 41 CSSInputStream::CSSInputStream(String input) | |
| 42 : m_offset(0) | |
| 43 , m_string(input) | |
| 44 { | |
| 45 m_string.append(kEndOfFileMarker); | |
|
abarth-chromium
2014/01/01 18:47:51
Hum... String::append is monstrously slow...
| |
| 46 } | |
| 47 | |
| 48 UChar CSSInputStream::currentInputChar() | |
| 49 { | |
| 50 ASSERT(m_offset < m_string.length()); | |
| 51 return m_string[m_offset]; | |
| 52 } | |
| 53 | |
| 54 UChar CSSInputStream::nextInputChar() | |
| 55 { | |
| 56 return m_string[m_offset + 1]; | |
| 57 } | |
| 58 | |
| 59 UChar CSSInputStream::peek2() | |
| 60 { | |
| 61 return m_string[m_offset + 2]; | |
| 62 } | |
| 63 | |
| 64 UChar CSSInputStream::peek3() | |
| 65 { | |
| 66 return m_string[m_offset + 3]; | |
| 67 } | |
| 68 | |
| 69 void CSSInputStream::advance() | |
| 70 { | |
| 71 m_offset++; | |
| 72 } | |
| 73 | |
| 74 void CSSInputStream::pushBack(UChar cc) | |
| 75 { | |
| 76 m_offset--; | |
| 77 ASSERT(currentInputChar() == cc); | |
| 78 } | |
| 79 | |
| 80 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point | |
| 81 static bool isNameStart(UChar c) | |
| 82 { | |
| 83 if (isASCIIAlpha(c)) | |
| 84 return true; | |
| 85 if (c == '_') | |
| 86 return true; | |
| 87 return !isASCII(c); | |
| 88 } | |
| 89 | |
| 90 // http://www.w3.org/TR/css-syntax-3/#name-code-point | |
| 91 static bool isNameChar(UChar c) | |
| 92 { | |
| 93 return isNameStart(c) || isASCIIDigit(c) || c == '-'; | |
| 94 } | |
| 95 | |
| 96 NewCSSTokenizer::NewCSSTokenizer() | |
| 97 { | |
| 98 } | |
| 99 | |
| 100 void NewCSSTokenizer::reconsume(UChar c) | |
| 101 { | |
| 102 m_input->pushBack(c); | |
| 103 } | |
| 104 | |
| 105 UChar NewCSSTokenizer::consume() | |
| 106 { | |
| 107 UChar current = m_input->currentInputChar(); | |
| 108 m_input->advance(); | |
| 109 return current; | |
| 110 } | |
| 111 | |
| 112 CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input) | |
| 113 { | |
| 114 // Unlike the HTMLTokenizer, the CSS Syntax spec is written | |
| 115 // as a stateless, (fixed-size) look-ahead tokenizer. | |
| 116 // We could move to the stateful model and instead create | |
| 117 // states for all the "next 3 codepoints are X" cases. | |
| 118 // State-machine tokenizers are easier to write to handle | |
| 119 // incremental tokenization of partial sources. | |
| 120 // However, for now we follow the spec exactly. | |
| 121 m_input = &input; | |
| 122 UChar cc = consume(); | |
| 123 | |
| 124 if (isCSSSpace(cc)) { | |
|
abarth-chromium
2014/01/01 18:47:51
I bet it's faster to implement this if-cascade usi
| |
| 125 // CSS Tokenization is currently lossy, but we could record | |
| 126 // the exact whitespace instead of discarding it here. | |
| 127 consumeUntilNotWhitespace(); | |
| 128 return CSSToken(WhitespaceToken); | |
| 129 } | |
| 130 if (cc == '\"' || cc == '\'') | |
| 131 return consumeStringTokenUntil(cc); | |
| 132 if (cc == '#') { | |
| 133 if (nextCharIsName() || nextTwoCharsAreValidEscape()) { | |
| 134 HashTokenType hashType = UnrestrictedHashToken; | |
| 135 if (nextCharsAreIdentifier()) | |
| 136 hashType = IdHashToken; | |
| 137 return CSSToken(HashToken, consumeName(), hashType); | |
| 138 } | |
| 139 return CSSToken(DelimToken, cc); | |
| 140 } | |
| 141 if (cc == '$') { | |
| 142 if (consumeIfNext('=')) | |
| 143 return CSSToken(SuffixMatchToken); | |
| 144 return CSSToken(DelimToken, cc); | |
| 145 } | |
| 146 if (cc == '(') | |
| 147 return CSSToken(LeftParenToken); | |
| 148 if (cc == ')') | |
| 149 return CSSToken(RightParenToken); | |
| 150 if (cc == '*') { | |
| 151 if (consumeIfNext('=')) | |
| 152 return CSSToken(SubstringMatchToken); | |
| 153 return CSSToken(DelimToken, cc); | |
| 154 } | |
| 155 if (cc == '+' || cc == '.') { | |
| 156 if (nextCharsAreNumber()) { | |
| 157 reconsume(cc); | |
| 158 return consumeNumericToken(); | |
| 159 } | |
| 160 return CSSToken(DelimToken, cc); | |
| 161 } | |
| 162 if (cc == ',') | |
| 163 return CSSToken(CommaToken); | |
| 164 if (cc == '-') { | |
| 165 if (nextCharsAreNumber()) { | |
| 166 reconsume(cc); | |
| 167 return consumeNumericToken(); | |
| 168 } | |
| 169 if (nextCharsAreIdentifier()) { | |
| 170 reconsume(cc); | |
| 171 return consumeIdentLikeToken(); | |
| 172 } | |
| 173 if (consumeIfNext("->")) | |
| 174 return CSSToken(CDCToken); | |
| 175 return CSSToken(DelimToken, cc); | |
| 176 } | |
| 177 if (cc == '/') { | |
| 178 if (consumeIfNext('*')) { | |
| 179 consumeThroughCommentEndOrUntilEOF(); | |
| 180 return nextToken(*m_input); | |
| 181 } | |
| 182 return CSSToken(DelimToken, cc); | |
| 183 } | |
| 184 if (cc == ':') | |
| 185 return CSSToken(ColonToken); | |
| 186 if (cc == ';') | |
| 187 return CSSToken(SemicolonToken); | |
| 188 if (cc == '<') { | |
| 189 if (consumeIfNext("!--")) | |
| 190 return CSSToken(CDOToken); | |
| 191 return CSSToken(DelimToken, cc); | |
| 192 } | |
| 193 if (cc == '@') { | |
| 194 if (nextCharsAreIdentifier()) | |
| 195 return CSSToken(AtKeywordToken, consumeName()); | |
| 196 return CSSToken(DelimToken, cc); | |
| 197 } | |
| 198 if (cc == '[') | |
| 199 return CSSToken(LeftBracketToken); | |
| 200 if (cc == '\\') { | |
| 201 if (nextIsValidEscape()) { | |
| 202 reconsume(cc); | |
| 203 return consumeIdentLikeToken(); | |
| 204 } | |
| 205 return CSSToken(DelimToken, cc); | |
| 206 } | |
| 207 if (cc == ']') | |
| 208 return CSSToken(RightBracketToken); | |
| 209 if (cc == '^') { | |
| 210 if (consumeIfNext('=')) | |
| 211 return CSSToken(PrefixMatchToken); | |
| 212 return CSSToken(DelimToken, cc); | |
| 213 } | |
| 214 if (cc == '{') | |
| 215 return CSSToken(LeftBraceToken); | |
| 216 if (cc == '{') | |
| 217 return CSSToken(RightBraceToken); | |
| 218 if (isASCIIDigit(cc)) | |
| 219 return consumeNumericToken(); | |
| 220 // if (cc == 'U' || cc == 'u') { | |
| 221 // // U+0055 LATIN CAPITAL LETTER U (U) | |
| 222 // // U+0075 LATIN SMALL LETTER U (u) | |
| 223 // // If the next 2 input code points are U+002B PLUS SIGN (+) followed by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. N ote: don’t consume both of them. Consume a unicode-range token and return it. | |
| 224 // // Otherwise, reconsume the current input code point, consume an iden t-like token, and return it. | |
| 225 // reconsume(cc); | |
| 226 // return consumeIdentLikeToken(); | |
| 227 // } | |
| 228 if (isNameStart(cc)) { | |
| 229 reconsume(cc); | |
| 230 return consumeIdentLikeToken(); | |
| 231 } | |
| 232 if (cc == '|') { | |
| 233 if (consumeIfNext('=')) | |
| 234 return CSSToken(DashMatchToken); | |
| 235 if (consumeIfNext('|')) | |
| 236 return CSSToken(ColumnToken); | |
| 237 return CSSToken(DelimToken, cc); | |
| 238 } | |
| 239 if (cc == '~') { | |
| 240 if (consumeIfNext('=')) | |
| 241 return CSSToken(IncludeMatchToken); | |
| 242 return CSSToken(DelimToken, cc); | |
| 243 } | |
| 244 if (cc == kEndOfFileMarker) | |
| 245 return CSSToken(EOFToken); | |
| 246 return CSSToken(DelimToken, cc); | |
| 247 } | |
| 248 | |
| 249 CSSToken NewCSSTokenizer::consumeNumber() | |
| 250 { | |
| 251 ASSERT(nextCharsAreNumber()); | |
| 252 String repr; | |
| 253 NumericValueType type = IntegerValueType; | |
| 254 double value = 0; | |
| 255 | |
| 256 // FIXME: Needs implementation. | |
| 257 // http://dev.w3.org/csswg/css-syntax/#consume-a-number0 | |
| 258 return CSSToken(NumberToken, repr, value, type); | |
| 259 } | |
| 260 | |
| 261 CSSToken NewCSSTokenizer::consumeNumericToken() | |
| 262 { | |
| 263 CSSToken token = consumeNumber(); | |
| 264 if (nextCharsAreIdentifier()) | |
| 265 token.convertToDimensionWithUnit(consumeName()); | |
| 266 else if (consumeIfNext("%")) | |
| 267 token.convertToPercentage(); | |
| 268 return token; | |
| 269 } | |
| 270 | |
| 271 CSSToken NewCSSTokenizer::consumeIdentLikeToken() | |
| 272 { | |
| 273 String name = consumeName(); | |
| 274 if (consumeIfNext('(')) { | |
| 275 if (equalIgnoringCase(name, "url")) | |
| 276 return consumeURLToken(); | |
| 277 return CSSToken(FunctionToken, name); | |
| 278 } | |
| 279 return CSSToken(IdentToken, name); | |
| 280 } | |
| 281 | |
| 282 CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint) | |
| 283 { | |
| 284 // FIXME: Implement. | |
| 285 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token | |
| 286 return CSSToken(BadStringToken); | |
| 287 } | |
| 288 | |
| 289 CSSToken NewCSSTokenizer::consumeURLToken() | |
| 290 { | |
| 291 return CSSToken(BadURLToken); | |
| 292 } | |
| 293 | |
| 294 void NewCSSTokenizer::consumeUntilNotWhitespace() | |
| 295 { | |
| 296 | |
| 297 } | |
| 298 | |
| 299 void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF() | |
| 300 { | |
| 301 | |
| 302 } | |
| 303 | |
| 304 bool NewCSSTokenizer::consumeIfNext(UChar) | |
| 305 { | |
| 306 return false; | |
| 307 } | |
| 308 | |
| 309 bool NewCSSTokenizer::consumeIfNext(String) | |
| 310 { | |
| 311 return false; | |
| 312 } | |
| 313 | |
| 314 String NewCSSTokenizer::consumeName() | |
| 315 { | |
| 316 // FIXME: This is written to match the spec | |
| 317 // but could be much more efficient. | |
| 318 String result(""); | |
| 319 while (true) { | |
| 320 if (isNameChar(m_input->currentInputChar())) { | |
| 321 result.append(consume()); | |
|
abarth-chromium
2014/01/01 18:47:51
Please use StringBuilder rather than String. Stri
| |
| 322 continue; | |
| 323 } | |
| 324 if (nextTwoCharsAreValidEscape()) { | |
| 325 consume(); // SPEC BUG: Emailed Tab. | |
| 326 result.append(consumeEscape()); | |
| 327 continue; | |
| 328 } | |
| 329 return result; | |
| 330 } | |
| 331 } | |
| 332 | |
| 333 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point | |
| 334 UChar NewCSSTokenizer::consumeEscape() | |
| 335 { | |
| 336 UChar cc = consume(); | |
| 337 ASSERT(cc != '\n'); | |
| 338 if (isASCIIHexDigit(cc)) { | |
| 339 unsigned consumedHexDigits = 1; | |
| 340 String hexChars; | |
|
abarth-chromium
2014/01/01 18:47:51
StringBuilder
| |
| 341 do { | |
| 342 hexChars.append(cc); | |
| 343 cc = consume(); | |
| 344 consumedHexDigits++; | |
| 345 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc)); | |
|
abarth-chromium
2014/01/01 18:47:51
You can reserve capacity 6 in the StringBuilder to
| |
| 346 bool ok = false; | |
| 347 UChar codePoint = hexChars.toUIntStrict(&ok, 16); | |
|
abarth-chromium
2014/01/01 18:47:51
Oh, actually, you don't need to malloc at all in t
| |
| 348 if (!ok) | |
| 349 return WTF::Unicode::replacementCharacter; | |
| 350 return codePoint; | |
| 351 } | |
| 352 if (cc == kEndOfFileMarker) | |
| 353 return WTF::Unicode::replacementCharacter; | |
| 354 return cc; | |
| 355 } | |
| 356 | |
| 357 bool NewCSSTokenizer::nextIsValidEscape() | |
| 358 { | |
| 359 return false; | |
| 360 } | |
| 361 | |
| 362 bool NewCSSTokenizer::nextCharIsName() | |
| 363 { | |
| 364 return false; | |
| 365 } | |
| 366 | |
| 367 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap eare-a-valid-escapestarts-with-a-valid-escape | |
| 368 bool NewCSSTokenizer::nextTwoCharsAreValidEscape() | |
| 369 { | |
| 370 UChar firstChar = m_input->nextInputChar(); | |
| 371 UChar secondChar = m_input->peek2(); | |
| 372 if (firstChar != '\\') | |
| 373 return false; | |
| 374 if (secondChar == '\n' || secondChar == kEndOfFileMarker) | |
| 375 return false; | |
| 376 return true; | |
| 377 } | |
| 378 | |
| 379 bool NewCSSTokenizer::nextCharsAreNumber() | |
| 380 { | |
| 381 return false; | |
| 382 } | |
| 383 | |
| 384 bool NewCSSTokenizer::nextCharsAreIdentifier() | |
| 385 { | |
| 386 return false; | |
| 387 } | |
| 388 | |
| 389 } // namespace WebCore | |
| OLD | NEW |