OLD | NEW |
(Empty) | |
| 1 /* |
| 2 * Copyright (C) 2013 Google Inc. All rights reserved. |
| 3 * |
| 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions are |
| 6 * met: |
| 7 * |
| 8 * * Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. |
| 10 * * Redistributions in binary form must reproduce the above |
| 11 * copyright notice, this list of conditions and the following disclaimer |
| 12 * in the documentation and/or other materials provided with the |
| 13 * distribution. |
| 14 * * Neither the name of Google Inc. nor the names of its |
| 15 * contributors may be used to endorse or promote products derived from |
| 16 * this software without specific prior written permission. |
| 17 * |
| 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
| 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
| 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
| 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
| 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| 29 */ |
| 30 |
| 31 #include "config.h" |
| 32 #include "core/css/parser/NewCSSTokenizer.h" |
| 33 |
| 34 #include "core/css/parser/CSSInputStream.h" |
| 35 #include "core/css/parser/CSSParserIdioms.h" |
| 36 #include "platform/text/SegmentedString.h" |
| 37 #include "wtf/TemporaryChange.h" |
| 38 #include "wtf/unicode/CharacterNames.h" |
| 39 |
| 40 namespace WebCore { |
| 41 |
| 42 // http://dev.w3.org/csswg/css-syntax/#name-start-code-point |
| 43 static bool isNameStart(UChar c) |
| 44 { |
| 45 if (isASCIIAlpha(c)) |
| 46 return true; |
| 47 if (c == '_') |
| 48 return true; |
| 49 return !isASCII(c); |
| 50 } |
| 51 |
| 52 // http://www.w3.org/TR/css-syntax-3/#name-code-point |
| 53 static bool isNameChar(UChar c) |
| 54 { |
| 55 return isNameStart(c) || isASCIIDigit(c) || c == '-'; |
| 56 } |
| 57 |
| 58 // http://www.w3.org/TR/css-syntax-3/#check-if-two-code-points-are-a-valid-escap
e |
| 59 static bool twoCharsAreValidEscape(UChar first, UChar second) |
| 60 { |
| 61 return ((first == '\\') && (second != '\n') && (second != kEndOfFileMarker))
; |
| 62 } |
| 63 |
| 64 NewCSSTokenizer::NewCSSTokenizer() |
| 65 { |
| 66 } |
| 67 |
| 68 void NewCSSTokenizer::reconsume(UChar c) |
| 69 { |
| 70 m_input->pushBack(c); |
| 71 } |
| 72 |
| 73 UChar NewCSSTokenizer::consume() |
| 74 { |
| 75 UChar current = m_input->currentInputChar(); |
| 76 m_input->advance(); |
| 77 return current; |
| 78 } |
| 79 |
| 80 void NewCSSTokenizer::tokenize(String string, Vector<CSSToken>& outTokens) |
| 81 { |
| 82 NewCSSTokenizer tokenizer; |
| 83 CSSInputStream input(string); |
| 84 while (true) { |
| 85 outTokens.append(tokenizer.nextToken(input)); |
| 86 if (outTokens.last().type() == EOFToken) |
| 87 return; |
| 88 } |
| 89 } |
| 90 |
| 91 CSSToken NewCSSTokenizer::nextToken(CSSInputStream& input) |
| 92 { |
| 93 // Unlike the HTMLTokenizer, the CSS Syntax spec is written |
| 94 // as a stateless, (fixed-size) look-ahead tokenizer. |
| 95 // We could move to the stateful model and instead create |
| 96 // states for all the "next 3 codepoints are X" cases. |
| 97 // State-machine tokenizers are easier to write to handle |
| 98 // incremental tokenization of partial sources. |
| 99 // However, for now we follow the spec exactly. |
| 100 m_input = &input; |
| 101 UChar cc = consume(); |
| 102 |
| 103 if (isCSSSpace(cc)) { |
| 104 // CSS Tokenization is currently lossy, but we could record |
| 105 // the exact whitespace instead of discarding it here. |
| 106 consumeUntilNotWhitespace(); |
| 107 return CSSToken(WhitespaceToken); |
| 108 } |
| 109 if (cc == '\"' || cc == '\'') |
| 110 return consumeStringTokenUntil(cc); |
| 111 if (cc == '#') { |
| 112 if (nextCharIsNameChar() || nextTwoCharsAreValidEscape()) { |
| 113 HashTokenType hashType = UnrestrictedHashToken; |
| 114 if (nextCharsAreIdentifier()) |
| 115 hashType = IdHashToken; |
| 116 return CSSToken(HashToken, consumeName(), hashType); |
| 117 } |
| 118 return CSSToken(DelimToken, cc); |
| 119 } |
| 120 if (cc == '$') { |
| 121 if (consumeIfNext('=')) |
| 122 return CSSToken(SuffixMatchToken); |
| 123 return CSSToken(DelimToken, cc); |
| 124 } |
| 125 if (cc == '(') |
| 126 return CSSToken(LeftParenToken); |
| 127 if (cc == ')') |
| 128 return CSSToken(RightParenToken); |
| 129 if (cc == '*') { |
| 130 if (consumeIfNext('=')) |
| 131 return CSSToken(SubstringMatchToken); |
| 132 return CSSToken(DelimToken, cc); |
| 133 } |
| 134 if (cc == '+' || cc == '.') { |
| 135 if (nextCharsAreNumber()) { |
| 136 reconsume(cc); |
| 137 return consumeNumericToken(); |
| 138 } |
| 139 return CSSToken(DelimToken, cc); |
| 140 } |
| 141 if (cc == ',') |
| 142 return CSSToken(CommaToken); |
| 143 if (cc == '-') { |
| 144 if (nextCharsAreNumber()) { |
| 145 reconsume(cc); |
| 146 return consumeNumericToken(); |
| 147 } |
| 148 if (nextCharsAreIdentifier()) { |
| 149 reconsume(cc); |
| 150 return consumeIdentLikeToken(); |
| 151 } |
| 152 if (consumeIfNext("->")) |
| 153 return CSSToken(CDCToken); |
| 154 return CSSToken(DelimToken, cc); |
| 155 } |
| 156 if (cc == '/') { |
| 157 if (consumeIfNext('*')) { |
| 158 consumeThroughCommentEndOrUntilEOF(); |
| 159 return nextToken(*m_input); |
| 160 } |
| 161 return CSSToken(DelimToken, cc); |
| 162 } |
| 163 if (cc == ':') |
| 164 return CSSToken(ColonToken); |
| 165 if (cc == ';') |
| 166 return CSSToken(SemicolonToken); |
| 167 if (cc == '<') { |
| 168 if (consumeIfNext("!--")) |
| 169 return CSSToken(CDOToken); |
| 170 return CSSToken(DelimToken, cc); |
| 171 } |
| 172 if (cc == '@') { |
| 173 if (nextCharsAreIdentifier()) |
| 174 return CSSToken(AtKeywordToken, consumeName()); |
| 175 return CSSToken(DelimToken, cc); |
| 176 } |
| 177 if (cc == '[') |
| 178 return CSSToken(LeftBracketToken); |
| 179 if (cc == '\\') { |
| 180 if (twoCharsAreValidEscape(cc, m_input->currentInputChar())) { |
| 181 reconsume(cc); |
| 182 return consumeIdentLikeToken(); |
| 183 } |
| 184 return CSSToken(DelimToken, cc); |
| 185 } |
| 186 if (cc == ']') |
| 187 return CSSToken(RightBracketToken); |
| 188 if (cc == '^') { |
| 189 if (consumeIfNext('=')) |
| 190 return CSSToken(PrefixMatchToken); |
| 191 return CSSToken(DelimToken, cc); |
| 192 } |
| 193 if (cc == '{') |
| 194 return CSSToken(LeftBraceToken); |
| 195 if (cc == '{') |
| 196 return CSSToken(RightBraceToken); |
| 197 if (isASCIIDigit(cc)) { |
| 198 // "reconsume" here is not according to spec, but required AFAICT. |
| 199 // https://www.w3.org/Bugs/Public/show_bug.cgi?id=24661 |
| 200 reconsume(cc); |
| 201 return consumeNumericToken(); |
| 202 } |
| 203 // if (cc == 'U' || cc == 'u') { |
| 204 // // U+0055 LATIN CAPITAL LETTER U (U) |
| 205 // // U+0075 LATIN SMALL LETTER U (u) |
| 206 // // If the next 2 input code points are U+002B PLUS SIGN (+) followed
by a hex digit or U+003F QUESTION MARK (?), consume the next input code point. N
ote: don’t consume both of them. Consume a unicode-range token and return it. |
| 207 // // Otherwise, reconsume the current input code point, consume an iden
t-like token, and return it. |
| 208 // reconsume(cc); |
| 209 // return consumeIdentLikeToken(); |
| 210 // } |
| 211 if (isNameStart(cc)) { |
| 212 reconsume(cc); |
| 213 return consumeIdentLikeToken(); |
| 214 } |
| 215 if (cc == '|') { |
| 216 if (consumeIfNext('=')) |
| 217 return CSSToken(DashMatchToken); |
| 218 if (consumeIfNext('|')) |
| 219 return CSSToken(ColumnToken); |
| 220 return CSSToken(DelimToken, cc); |
| 221 } |
| 222 if (cc == '~') { |
| 223 if (consumeIfNext('=')) |
| 224 return CSSToken(IncludeMatchToken); |
| 225 return CSSToken(DelimToken, cc); |
| 226 } |
| 227 if (cc == kEndOfFileMarker) |
| 228 return CSSToken(EOFToken); |
| 229 return CSSToken(DelimToken, cc); |
| 230 } |
| 231 |
| 232 // This method merges the following spec sections for efficiency |
| 233 // http://www.w3.org/TR/css3-syntax/#consume-a-number |
| 234 // http://www.w3.org/TR/css3-syntax/#convert-a-string-to-a-number |
| 235 CSSToken NewCSSTokenizer::consumeNumber() |
| 236 { |
| 237 ASSERT(nextCharsAreNumber()); |
| 238 // FIXME - repr should get the value as a string, even though I'm not sure i
t's useful |
| 239 String repr; |
| 240 NumericValueType type = IntegerValueType; |
| 241 double value = 0; |
| 242 int sign = 1; |
| 243 unsigned peekOffset = 0; |
| 244 int exponentSign = 1; |
| 245 unsigned exponentStartPos = 0; |
| 246 unsigned exponentEndPos = 0; |
| 247 unsigned fractionStartPos = 0; |
| 248 unsigned fractionEndPos = 0; |
| 249 unsigned integerPart; |
| 250 unsigned fractionPart; |
| 251 unsigned fractionDigits; |
| 252 unsigned exponentPart; |
| 253 if (m_input->currentInputChar() == '+') { |
| 254 ++peekOffset; |
| 255 } else if (m_input->peek(peekOffset) == '-') { |
| 256 sign = -1; |
| 257 ++peekOffset; |
| 258 } |
| 259 unsigned intStartPos = peekOffset; |
| 260 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset); |
| 261 unsigned intEndPos = peekOffset; |
| 262 if (m_input->peek(peekOffset) == '.' && isASCIIDigit(m_input->peek(++peekOff
set))) { |
| 263 fractionStartPos = peekOffset; |
| 264 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset); |
| 265 fractionEndPos = peekOffset; |
| 266 } |
| 267 if ((m_input->peek(peekOffset) == 'E' || m_input->peek(peekOffset) == 'e'))
{ |
| 268 ++peekOffset; |
| 269 if (m_input->peek(peekOffset) == '+') { |
| 270 ++peekOffset; |
| 271 } else if (m_input->peek(peekOffset) =='-') { |
| 272 exponentSign = -1; |
| 273 ++peekOffset; |
| 274 } |
| 275 exponentStartPos = peekOffset; |
| 276 peekOffset = m_input->skipWhilePredicate<isASCIIDigit>(peekOffset); |
| 277 exponentEndPos = peekOffset; |
| 278 } |
| 279 integerPart = m_input->getUnsignedInt(intStartPos, intEndPos); |
| 280 fractionPart = m_input->getUnsignedInt(fractionStartPos, fractionEndPos); |
| 281 fractionDigits = fractionEndPos - fractionStartPos; |
| 282 exponentPart = m_input->getUnsignedInt(exponentStartPos, exponentEndPos); |
| 283 value = sign * (integerPart + fractionPart * pow(10, -1 * fractionDigits)) *
pow(10, exponentSign * exponentPart); |
| 284 |
| 285 m_input->advance(peekOffset); |
| 286 // FIXME - Always returning an Integer type. Need to look at fractions, etc. |
| 287 |
| 288 return CSSToken(NumberToken, repr, value, type); |
| 289 } |
| 290 |
| 291 // http://www.w3.org/TR/css3-syntax/#consume-a-numeric-token |
| 292 CSSToken NewCSSTokenizer::consumeNumericToken() |
| 293 { |
| 294 CSSToken token = consumeNumber(); |
| 295 if (nextCharsAreIdentifier()) |
| 296 token.convertToDimensionWithUnit(consumeName()); |
| 297 else if (consumeIfNext('%')) |
| 298 token.convertToPercentage(); |
| 299 return token; |
| 300 } |
| 301 |
| 302 // http://www.w3.org/TR/css3-syntax/#consume-an-ident-like-token |
| 303 CSSToken NewCSSTokenizer::consumeIdentLikeToken() |
| 304 { |
| 305 String name = consumeName(); |
| 306 if (consumeIfNext('(')) { |
| 307 if (equalIgnoringCase(name, "url")) |
| 308 return consumeURLToken(); |
| 309 return CSSToken(FunctionToken, name); |
| 310 } |
| 311 return CSSToken(IdentToken, name); |
| 312 } |
| 313 |
| 314 // http://dev.w3.org/csswg/css-syntax/#consume-a-string-token |
| 315 CSSToken NewCSSTokenizer::consumeStringTokenUntil(UChar endingCodePoint) |
| 316 { |
| 317 // FIXME: Implement. |
| 318 return CSSToken(BadStringToken); |
| 319 } |
| 320 |
| 321 // http://www.w3.org/TR/css3-syntax/#consume-a-url-token |
| 322 CSSToken NewCSSTokenizer::consumeURLToken() |
| 323 { |
| 324 // FIXME: Implement. |
| 325 return CSSToken(BadURLToken); |
| 326 } |
| 327 |
| 328 void NewCSSTokenizer::consumeUntilNotWhitespace() |
| 329 { |
| 330 while (m_input->currentInputChar() == '\t' || m_input->currentInputChar() ==
' ' || m_input->currentInputChar() == '\n') |
| 331 consume(); |
| 332 } |
| 333 |
| 334 void NewCSSTokenizer::consumeThroughCommentEndOrUntilEOF() |
| 335 { |
| 336 // FIXME: Implement. |
| 337 } |
| 338 |
| 339 bool NewCSSTokenizer::consumeIfNext(UChar character) |
| 340 { |
| 341 return (m_input->currentInputChar() == character); |
| 342 } |
| 343 |
| 344 bool NewCSSTokenizer::consumeIfNext(String str) |
| 345 { |
| 346 for (unsigned i = 0; i < str.length(); ++i) { |
| 347 if (str[i] != m_input->peek(i)) |
| 348 return false; |
| 349 } |
| 350 return true; |
| 351 } |
| 352 |
| 353 // http://www.w3.org/TR/css3-syntax/#consume-a-name |
| 354 String NewCSSTokenizer::consumeName() |
| 355 { |
| 356 // FIXME: This is written to match the spec |
| 357 // but could be much more efficient. |
| 358 String result(""); |
| 359 while (true) { |
| 360 if (isNameChar(m_input->currentInputChar())) { |
| 361 result.append(consume()); |
| 362 continue; |
| 363 } |
| 364 if (nextTwoCharsAreValidEscape()) { |
| 365 consume(); // SPEC BUG: Emailed Tab. |
| 366 result.append(consumeEscape()); |
| 367 continue; |
| 368 } |
| 369 return result; |
| 370 } |
| 371 } |
| 372 |
| 373 // http://www.w3.org/TR/css-syntax-3/#consume-an-escaped-code-point |
| 374 UChar NewCSSTokenizer::consumeEscape() |
| 375 { |
| 376 UChar cc = consume(); |
| 377 ASSERT(cc != '\n'); |
| 378 if (isASCIIHexDigit(cc)) { |
| 379 unsigned consumedHexDigits = 1; |
| 380 String hexChars; |
| 381 do { |
| 382 hexChars.append(cc); |
| 383 cc = consume(); |
| 384 consumedHexDigits++; |
| 385 } while (consumedHexDigits < 6 && isASCIIHexDigit(cc)); |
| 386 bool ok = false; |
| 387 UChar codePoint = hexChars.toUIntStrict(&ok, 16); |
| 388 if (!ok) |
| 389 return WTF::Unicode::replacementCharacter; |
| 390 return codePoint; |
| 391 } |
| 392 if (cc == kEndOfFileMarker) |
| 393 return WTF::Unicode::replacementCharacter; |
| 394 return cc; |
| 395 } |
| 396 |
| 397 bool NewCSSTokenizer::nextCharIsNameChar() |
| 398 { |
| 399 return isNameChar(m_input->currentInputChar()); |
| 400 } |
| 401 |
| 402 bool NewCSSTokenizer::nextTwoCharsAreValidEscape() |
| 403 { |
| 404 return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2)); |
| 405 } |
| 406 |
| 407 // http://www.w3.org/TR/css3-syntax/#starts-with-a-number |
| 408 bool NewCSSTokenizer::nextCharsAreNumber() |
| 409 { |
| 410 UChar first = m_input->currentInputChar(); |
| 411 UChar second = m_input->peek(1); |
| 412 if (isASCIIDigit(first)) |
| 413 return true; |
| 414 if (first == '+' || first == '-') |
| 415 return ((isASCIIDigit(second)) || (second == '.' && isASCIIDigit(m_input
->peek(2)))); |
| 416 if (first =='.') |
| 417 return (isASCIIDigit(second)); |
| 418 return false; |
| 419 } |
| 420 |
| 421 // http://www.w3.org/TR/css3-syntax/#would-start-an-identifier |
| 422 bool NewCSSTokenizer::nextCharsAreIdentifier() |
| 423 { |
| 424 UChar firstChar = m_input->currentInputChar(); |
| 425 if (isNameStart(firstChar) || nextTwoCharsAreValidEscape()) |
| 426 return true; |
| 427 |
| 428 if (firstChar == '-') { |
| 429 if (isNameStart(m_input->peek(1))) |
| 430 return true; |
| 431 return twoCharsAreValidEscape(m_input->peek(1), m_input->peek(2)); |
| 432 } |
| 433 |
| 434 return false; |
| 435 } |
| 436 |
| 437 } // namespace WebCore |
OLD | NEW |