Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(48)

Side by Side Diff: Source/core/css/CSSTokenizer-in.cpp

Issue 196353018: Smaller CSSParser UTF16 buffers for escaped strings. (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Rewrote comments. Created 6 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org) 2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com) 3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)
4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved. 4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.
5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com> 5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>
6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org> 6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>
7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/) 7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved. 8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.
9 * Copyright (C) 2012 Intel Corporation. All rights reserved. 9 * Copyright (C) 2012 Intel Corporation. All rights reserved.
10 * 10 *
(...skipping 286 matching lines...) Expand 10 before | Expand all | Expand 10 after
297 { 297 {
298 return m_currentCharacter8; 298 return m_currentCharacter8;
299 } 299 }
300 300
301 template <> 301 template <>
302 inline UChar*& CSSTokenizer::currentCharacter<UChar>() 302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()
303 { 303 {
304 return m_currentCharacter16; 304 return m_currentCharacter16;
305 } 305 }
306 306
307 UChar*& CSSTokenizer::currentCharacter16() 307 UChar* CSSTokenizer::getStringBuffer16(size_t len)
Julien - ping for review 2014/03/20 20:57:39 We usually don't put the word "get" on getters as
Daniel Bratell 2014/03/21 15:14:39 Done.
308 { 308 {
309 if (!m_currentCharacter16) { 309 OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);
310 m_dataStart16 = adoptArrayPtr(new UChar[m_length]);
311 m_currentCharacter16 = m_dataStart16.get();
312 }
313 310
314 return m_currentCharacter16; 311 UChar* bufferPtr = buffer.get();
312
313 m_cssStrings16.append(buffer.release());
314 return bufferPtr;
315 } 315 }
316 316
317 template <> 317 template <>
318 inline LChar* CSSTokenizer::dataStart<LChar>() 318 inline LChar* CSSTokenizer::dataStart<LChar>()
319 { 319 {
320 return m_dataStart8.get(); 320 return m_dataStart8.get();
321 } 321 }
322 322
323 template <> 323 template <>
324 inline UChar* CSSTokenizer::dataStart<UChar>() 324 inline UChar* CSSTokenizer::dataStart<UChar>()
(...skipping 54 matching lines...) Expand 10 before | Expand all | Expand 10 after
379 currentCharacter += currentCharacter[2] == '\n' ? 3 : 2; 379 currentCharacter += currentCharacter[2] == '\n' ? 3 : 2;
380 } else { 380 } else {
381 currentCharacter = checkAndSkipEscape(currentCharacter); 381 currentCharacter = checkAndSkipEscape(currentCharacter);
382 if (!currentCharacter) 382 if (!currentCharacter)
383 return 0; 383 return 0;
384 } 384 }
385 } 385 }
386 } 386 }
387 387
388 template <typename CharacterType> 388 template <typename CharacterType>
389 unsigned CSSTokenizer::parseEscape(CharacterType*& src) 389 unsigned CSSTokenizer::parseEscape(CharacterType*& src)
Julien - ping for review 2014/03/20 20:57:39 This function should now have static linkage.
Daniel Bratell 2014/03/21 15:14:39 Hmm, not sure I understand. A function can have in
Julien - ping for review 2014/03/21 17:56:26 I was talking about file level internal function a
390 { 390 {
391 ASSERT(*src == '\\' && isCSSEscape(src[1])); 391 ASSERT(*src == '\\' && isCSSEscape(src[1]));
392 392
393 unsigned unicode = 0; 393 unsigned unicode = 0;
394 394
395 ++src; 395 ++src;
396 if (isASCIIHexDigit(*src)) { 396 if (isASCIIHexDigit(*src)) {
397 397
398 int length = 6; 398 int length = 6;
399 399
400 do { 400 do {
401 unicode = (unicode << 4) + toASCIIHexValue(*src++); 401 unicode = (unicode << 4) + toASCIIHexValue(*src++);
402 } while (--length && isASCIIHexDigit(*src)); 402 } while (--length && isASCIIHexDigit(*src));
403 403
404 // Characters above 0x10ffff are not handled. 404 // Characters above 0x10ffff are not handled.
405 if (unicode > 0x10ffff) 405 if (unicode > 0x10ffff)
406 unicode = 0xfffd; 406 unicode = 0xfffd;
407 407
408 // Optional space after the escape sequence. 408 // Optional space after the escape sequence.
409 if (isHTMLSpace<CharacterType>(*src)) 409 if (isHTMLSpace<CharacterType>(*src))
410 ++src; 410 ++src;
411 411
412 return unicode; 412 return unicode;
413 } 413 }
414 414
415 return *currentCharacter<CharacterType>()++; 415 return *src++;
416 } 416 }
417 417
418 template <> 418 template <>
419 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode) 419 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode)
420 { 420 {
421 ASSERT(unicode <= 0xff); 421 ASSERT(unicode <= 0xff);
422 *result = unicode; 422 *result = unicode;
423 423
424 ++result; 424 ++result;
425 } 425 }
426 426
427 template <> 427 template <>
428 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode) 428 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode)
429 { 429 {
430 // Replace unicode with a surrogate pairs when it is bigger than 0xffff 430 // Replace unicode with a surrogate pairs when it is bigger than 0xffff
431 if (U16_LENGTH(unicode) == 2) { 431 if (U16_LENGTH(unicode) == 2) {
432 *result++ = U16_LEAD(unicode); 432 *result++ = U16_LEAD(unicode);
433 *result = U16_TRAIL(unicode); 433 *result = U16_TRAIL(unicode);
434 } else { 434 } else {
435 *result = unicode; 435 *result = unicode;
436 } 436 }
437 437
438 ++result; 438 ++result;
439 } 439 }
440 440
441 template <typename SrcCharacterType>
442 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
443 {
444 // The decoded form of an identifier (after resolving escape
445 // sequences) will not contain more characters (ASCII or UTF-16
446 // codepoints) than the input. This code can therefore ignore
447 // escape sequences completely.
448 SrcCharacterType* start = src;
449 do {
450 if (LIKELY(*src != '\\'))
451 src++;
452 else
453 parseEscape<SrcCharacterType>(src);
454 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
455
456 return src - start;
457 }
458
441 template <typename SrcCharacterType, typename DestCharacterType> 459 template <typename SrcCharacterType, typename DestCharacterType>
442 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape) 460 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)
443 { 461 {
444 hasEscape = false; 462 hasEscape = false;
445 do { 463 do {
446 if (LIKELY(*src != '\\')) { 464 if (LIKELY(*src != '\\')) {
447 *result++ = *src++; 465 *result++ = *src++;
448 } else { 466 } else {
449 hasEscape = true; 467 hasEscape = true;
450 SrcCharacterType* savedEscapeStart = src; 468 SrcCharacterType* savedEscapeStart = src;
(...skipping 13 matching lines...) Expand all
464 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape) 482 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape)
465 { 483 {
466 // If a valid identifier start is found, we can safely 484 // If a valid identifier start is found, we can safely
467 // parse the identifier until the next invalid character. 485 // parse the identifier until the next invalid character.
468 ASSERT(isIdentifierStart<CharacterType>()); 486 ASSERT(isIdentifierStart<CharacterType>());
469 487
470 CharacterType* start = currentCharacter<CharacterType>(); 488 CharacterType* start = currentCharacter<CharacterType>();
471 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) { 489 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {
472 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue 490 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
473 ASSERT(is8BitSource()); 491 ASSERT(is8BitSource());
474 UChar*& result16 = currentCharacter16(); 492 UChar* result16 = getStringBuffer16((result - start) + peekMaxIdentifierLen(result));
475 UChar* start16 = result16; 493 UChar* start16 = result16;
476 int i = 0; 494 int i = 0;
477 for (; i < result - start; i++) 495 for (; i < result - start; i++)
478 result16[i] = start[i]; 496 result16[i] = start[i];
479 497
480 result16 += i; 498 result16 += i;
481 499
482 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape); 500 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);
483 501
484 resultString.init(start16, result16 - start16); 502 resultString.init(start16, result16 - start16);
485 503
486 return; 504 return;
487 } 505 }
488 506
489 resultString.init(start, result - start); 507 resultString.init(start, result - start);
490 } 508 }
491 509
510 template <typename SrcCharacterType>
511 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
512 {
513 // The decoded form of a CSS string (after resolving escape
514 // sequences) will not contain more characters (ASCII or UTF-16
515 // codepoints) than the input. This code can therefore ignore
516 // escape sequences completely.
517 SrcCharacterType* start = src;
518 while (true) {
519 if (UNLIKELY(*src == quote)) {
520 // String parsing is done.
521 ++src;
522 break;
523 }
524 if (UNLIKELY(!*src)) {
525 // String parsing is done, but don't advance pointer if at the end of input.
526 break;
527 }
528 ASSERT(*src > '\r' || (*src < '\n' && *src) || *src == '\v');
529
530 if (LIKELY(src[0] != '\\'))
531 src++;
532 else if (src[1] == '\n' || src[1] == '\f')
533 src += 2;
534 else if (src[1] == '\r')
535 src += src[2] == '\n' ? 3 : 2;
Julien - ping for review 2014/03/20 20:57:39 This looks awfully like checkAndSkipString, maybe
536 else
537 parseEscape<SrcCharacterType>(src);
538 }
539
540 return src - start;
541 }
542
492 template <typename SrcCharacterType, typename DestCharacterType> 543 template <typename SrcCharacterType, typename DestCharacterType>
493 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote) 544 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)
494 { 545 {
495 while (true) { 546 while (true) {
496 if (UNLIKELY(*src == quote)) { 547 if (UNLIKELY(*src == quote)) {
497 // String parsing is done. 548 // String parsing is done.
498 ++src; 549 ++src;
499 return true; 550 return true;
500 } 551 }
501 if (UNLIKELY(!*src)) { 552 if (UNLIKELY(!*src)) {
(...skipping 23 matching lines...) Expand all
525 } 576 }
526 577
527 template <typename CharacterType> 578 template <typename CharacterType>
528 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote) 579 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote)
529 { 580 {
530 CharacterType* start = currentCharacter<CharacterType>(); 581 CharacterType* start = currentCharacter<CharacterType>();
531 582
532 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) { 583 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
533 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue 584 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
534 ASSERT(is8BitSource()); 585 ASSERT(is8BitSource());
535 UChar*& result16 = currentCharacter16(); 586 UChar* result16 = getStringBuffer16((result - start) + peekMaxStringLen(result, quote));
536 UChar* start16 = result16; 587 UChar* start16 = result16;
537 int i = 0; 588 int i = 0;
538 for (; i < result - start; i++) 589 for (; i < result - start; i++)
539 result16[i] = start[i]; 590 result16[i] = start[i];
540 591
541 result16 += i; 592 result16 += i;
542 593
543 parseStringInternal(currentCharacter<CharacterType>(), result16, quote); 594 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);
544 595
545 resultString.init(start16, result16 - start16); 596 resultString.init(start16, result16 - start16);
(...skipping 27 matching lines...) Expand all
573 } 624 }
574 } 625 }
575 626
576 end = skipWhiteSpace(end); 627 end = skipWhiteSpace(end);
577 if (*end != ')') 628 if (*end != ')')
578 return false; 629 return false;
579 630
580 return true; 631 return true;
581 } 632 }
582 633
634 template <typename SrcCharacterType>
635 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
636 {
637 // The decoded form of a URI (after resolving escape sequences)
638 // will not contain more characters (ASCII or UTF-16 codepoints)
639 // than the input. This code can therefore ignore escape sequences
640 // completely.
641 SrcCharacterType* start = src;
642 if (quote) {
643 ASSERT(quote == '"' || quote == '\'');
644 return peekMaxStringLen(src, quote);
645 }
646
647 while (isURILetter(*src)) {
648 if (LIKELY(*src != '\\'))
649 src++;
650 else
651 parseEscape<SrcCharacterType>(src);
652 }
653
654 return src - start;
655 }
656
583 template <typename SrcCharacterType, typename DestCharacterType> 657 template <typename SrcCharacterType, typename DestCharacterType>
584 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote) 658 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)
585 { 659 {
586 if (quote) { 660 if (quote) {
587 ASSERT(quote == '"' || quote == '\''); 661 ASSERT(quote == '"' || quote == '\'');
588 return parseStringInternal(src, dest, quote); 662 return parseStringInternal(src, dest, quote);
589 } 663 }
590 664
591 while (isURILetter(*src)) { 665 while (isURILetter(*src)) {
592 if (LIKELY(*src != '\\')) { 666 if (LIKELY(*src != '\\')) {
593 *dest++ = *src++; 667 *dest++ = *src++;
594 } else { 668 } else {
595 unsigned unicode = parseEscape<SrcCharacterType>(src); 669 unsigned unicode = parseEscape<SrcCharacterType>(src);
596 if (unicode > 0xff && sizeof(SrcCharacterType) == 1) 670 if (unicode > 0xff && sizeof(DestCharacterType) == 1)
597 return false; 671 return false;
598 UnicodeToChars(dest, unicode); 672 UnicodeToChars(dest, unicode);
599 } 673 }
600 } 674 }
601 675
602 return true; 676 return true;
603 } 677 }
604 678
605 template <typename CharacterType> 679 template <typename CharacterType>
606 inline void CSSTokenizer::parseURI(CSSParserString& string) 680 inline void CSSTokenizer::parseURI(CSSParserString& string)
607 { 681 {
608 CharacterType* uriStart; 682 CharacterType* uriStart;
609 CharacterType* uriEnd; 683 CharacterType* uriEnd;
610 UChar quote; 684 UChar quote;
611 if (!findURI(uriStart, uriEnd, quote)) 685 if (!findURI(uriStart, uriEnd, quote))
612 return; 686 return;
613 687
614 CharacterType* dest = currentCharacter<CharacterType>() = uriStart; 688 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;
615 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) { 689 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) {
616 string.init(uriStart, dest - uriStart); 690 string.init(uriStart, dest - uriStart);
617 } else { 691 } else {
618 // An escape sequence was encountered that can't be stored in 8 bits. 692 // An escape sequence was encountered that can't be stored in 8 bits.
619 // Reset the current character to the start of the URI and re-parse with 693 // Reset the current character to the start of the URI and re-parse with
620 // a 16-bit destination. 694 // a 16-bit destination.
621 ASSERT(is8BitSource()); 695 ASSERT(is8BitSource());
622 UChar* uriStart16 = currentCharacter16(); 696 UChar* result16 = getStringBuffer16(peekMaxURILen(uriStart, quote));
697 UChar* uriStart16 = result16;
623 currentCharacter<CharacterType>() = uriStart; 698 currentCharacter<CharacterType>() = uriStart;
624 bool result = parseURIInternal(currentCharacter<CharacterType>(), currentCharacter16(), quote); 699 bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);
625 ASSERT_UNUSED(result, result); 700 ASSERT_UNUSED(result, result);
626 string.init(uriStart16, currentCharacter16() - uriStart16); 701 string.init(uriStart16, result16 - uriStart16);
627 } 702 }
628 703
629 currentCharacter<CharacterType>() = uriEnd + 1; 704 currentCharacter<CharacterType>() = uriEnd + 1;
630 m_token = URI; 705 m_token = URI;
631 } 706 }
632 707
633 template <typename CharacterType> 708 template <typename CharacterType>
634 inline bool CSSTokenizer::parseUnicodeRange() 709 inline bool CSSTokenizer::parseUnicodeRange()
635 { 710 {
636 CharacterType* character = currentCharacter<CharacterType>() + 1; 711 CharacterType* character = currentCharacter<CharacterType>() + 1;
(...skipping 889 matching lines...) Expand 10 before | Expand all | Expand 10 after
1526 m_dataStart16[length - 1] = 0; 1601 m_dataStart16[length - 1] = 0;
1527 1602
1528 m_is8BitSource = false; 1603 m_is8BitSource = false;
1529 m_currentCharacter8 = 0; 1604 m_currentCharacter8 = 0;
1530 m_currentCharacter16 = m_dataStart16.get(); 1605 m_currentCharacter16 = m_dataStart16.get();
1531 setTokenStart<UChar>(m_currentCharacter16); 1606 setTokenStart<UChar>(m_currentCharacter16);
1532 m_lexFunc = &CSSTokenizer::realLex<UChar>; 1607 m_lexFunc = &CSSTokenizer::realLex<UChar>;
1533 } 1608 }
1534 1609
1535 } // namespace WebCore 1610 } // namespace WebCore
OLDNEW
« Source/core/css/CSSTokenizer.h ('K') | « Source/core/css/CSSTokenizer.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698