Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(23)

Side by Side Diff: Source/core/css/CSSTokenizer-in.cpp

Issue 196353018: Smaller CSSParser UTF16 buffers for escaped strings. (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Rebased Created 6 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « Source/core/css/CSSTokenizer.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org) 2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)
3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com) 3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)
4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved. 4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.
5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com> 5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>
6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org> 6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>
7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/) 7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)
8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved. 8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.
9 * Copyright (C) 2012 Intel Corporation. All rights reserved. 9 * Copyright (C) 2012 Intel Corporation. All rights reserved.
10 * 10 *
(...skipping 286 matching lines...) Expand 10 before | Expand all | Expand 10 after
297 { 297 {
298 return m_currentCharacter8; 298 return m_currentCharacter8;
299 } 299 }
300 300
301 template <> 301 template <>
302 inline UChar*& CSSTokenizer::currentCharacter<UChar>() 302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()
303 { 303 {
304 return m_currentCharacter16; 304 return m_currentCharacter16;
305 } 305 }
306 306
307 UChar*& CSSTokenizer::currentCharacter16() 307 UChar* CSSTokenizer::allocateStringBuffer16(size_t len)
308 { 308 {
309 if (!m_currentCharacter16) { 309 // Allocates and returns a CSSTokenizer owned buffer for storing
310 m_dataStart16 = adoptArrayPtr(new UChar[m_length]); 310 // UTF-16 data. Used to get a suitable life span for UTF-16
311 m_currentCharacter16 = m_dataStart16.get(); 311 // strings, identifiers and URIs created by the tokenizer.
312 } 312 OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);
313 313
314 return m_currentCharacter16; 314 UChar* bufferPtr = buffer.get();
315
316 m_cssStrings16.append(buffer.release());
317 return bufferPtr;
315 } 318 }
316 319
317 template <> 320 template <>
318 inline LChar* CSSTokenizer::dataStart<LChar>() 321 inline LChar* CSSTokenizer::dataStart<LChar>()
319 { 322 {
320 return m_dataStart8.get(); 323 return m_dataStart8.get();
321 } 324 }
322 325
323 template <> 326 template <>
324 inline UChar* CSSTokenizer::dataStart<UChar>() 327 inline UChar* CSSTokenizer::dataStart<UChar>()
(...skipping 80 matching lines...) Expand 10 before | Expand all | Expand 10 after
405 if (unicode > 0x10ffff) 408 if (unicode > 0x10ffff)
406 unicode = 0xfffd; 409 unicode = 0xfffd;
407 410
408 // Optional space after the escape sequence. 411 // Optional space after the escape sequence.
409 if (isHTMLSpace<CharacterType>(*src)) 412 if (isHTMLSpace<CharacterType>(*src))
410 ++src; 413 ++src;
411 414
412 return unicode; 415 return unicode;
413 } 416 }
414 417
415 return *currentCharacter<CharacterType>()++; 418 return *src++;
416 } 419 }
417 420
418 template <> 421 template <>
419 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode ) 422 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode )
420 { 423 {
421 ASSERT(unicode <= 0xff); 424 ASSERT(unicode <= 0xff);
422 *result = unicode; 425 *result = unicode;
423 426
424 ++result; 427 ++result;
425 } 428 }
426 429
427 template <> 430 template <>
428 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode ) 431 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode )
429 { 432 {
430 // Replace unicode with a surrogate pair when it is bigger than 0xffff 433 // Replace unicode with a surrogate pair when it is bigger than 0xffff
431 if (U16_LENGTH(unicode) == 2) { 434 if (U16_LENGTH(unicode) == 2) {
432 *result++ = U16_LEAD(unicode); 435 *result++ = U16_LEAD(unicode);
433 *result = U16_TRAIL(unicode); 436 *result = U16_TRAIL(unicode);
434 } else { 437 } else {
435 *result = unicode; 438 *result = unicode;
436 } 439 }
437 440
438 ++result; 441 ++result;
439 } 442 }
440 443
444 template <typename SrcCharacterType>
445 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)
446 {
447 // The decoded form of an identifier (after resolving escape
448 // sequences) will not contain more characters (ASCII or UTF-16
449 // codepoints) than the input. This code can therefore ignore
450 // escape sequences completely.
451 SrcCharacterType* start = src;
452 do {
453 if (LIKELY(*src != '\\'))
454 src++;
455 else
456 parseEscape<SrcCharacterType>(src);
457 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));
458
459 return src - start;
460 }
461
441 template <typename SrcCharacterType, typename DestCharacterType> 462 template <typename SrcCharacterType, typename DestCharacterType>
442 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCh aracterType*& result, bool& hasEscape) 463 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCh aracterType*& result, bool& hasEscape)
443 { 464 {
444 hasEscape = false; 465 hasEscape = false;
445 do { 466 do {
446 if (LIKELY(*src != '\\')) { 467 if (LIKELY(*src != '\\')) {
447 *result++ = *src++; 468 *result++ = *src++;
448 } else { 469 } else {
449 hasEscape = true; 470 hasEscape = true;
450 SrcCharacterType* savedEscapeStart = src; 471 SrcCharacterType* savedEscapeStart = src;
(...skipping 13 matching lines...) Expand all
464 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin g& resultString, bool& hasEscape) 485 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin g& resultString, bool& hasEscape)
465 { 486 {
466 // If a valid identifier start is found, we can safely 487 // If a valid identifier start is found, we can safely
467 // parse the identifier until the next invalid character. 488 // parse the identifier until the next invalid character.
468 ASSERT(isIdentifierStart<CharacterType>()); 489 ASSERT(isIdentifierStart<CharacterType>());
469 490
470 CharacterType* start = currentCharacter<CharacterType>(); 491 CharacterType* start = currentCharacter<CharacterType>();
471 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), res ult, hasEscape))) { 492 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), res ult, hasEscape))) {
472 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue 493 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
473 ASSERT(is8BitSource()); 494 ASSERT(is8BitSource());
474 UChar*& result16 = currentCharacter16(); 495 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdent ifierLen(result));
475 UChar* start16 = result16; 496 UChar* start16 = result16;
476 int i = 0; 497 int i = 0;
477 for (; i < result - start; i++) 498 for (; i < result - start; i++)
478 result16[i] = start[i]; 499 result16[i] = start[i];
479 500
480 result16 += i; 501 result16 += i;
481 502
482 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, has Escape); 503 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, has Escape);
483 504
484 resultString.init(start16, result16 - start16); 505 resultString.init(start16, result16 - start16);
485 506
486 return; 507 return;
487 } 508 }
488 509
489 resultString.init(start, result - start); 510 resultString.init(start, result - start);
490 } 511 }
491 512
513 template <typename SrcCharacterType>
514 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)
515 {
516 // The decoded form of a CSS string (after resolving escape
517 // sequences) will not contain more characters (ASCII or UTF-16
518 // codepoints) than the input. This code can therefore ignore
519 // escape sequences completely and just return the length of the
520 // input string (possibly including terminating quote if any).
521 SrcCharacterType* end = checkAndSkipString(src, quote);
522 return end ? end - src : 0;
523 }
524
492 template <typename SrcCharacterType, typename DestCharacterType> 525 template <typename SrcCharacterType, typename DestCharacterType>
493 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharac terType*& result, UChar quote) 526 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharac terType*& result, UChar quote)
494 { 527 {
495 while (true) { 528 while (true) {
496 if (UNLIKELY(*src == quote)) { 529 if (UNLIKELY(*src == quote)) {
497 // String parsing is done. 530 // String parsing is done.
498 ++src; 531 ++src;
499 return true; 532 return true;
500 } 533 }
501 if (UNLIKELY(!*src)) { 534 if (UNLIKELY(!*src)) {
(...skipping 23 matching lines...) Expand all
525 } 558 }
526 559
527 template <typename CharacterType> 560 template <typename CharacterType>
528 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r esultString, UChar quote) 561 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r esultString, UChar quote)
529 { 562 {
530 CharacterType* start = currentCharacter<CharacterType>(); 563 CharacterType* start = currentCharacter<CharacterType>();
531 564
532 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) { 565 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {
533 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue 566 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue
534 ASSERT(is8BitSource()); 567 ASSERT(is8BitSource());
535 UChar*& result16 = currentCharacter16(); 568 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStrin gLen(result, quote));
536 UChar* start16 = result16; 569 UChar* start16 = result16;
537 int i = 0; 570 int i = 0;
538 for (; i < result - start; i++) 571 for (; i < result - start; i++)
539 result16[i] = start[i]; 572 result16[i] = start[i];
540 573
541 result16 += i; 574 result16 += i;
542 575
543 parseStringInternal(currentCharacter<CharacterType>(), result16, quote); 576 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);
544 577
545 resultString.init(start16, result16 - start16); 578 resultString.init(start16, result16 - start16);
(...skipping 27 matching lines...) Expand all
573 } 606 }
574 } 607 }
575 608
576 end = skipWhiteSpace(end); 609 end = skipWhiteSpace(end);
577 if (*end != ')') 610 if (*end != ')')
578 return false; 611 return false;
579 612
580 return true; 613 return true;
581 } 614 }
582 615
616 template <typename SrcCharacterType>
617 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)
618 {
619 // The decoded form of a URI (after resolving escape sequences)
620 // will not contain more characters (ASCII or UTF-16 codepoints)
621 // than the input. This code can therefore ignore escape sequences
622 // completely.
623 SrcCharacterType* start = src;
624 if (quote) {
625 ASSERT(quote == '"' || quote == '\'');
626 return peekMaxStringLen(src, quote);
627 }
628
629 while (isURILetter(*src)) {
630 if (LIKELY(*src != '\\'))
631 src++;
632 else
633 parseEscape<SrcCharacterType>(src);
634 }
635
636 return src - start;
637 }
638
583 template <typename SrcCharacterType, typename DestCharacterType> 639 template <typename SrcCharacterType, typename DestCharacterType>
584 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacter Type*& dest, UChar quote) 640 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacter Type*& dest, UChar quote)
585 { 641 {
586 if (quote) { 642 if (quote) {
587 ASSERT(quote == '"' || quote == '\''); 643 ASSERT(quote == '"' || quote == '\'');
588 return parseStringInternal(src, dest, quote); 644 return parseStringInternal(src, dest, quote);
589 } 645 }
590 646
591 while (isURILetter(*src)) { 647 while (isURILetter(*src)) {
592 if (LIKELY(*src != '\\')) { 648 if (LIKELY(*src != '\\')) {
593 *dest++ = *src++; 649 *dest++ = *src++;
594 } else { 650 } else {
595 unsigned unicode = parseEscape<SrcCharacterType>(src); 651 unsigned unicode = parseEscape<SrcCharacterType>(src);
596 if (unicode > 0xff && sizeof(SrcCharacterType) == 1) 652 if (unicode > 0xff && sizeof(DestCharacterType) == 1)
597 return false; 653 return false;
598 UnicodeToChars(dest, unicode); 654 UnicodeToChars(dest, unicode);
599 } 655 }
600 } 656 }
601 657
602 return true; 658 return true;
603 } 659 }
604 660
605 template <typename CharacterType> 661 template <typename CharacterType>
606 inline void CSSTokenizer::parseURI(CSSParserString& string) 662 inline void CSSTokenizer::parseURI(CSSParserString& string)
607 { 663 {
608 CharacterType* uriStart; 664 CharacterType* uriStart;
609 CharacterType* uriEnd; 665 CharacterType* uriEnd;
610 UChar quote; 666 UChar quote;
611 if (!findURI(uriStart, uriEnd, quote)) 667 if (!findURI(uriStart, uriEnd, quote))
612 return; 668 return;
613 669
614 CharacterType* dest = currentCharacter<CharacterType>() = uriStart; 670 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;
615 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote)) ) { 671 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote)) ) {
616 string.init(uriStart, dest - uriStart); 672 string.init(uriStart, dest - uriStart);
617 } else { 673 } else {
618 // An escape sequence was encountered that can't be stored in 8 bits. 674 // An escape sequence was encountered that can't be stored in 8 bits.
619 // Reset the current character to the start of the URI and re-parse with 675 // Reset the current character to the start of the URI and re-parse with
620 // a 16-bit destination. 676 // a 16-bit destination.
621 ASSERT(is8BitSource()); 677 ASSERT(is8BitSource());
622 UChar* uriStart16 = currentCharacter16(); 678 UChar* result16 = allocateStringBuffer16(peekMaxURILen(uriStart, quote)) ;
679 UChar* uriStart16 = result16;
623 currentCharacter<CharacterType>() = uriStart; 680 currentCharacter<CharacterType>() = uriStart;
624 bool result = parseURIInternal(currentCharacter<CharacterType>(), curren tCharacter16(), quote); 681 bool result = parseURIInternal(currentCharacter<CharacterType>(), result 16, quote);
625 ASSERT_UNUSED(result, result); 682 ASSERT_UNUSED(result, result);
626 string.init(uriStart16, currentCharacter16() - uriStart16); 683 string.init(uriStart16, result16 - uriStart16);
627 } 684 }
628 685
629 currentCharacter<CharacterType>() = uriEnd + 1; 686 currentCharacter<CharacterType>() = uriEnd + 1;
630 m_token = URI; 687 m_token = URI;
631 } 688 }
632 689
633 template <typename CharacterType> 690 template <typename CharacterType>
634 inline bool CSSTokenizer::parseUnicodeRange() 691 inline bool CSSTokenizer::parseUnicodeRange()
635 { 692 {
636 CharacterType* character = currentCharacter<CharacterType>() + 1; 693 CharacterType* character = currentCharacter<CharacterType>() + 1;
(...skipping 886 matching lines...) Expand 10 before | Expand all | Expand 10 after
1523 m_dataStart16[length - 1] = 0; 1580 m_dataStart16[length - 1] = 0;
1524 1581
1525 m_is8BitSource = false; 1582 m_is8BitSource = false;
1526 m_currentCharacter8 = 0; 1583 m_currentCharacter8 = 0;
1527 m_currentCharacter16 = m_dataStart16.get(); 1584 m_currentCharacter16 = m_dataStart16.get();
1528 setTokenStart<UChar>(m_currentCharacter16); 1585 setTokenStart<UChar>(m_currentCharacter16);
1529 m_lexFunc = &CSSTokenizer::realLex<UChar>; 1586 m_lexFunc = &CSSTokenizer::realLex<UChar>;
1530 } 1587 }
1531 1588
1532 } // namespace WebCore 1589 } // namespace WebCore
OLDNEW
« no previous file with comments | « Source/core/css/CSSTokenizer.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698