Source/core/css/CSSTokenizer-in.cpp - Issue 196353018: Smaller CSSParser UTF16 buffers for escaped strings.

Side by Side Diff: Source/core/css/CSSTokenizer-in.cpp

Issue 196353018: Smaller CSSParser UTF16 buffers for escaped strings. (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master

Patch Set: Rebased Created 6 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)	2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)

3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)	3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)

4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.	4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.

5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>	5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>

6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>	6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>

7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)	7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)

8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.	8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.

9 * Copyright (C) 2012 Intel Corporation. All rights reserved.	9 * Copyright (C) 2012 Intel Corporation. All rights reserved.

10 *	10 *

(...skipping 286 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
297 {	297 {

298 return m_currentCharacter8;	298 return m_currentCharacter8;

299 }	299 }

300	300

301 template <>	301 template <>

302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()	302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()

303 {	303 {

304 return m_currentCharacter16;	304 return m_currentCharacter16;

305 }	305 }

306	306

307 UChar*& CSSTokenizer::currentCharacter16()	307 UChar* CSSTokenizer::allocateStringBuffer16(size_t len)

308 {	308 {

309 if (!m_currentCharacter16) {	309 // Allocates and returns a CSSTokenizer owned buffer for storing

310 m_dataStart16 = adoptArrayPtr(new UChar[m_length]);	310 // UTF-16 data. Used to get a suitable life span for UTF-16

311 m_currentCharacter16 = m_dataStart16.get();	311 // strings, identifiers and URIs created by the tokenizer.

312 }	312 OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);

313	313

314 return m_currentCharacter16;	314 UChar* bufferPtr = buffer.get();

	315

	316 m_cssStrings16.append(buffer.release());

	317 return bufferPtr;

315 }	318 }

316	319

317 template <>	320 template <>

318 inline LChar* CSSTokenizer::dataStart<LChar>()	321 inline LChar* CSSTokenizer::dataStart<LChar>()

319 {	322 {

320 return m_dataStart8.get();	323 return m_dataStart8.get();

321 }	324 }

322	325

323 template <>	326 template <>

324 inline UChar* CSSTokenizer::dataStart<UChar>()	327 inline UChar* CSSTokenizer::dataStart<UChar>()

(...skipping 80 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
405 if (unicode > 0x10ffff)	408 if (unicode > 0x10ffff)

406 unicode = 0xfffd;	409 unicode = 0xfffd;

407	410

408 // Optional space after the escape sequence.	411 // Optional space after the escape sequence.

409 if (isHTMLSpace<CharacterType>(*src))	412 if (isHTMLSpace<CharacterType>(*src))

410 ++src;	413 ++src;

411	414

412 return unicode;	415 return unicode;

413 }	416 }

414	417

415 return *currentCharacter<CharacterType>()++;	418 return *src++;

416 }	419 }

417	420

418 template <>	421 template <>

419 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode)	422 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode)

420 {	423 {

421 ASSERT(unicode <= 0xff);	424 ASSERT(unicode <= 0xff);

422 *result = unicode;	425 *result = unicode;

423	426

424 ++result;	427 ++result;

425 }	428 }

426	429

427 template <>	430 template <>

428 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode)	431 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode)

429 {	432 {

430 // Replace unicode with a surrogate pairs when it is bigger than 0xffff	433 // Replace unicode with a surrogate pairs when it is bigger than 0xffff

431 if (U16_LENGTH(unicode) == 2) {	434 if (U16_LENGTH(unicode) == 2) {

432 *result++ = U16_LEAD(unicode);	435 *result++ = U16_LEAD(unicode);

433 *result = U16_TRAIL(unicode);	436 *result = U16_TRAIL(unicode);

434 } else {	437 } else {

435 *result = unicode;	438 *result = unicode;

436 }	439 }

437	440

438 ++result;	441 ++result;

439 }	442 }

440	443

	444 template <typename SrcCharacterType>

	445 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)

	446 {

	447 // The decoded form of an identifier (after resolving escape

	448 // sequences) will not contain more characters (ASCII or UTF-16

	449 // codepoints) than the input. This code can therefore ignore

	450 // escape sequences completely.

	451 SrcCharacterType* start = src;

	452 do {

	453 if (LIKELY(*src != '\\'))

	454 src++;

	455 else

	456 parseEscape<SrcCharacterType>(src);

	457 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));

	458

	459 return src - start;

	460 }

	461

441 template <typename SrcCharacterType, typename DestCharacterType>	462 template <typename SrcCharacterType, typename DestCharacterType>

442 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)	463 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)

443 {	464 {

444 hasEscape = false;	465 hasEscape = false;

445 do {	466 do {

446 if (LIKELY(*src != '\\')) {	467 if (LIKELY(*src != '\\')) {

447 *result++ = *src++;	468 *result++ = *src++;

448 } else {	469 } else {

449 hasEscape = true;	470 hasEscape = true;

450 SrcCharacterType* savedEscapeStart = src;	471 SrcCharacterType* savedEscapeStart = src;

(...skipping 13 matching lines...) Expand all Loading...
464 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape)	485 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserString& resultString, bool& hasEscape)

465 {	486 {

466 // If a valid identifier start is found, we can safely	487 // If a valid identifier start is found, we can safely

467 // parse the identifier until the next invalid character.	488 // parse the identifier until the next invalid character.

468 ASSERT(isIdentifierStart<CharacterType>());	489 ASSERT(isIdentifierStart<CharacterType>());

469	490

470 CharacterType* start = currentCharacter<CharacterType>();	491 CharacterType* start = currentCharacter<CharacterType>();

471 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {	492 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), result, hasEscape))) {

472 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue	493 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue

473 ASSERT(is8BitSource());	494 ASSERT(is8BitSource());

474 UChar*& result16 = currentCharacter16();	495 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(result));

475 UChar* start16 = result16;	496 UChar* start16 = result16;

476 int i = 0;	497 int i = 0;

477 for (; i < result - start; i++)	498 for (; i < result - start; i++)

478 result16[i] = start[i];	499 result16[i] = start[i];

479	500

480 result16 += i;	501 result16 += i;

481	502

482 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);	503 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);

483	504

484 resultString.init(start16, result16 - start16);	505 resultString.init(start16, result16 - start16);

485	506

486 return;	507 return;

487 }	508 }

488	509

489 resultString.init(start, result - start);	510 resultString.init(start, result - start);

490 }	511 }

491	512

	513 template <typename SrcCharacterType>

	514 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)

	515 {

	516 // The decoded form of a CSS string (after resolving escape

	517 // sequences) will not contain more characters (ASCII or UTF-16

	518 // codepoints) than the input. This code can therefore ignore

	519 // escape sequences completely and just return the length of the

	520 // input string (possibly including terminating quote if any).

	521 SrcCharacterType* end = checkAndSkipString(src, quote);

	522 return end ? end - src : 0;

	523 }

	524

492 template <typename SrcCharacterType, typename DestCharacterType>	525 template <typename SrcCharacterType, typename DestCharacterType>

493 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)	526 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)

494 {	527 {

495 while (true) {	528 while (true) {

496 if (UNLIKELY(*src == quote)) {	529 if (UNLIKELY(*src == quote)) {

497 // String parsing is done.	530 // String parsing is done.

498 ++src;	531 ++src;

499 return true;	532 return true;

500 }	533 }

501 if (UNLIKELY(!*src)) {	534 if (UNLIKELY(!*src)) {

(...skipping 23 matching lines...) Expand all Loading...
525 }	558 }

526	559

527 template <typename CharacterType>	560 template <typename CharacterType>

528 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote)	561 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& resultString, UChar quote)

529 {	562 {

530 CharacterType* start = currentCharacter<CharacterType>();	563 CharacterType* start = currentCharacter<CharacterType>();

531	564

532 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {	565 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {

533 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue	566 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue

534 ASSERT(is8BitSource());	567 ASSERT(is8BitSource());

535 UChar*& result16 = currentCharacter16();	568 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(result, quote));

536 UChar* start16 = result16;	569 UChar* start16 = result16;

537 int i = 0;	570 int i = 0;

538 for (; i < result - start; i++)	571 for (; i < result - start; i++)

539 result16[i] = start[i];	572 result16[i] = start[i];

540	573

541 result16 += i;	574 result16 += i;

542	575

543 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);	576 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);

544	577

545 resultString.init(start16, result16 - start16);	578 resultString.init(start16, result16 - start16);

(...skipping 27 matching lines...) Expand all Loading...
573 }	606 }

574 }	607 }

575	608

576 end = skipWhiteSpace(end);	609 end = skipWhiteSpace(end);

577 if (*end != ')')	610 if (*end != ')')

578 return false;	611 return false;

579	612

580 return true;	613 return true;

581 }	614 }

582	615

	616 template <typename SrcCharacterType>

	617 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)

	618 {

	619 // The decoded form of a URI (after resolving escape sequences)

	620 // will not contain more characters (ASCII or UTF-16 codepoints)

	621 // than the input. This code can therefore ignore escape sequences

	622 // completely.

	623 SrcCharacterType* start = src;

	624 if (quote) {

	625 ASSERT(quote == '"' || quote == '\'');

	626 return peekMaxStringLen(src, quote);

	627 }

	628

	629 while (isURILetter(*src)) {

	630 if (LIKELY(*src != '\\'))

	631 src++;

	632 else

	633 parseEscape<SrcCharacterType>(src);

	634 }

	635

	636 return src - start;

	637 }

	638

583 template <typename SrcCharacterType, typename DestCharacterType>	639 template <typename SrcCharacterType, typename DestCharacterType>

584 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)	640 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)

585 {	641 {

586 if (quote) {	642 if (quote) {

587 ASSERT(quote == '"' || quote == '\'');	643 ASSERT(quote == '"' || quote == '\'');

588 return parseStringInternal(src, dest, quote);	644 return parseStringInternal(src, dest, quote);

589 }	645 }

590	646

591 while (isURILetter(*src)) {	647 while (isURILetter(*src)) {

592 if (LIKELY(*src != '\\')) {	648 if (LIKELY(*src != '\\')) {

593 *dest++ = *src++;	649 *dest++ = *src++;

594 } else {	650 } else {

595 unsigned unicode = parseEscape<SrcCharacterType>(src);	651 unsigned unicode = parseEscape<SrcCharacterType>(src);

596 if (unicode > 0xff && sizeof(SrcCharacterType) == 1)	652 if (unicode > 0xff && sizeof(DestCharacterType) == 1)

597 return false;	653 return false;

598 UnicodeToChars(dest, unicode);	654 UnicodeToChars(dest, unicode);

599 }	655 }

600 }	656 }

601	657

602 return true;	658 return true;

603 }	659 }

604	660

605 template <typename CharacterType>	661 template <typename CharacterType>

606 inline void CSSTokenizer::parseURI(CSSParserString& string)	662 inline void CSSTokenizer::parseURI(CSSParserString& string)

607 {	663 {

608 CharacterType* uriStart;	664 CharacterType* uriStart;

609 CharacterType* uriEnd;	665 CharacterType* uriEnd;

610 UChar quote;	666 UChar quote;

611 if (!findURI(uriStart, uriEnd, quote))	667 if (!findURI(uriStart, uriEnd, quote))

612 return;	668 return;

613	669

614 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;	670 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;

615 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) {	671 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote))) {

616 string.init(uriStart, dest - uriStart);	672 string.init(uriStart, dest - uriStart);

617 } else {	673 } else {

618 // An escape sequence was encountered that can't be stored in 8 bits.	674 // An escape sequence was encountered that can't be stored in 8 bits.

619 // Reset the current character to the start of the URI and re-parse with	675 // Reset the current character to the start of the URI and re-parse with

620 // a 16-bit destination.	676 // a 16-bit destination.

621 ASSERT(is8BitSource());	677 ASSERT(is8BitSource());

622 UChar* uriStart16 = currentCharacter16();	678 UChar* result16 = allocateStringBuffer16(peekMaxURILen(uriStart, quote));

	679 UChar* uriStart16 = result16;

623 currentCharacter<CharacterType>() = uriStart;	680 currentCharacter<CharacterType>() = uriStart;

624 bool result = parseURIInternal(currentCharacter<CharacterType>(), currentCharacter16(), quote);	681 bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);

625 ASSERT_UNUSED(result, result);	682 ASSERT_UNUSED(result, result);

626 string.init(uriStart16, currentCharacter16() - uriStart16);	683 string.init(uriStart16, result16 - uriStart16);

627 }	684 }

628	685

629 currentCharacter<CharacterType>() = uriEnd + 1;	686 currentCharacter<CharacterType>() = uriEnd + 1;

630 m_token = URI;	687 m_token = URI;

631 }	688 }

632	689

633 template <typename CharacterType>	690 template <typename CharacterType>

634 inline bool CSSTokenizer::parseUnicodeRange()	691 inline bool CSSTokenizer::parseUnicodeRange()

635 {	692 {

636 CharacterType* character = currentCharacter<CharacterType>() + 1;	693 CharacterType* character = currentCharacter<CharacterType>() + 1;

(...skipping 886 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1523 m_dataStart16[length - 1] = 0;	1580 m_dataStart16[length - 1] = 0;

1524	1581

1525 m_is8BitSource = false;	1582 m_is8BitSource = false;

1526 m_currentCharacter8 = 0;	1583 m_currentCharacter8 = 0;

1527 m_currentCharacter16 = m_dataStart16.get();	1584 m_currentCharacter16 = m_dataStart16.get();

1528 setTokenStart<UChar>(m_currentCharacter16);	1585 setTokenStart<UChar>(m_currentCharacter16);

1529 m_lexFunc = &CSSTokenizer::realLex<UChar>;	1586 m_lexFunc = &CSSTokenizer::realLex<UChar>;

1530 }	1587 }

1531	1588

1532 } // namespace WebCore	1589 } // namespace WebCore

OLD	NEW

« no previous file with comments | « Source/core/css/CSSTokenizer.h ('k') | no next file » | no next file with comments »