Source/core/css/CSSTokenizer-in.cpp - Issue 196353018: Smaller CSSParser UTF16 buffers for escaped strings.

Side by Side Diff: Source/core/css/CSSTokenizer-in.cpp

Issue 196353018: Smaller CSSParser UTF16 buffers for escaped strings. (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master

Patch Set: More comments. Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)	2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)

3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)	3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)

4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.	4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.

5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>	5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>

6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>	6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>

7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmo bile.com/)	7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmo bile.com/)

8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.	8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.

9 * Copyright (C) 2012 Intel Corporation. All rights reserved.	9 * Copyright (C) 2012 Intel Corporation. All rights reserved.

10 *	10 *

(...skipping 286 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
297 {	297 {

298 return m_currentCharacter8;	298 return m_currentCharacter8;

299 }	299 }

300	300

301 template <>	301 template <>

302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()	302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()

303 {	303 {

304 return m_currentCharacter16;	304 return m_currentCharacter16;

305 }	305 }

306	306

307 UChar*& CSSTokenizer::currentCharacter16()	307 UChar* CSSTokenizer::allocateStringBuffer16(size_t len)

308 {	308 {

309 if (!m_currentCharacter16) {	309 // Allocates and returns a CSSTokenizer owned buffer for storing

310 m_dataStart16 = adoptArrayPtr(new UChar[m_length]);	310 // UTF-16 data. Used to get a suitable life span for UTF-16

311 m_currentCharacter16 = m_dataStart16.get();	311 // strings, identifiers and URIs created by the tokenizer.

312 }	312 OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);

313	313

314 return m_currentCharacter16;	314 UChar* bufferPtr = buffer.get();

	315

	316 m_cssStrings16.append(buffer.release());

	317 return bufferPtr;

315 }	318 }

316	319

317 template <>	320 template <>

318 inline LChar* CSSTokenizer::dataStart<LChar>()	321 inline LChar* CSSTokenizer::dataStart<LChar>()

319 {	322 {

320 return m_dataStart8.get();	323 return m_dataStart8.get();

321 }	324 }

322	325

323 template <>	326 template <>

324 inline UChar* CSSTokenizer::dataStart<UChar>()	327 inline UChar* CSSTokenizer::dataStart<UChar>()

(...skipping 80 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
405 if (unicode > 0x10ffff)	408 if (unicode > 0x10ffff)

406 unicode = 0xfffd;	409 unicode = 0xfffd;

407	410

408 // Optional space after the escape sequence.	411 // Optional space after the escape sequence.

409 if (isHTMLSpace<CharacterType>(*src))	412 if (isHTMLSpace<CharacterType>(*src))

410 ++src;	413 ++src;

411	414

412 return unicode;	415 return unicode;

413 }	416 }

414	417

415 return *currentCharacter<CharacterType>()++;	418 return *src++;

416 }	419 }

417	420

418 template <>	421 template <>

419 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode )	422 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode )

420 {	423 {

421 ASSERT(unicode <= 0xff);	424 ASSERT(unicode <= 0xff);

422 *result = unicode;	425 *result = unicode;

423	426

424 ++result;	427 ++result;

425 }	428 }

426	429

427 template <>	430 template <>

428 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode )	431 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode )

429 {	432 {

430 // Replace unicode with a surrogate pairs when it is bigger than 0xffff	433 // Replace unicode with a surrogate pairs when it is bigger than 0xffff

431 if (U16_LENGTH(unicode) == 2) {	434 if (U16_LENGTH(unicode) == 2) {

432 *result++ = U16_LEAD(unicode);	435 *result++ = U16_LEAD(unicode);

433 *result = U16_TRAIL(unicode);	436 *result = U16_TRAIL(unicode);

434 } else {	437 } else {

435 *result = unicode;	438 *result = unicode;

436 }	439 }

437	440

438 ++result;	441 ++result;

439 }	442 }

440	443

	444 template <typename SrcCharacterType>

	445 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)

	446 {

	447 // The decoded form of an identifier (after resolving escape

	448 // sequences) will not contain more characters (ASCII or UTF-16

	449 // codepoints) than the input. This code can therefore ignore

	450 // escape sequences completely.

	451 SrcCharacterType* start = src;

	452 do {

	453 if (LIKELY(*src != '\\'))

	454 src++;

	455 else

	456 parseEscape<SrcCharacterType>(src);

	457 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));

	458

	459 return src - start;

	460 }

	461

441 template <typename SrcCharacterType, typename DestCharacterType>	462 template <typename SrcCharacterType, typename DestCharacterType>

442 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)	463 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)

443 {	464 {

444 hasEscape = false;	465 hasEscape = false;

445 do {	466 do {

446 if (LIKELY(*src != '\\')) {	467 if (LIKELY(*src != '\\')) {

447 *result++ = *src++;	468 *result++ = *src++;

448 } else {	469 } else {

449 hasEscape = true;	470 hasEscape = true;

450 SrcCharacterType* savedEscapeStart = src;	471 SrcCharacterType* savedEscapeStart = src;

(...skipping 13 matching lines...) Expand all Loading...
464 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin g& resultString, bool& hasEscape)	485 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin g& resultString, bool& hasEscape)

465 {	486 {

466 // If a valid identifier start is found, we can safely	487 // If a valid identifier start is found, we can safely

467 // parse the identifier until the next invalid character.	488 // parse the identifier until the next invalid character.

468 ASSERT(isIdentifierStart<CharacterType>());	489 ASSERT(isIdentifierStart<CharacterType>());

469	490

470 CharacterType* start = currentCharacter<CharacterType>();	491 CharacterType* start = currentCharacter<CharacterType>();

471 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), res ult, hasEscape))) {	492 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), res ult, hasEscape))) {

472 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue	493 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue

473 ASSERT(is8BitSource());	494 ASSERT(is8BitSource());

474 UChar*& result16 = currentCharacter16();	495 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxIdentifierLen(result));

475 UChar* start16 = result16;	496 UChar* start16 = result16;

476 int i = 0;	497 int i = 0;

477 for (; i < result - start; i++)	498 for (; i < result - start; i++)

478 result16[i] = start[i];	499 result16[i] = start[i];

479	500

480 result16 += i;	501 result16 += i;

481	502

482 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);	503 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, hasEscape);

483	504

484 resultString.init(start16, result16 - start16);	505 resultString.init(start16, result16 - start16);

485	506

486 return;	507 return;

487 }	508 }

488	509

489 resultString.init(start, result - start);	510 resultString.init(start, result - start);

490 }	511 }

491	512

	513 template <typename SrcCharacterType>

	514 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)

	515 {

	516 // The decoded form of a CSS string (after resolving escape

	517 // sequences) will not contain more characters (ASCII or UTF-16

	518 // codepoints) than the input. This code can therefore ignore

	519 // escape sequences completely.

	520 SrcCharacterType* start = src;

	521 while (true) {

	522 if (UNLIKELY(*src == quote)) {

	523 // String parsing is done.

	524 ++src;

	525 break;

	526 }

	527 if (UNLIKELY(!*src)) {

	528 // String parsing is done, but don't advance pointer if at the end of input.

	529 break;

	530 }

	531 ASSERT(*src > '\r' || (*src < '\n' && *src) || *src == '\v');

	532

	533 if (LIKELY(src[0] != '\\'))

	534 src++;

	535 else if (src[1] == '\n' \|\| src[1] == '\f')

	536 src += 2;

	537 else if (src[1] == '\r')

	538 src += src[2] == '\n' ? 3 : 2;

	539 else

	540 parseEscape<SrcCharacterType>(src);
	Julien - ping for review (2014/03/21 17:56:27): I really think this inner loop should be replaced by a call to checkAndSkipString as it seems to do the exact same checks. Daniel Bratell (2014/03/31 15:07:16), replying to Julien Chaffraix's comment above: Done! Good. It does feel like there is a bit too much duplicated code right now (and even more duplicated machine code since these are templates with multiple invocations).
	541 }

	542

	543 return src - start;

	544 }

	545

492 template <typename SrcCharacterType, typename DestCharacterType>	546 template <typename SrcCharacterType, typename DestCharacterType>

493 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)	547 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)

494 {	548 {

495 while (true) {	549 while (true) {

496 if (UNLIKELY(*src == quote)) {	550 if (UNLIKELY(*src == quote)) {

497 // String parsing is done.	551 // String parsing is done.

498 ++src;	552 ++src;

499 return true;	553 return true;

500 }	554 }

501 if (UNLIKELY(!*src)) {	555 if (UNLIKELY(!*src)) {

(...skipping 23 matching lines...) Expand all Loading...
525 }	579 }

526	580

527 template <typename CharacterType>	581 template <typename CharacterType>

528 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r esultString, UChar quote)	582 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r esultString, UChar quote)

529 {	583 {

530 CharacterType* start = currentCharacter<CharacterType>();	584 CharacterType* start = currentCharacter<CharacterType>();

531	585

532 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {	586 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {

533 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue	587 // Found an escape we couldn't handle with 8 bits, copy what has been recognized and continue

534 ASSERT(is8BitSource());	588 ASSERT(is8BitSource());

535 UChar*& result16 = currentCharacter16();	589 UChar* result16 = allocateStringBuffer16((result - start) + peekMaxStringLen(result, quote));

536 UChar* start16 = result16;	590 UChar* start16 = result16;

537 int i = 0;	591 int i = 0;

538 for (; i < result - start; i++)	592 for (; i < result - start; i++)

539 result16[i] = start[i];	593 result16[i] = start[i];

540	594

541 result16 += i;	595 result16 += i;

542	596

543 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);	597 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);

544	598

545 resultString.init(start16, result16 - start16);	599 resultString.init(start16, result16 - start16);

(...skipping 27 matching lines...) Expand all Loading...
573 }	627 }

574 }	628 }

575	629

576 end = skipWhiteSpace(end);	630 end = skipWhiteSpace(end);

577 if (*end != ')')	631 if (*end != ')')

578 return false;	632 return false;

579	633

580 return true;	634 return true;

581 }	635 }

582	636

	637 template <typename SrcCharacterType>

	638 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)

	639 {

	640 // The decoded form of a URI (after resolving escape sequences)

	641 // will not contain more characters (ASCII or UTF-16 codepoints)

	642 // than the input. This code can therefore ignore escape sequences

	643 // completely.

	644 SrcCharacterType* start = src;

	645 if (quote) {

	646 ASSERT(quote == '"' || quote == '\'');

	647 return peekMaxStringLen(src, quote);

	648 }

	649

	650 while (isURILetter(*src)) {

	651 if (LIKELY(*src != '\\'))

	652 src++;

	653 else

	654 parseEscape<SrcCharacterType>(src);

	655 }

	656

	657 return src - start;

	658 }

	659

583 template <typename SrcCharacterType, typename DestCharacterType>	660 template <typename SrcCharacterType, typename DestCharacterType>

584 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)	661 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)

585 {	662 {

586 if (quote) {	663 if (quote) {

587 ASSERT(quote == '"' || quote == '\'');	664 ASSERT(quote == '"' || quote == '\'');

588 return parseStringInternal(src, dest, quote);	665 return parseStringInternal(src, dest, quote);

589 }	666 }

590	667

591 while (isURILetter(*src)) {	668 while (isURILetter(*src)) {

592 if (LIKELY(*src != '\\')) {	669 if (LIKELY(*src != '\\')) {

593 *dest++ = *src++;	670 *dest++ = *src++;

594 } else {	671 } else {

595 unsigned unicode = parseEscape<SrcCharacterType>(src);	672 unsigned unicode = parseEscape<SrcCharacterType>(src);

596 if (unicode > 0xff && sizeof(SrcCharacterType) == 1)	673 if (unicode > 0xff && sizeof(DestCharacterType) == 1)

597 return false;	674 return false;

598 UnicodeToChars(dest, unicode);	675 UnicodeToChars(dest, unicode);

599 }	676 }

600 }	677 }

601	678

602 return true;	679 return true;

603 }	680 }

604	681

605 template <typename CharacterType>	682 template <typename CharacterType>

606 inline void CSSTokenizer::parseURI(CSSParserString& string)	683 inline void CSSTokenizer::parseURI(CSSParserString& string)

607 {	684 {

608 CharacterType* uriStart;	685 CharacterType* uriStart;

609 CharacterType* uriEnd;	686 CharacterType* uriEnd;

610 UChar quote;	687 UChar quote;

611 if (!findURI(uriStart, uriEnd, quote))	688 if (!findURI(uriStart, uriEnd, quote))

612 return;	689 return;

613	690

614 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;	691 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;

615 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote)) ) {	692 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote)) ) {

616 string.init(uriStart, dest - uriStart);	693 string.init(uriStart, dest - uriStart);

617 } else {	694 } else {

618 // An escape sequence was encountered that can't be stored in 8 bits.	695 // An escape sequence was encountered that can't be stored in 8 bits.

619 // Reset the current character to the start of the URI and re-parse with	696 // Reset the current character to the start of the URI and re-parse with

620 // a 16-bit destination.	697 // a 16-bit destination.

621 ASSERT(is8BitSource());	698 ASSERT(is8BitSource());

622 UChar* uriStart16 = currentCharacter16();	699 UChar* result16 = allocateStringBuffer16(peekMaxURILen(uriStart, quote));

	700 UChar* uriStart16 = result16;

623 currentCharacter<CharacterType>() = uriStart;	701 currentCharacter<CharacterType>() = uriStart;

624 bool result = parseURIInternal(currentCharacter<CharacterType>(), currentCharacter16(), quote);	702 bool result = parseURIInternal(currentCharacter<CharacterType>(), result16, quote);

625 ASSERT_UNUSED(result, result);	703 ASSERT_UNUSED(result, result);

626 string.init(uriStart16, currentCharacter16() - uriStart16);	704 string.init(uriStart16, result16 - uriStart16);

627 }	705 }

628	706

629 currentCharacter<CharacterType>() = uriEnd + 1;	707 currentCharacter<CharacterType>() = uriEnd + 1;

630 m_token = URI;	708 m_token = URI;

631 }	709 }

632	710

633 template <typename CharacterType>	711 template <typename CharacterType>

634 inline bool CSSTokenizer::parseUnicodeRange()	712 inline bool CSSTokenizer::parseUnicodeRange()

635 {	713 {

636 CharacterType* character = currentCharacter<CharacterType>() + 1;	714 CharacterType* character = currentCharacter<CharacterType>() + 1;

(...skipping 889 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1526 m_dataStart16[length - 1] = 0;	1604 m_dataStart16[length - 1] = 0;

1527	1605

1528 m_is8BitSource = false;	1606 m_is8BitSource = false;

1529 m_currentCharacter8 = 0;	1607 m_currentCharacter8 = 0;

1530 m_currentCharacter16 = m_dataStart16.get();	1608 m_currentCharacter16 = m_dataStart16.get();

1531 setTokenStart<UChar>(m_currentCharacter16);	1609 setTokenStart<UChar>(m_currentCharacter16);

1532 m_lexFunc = &CSSTokenizer::realLex<UChar>;	1610 m_lexFunc = &CSSTokenizer::realLex<UChar>;

1533 }	1611 }

1534	1612

1535 } // namespace WebCore	1613 } // namespace WebCore

OLD	NEW

« no previous file with comments | « Source/core/css/CSSTokenizer.h ('k') | no next file » | no next file with comments »