Source/core/css/CSSTokenizer-in.cpp - Issue 196353018: Smaller CSSParser UTF16 buffers for escaped strings.

Side by Side Diff: Source/core/css/CSSTokenizer-in.cpp

Issue 196353018: Smaller CSSParser UTF16 buffers for escaped strings. (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master

Patch Set: Rewrote comments. Created 6 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 /*	1 /*

2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)	2 * Copyright (C) 2003 Lars Knoll (knoll@kde.org)

3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)	3 * Copyright (C) 2005 Allan Sandfeld Jensen (kde@carewolf.com)

4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.	4 * Copyright (C) 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 Apple Inc. All rights reserved.

5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>	5 * Copyright (C) 2007 Nicholas Shanks <webkit@nickshanks.com>

6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>	6 * Copyright (C) 2008 Eric Seidel <eric@webkit.org>

7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)	7 * Copyright (C) 2009 Torch Mobile Inc. All rights reserved. (http://www.torchmobile.com/)

8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.	8 * Copyright (C) 2012 Adobe Systems Incorporated. All rights reserved.

9 * Copyright (C) 2012 Intel Corporation. All rights reserved.	9 * Copyright (C) 2012 Intel Corporation. All rights reserved.

10 *	10 *

(...skipping 286 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
297 {	297 {

298 return m_currentCharacter8;	298 return m_currentCharacter8;

299 }	299 }

300	300

301 template <>	301 template <>

302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()	302 inline UChar*& CSSTokenizer::currentCharacter<UChar>()

303 {	303 {

304 return m_currentCharacter16;	304 return m_currentCharacter16;

305 }	305 }

306	306

307 UChar*& CSSTokenizer::currentCharacter16()	307 UChar* CSSTokenizer::getStringBuffer16(size_t len)
	Julien - ping for review 2014/03/20 20:57:39 We usually don't put the word "get" on getters as it's redundant. Here we could write allocateStringBuffer16 as it would better match the intent. Maybe we could mention somehow that the string is owned by the tokenizer in the name too but we don't want to make the name too long. Daniel Bratell 2014/03/21 15:14:39 On 2014/03/20 20:57:39, Julien Chaffraix - PST wrote: > We usually don't put the word "get" on getters as it's redundant. Here we could > write allocateStringBuffer16 as it would better match the intent. > > Maybe we could mention somehow that the string is owned by the tokenizer in the > name too but we don't want to make the name too long. Done.
308 {	308 {

309 if (!m_currentCharacter16) {	309 OwnPtr<UChar[]> buffer = adoptArrayPtr(new UChar[len]);

310 m_dataStart16 = adoptArrayPtr(new UChar[m_length]);

311 m_currentCharacter16 = m_dataStart16.get();

312 }

313	310

314 return m_currentCharacter16;	311 UChar* bufferPtr = buffer.get();

	312

	313 m_cssStrings16.append(buffer.release());

	314 return bufferPtr;

315 }	315 }

316	316

317 template <>	317 template <>

318 inline LChar* CSSTokenizer::dataStart<LChar>()	318 inline LChar* CSSTokenizer::dataStart<LChar>()

319 {	319 {

320 return m_dataStart8.get();	320 return m_dataStart8.get();

321 }	321 }

322	322

323 template <>	323 template <>

324 inline UChar* CSSTokenizer::dataStart<UChar>()	324 inline UChar* CSSTokenizer::dataStart<UChar>()

(...skipping 54 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
379 currentCharacter += currentCharacter[2] == '\n' ? 3 : 2;	379 currentCharacter += currentCharacter[2] == '\n' ? 3 : 2;

380 } else {	380 } else {

381 currentCharacter = checkAndSkipEscape(currentCharacter);	381 currentCharacter = checkAndSkipEscape(currentCharacter);

382 if (!currentCharacter)	382 if (!currentCharacter)

383 return 0;	383 return 0;

384 }	384 }

385 }	385 }

386 }	386 }

387	387

388 template <typename CharacterType>	388 template <typename CharacterType>

389 unsigned CSSTokenizer::parseEscape(CharacterType*& src)	389 unsigned CSSTokenizer::parseEscape(CharacterType*& src)
	Julien - ping for review 2014/03/20 20:57:39 This function should now have static linkage. Daniel Bratell 2014/03/21 15:14:39 On 2014/03/20 20:57:39, Julien Chaffraix - PST wrote: > This function should now have static linkage. Hmm, not sure I understand. A function can have internal or external linkage. This has external linkage since it's part of a class. I did make it a static method in the class in this patch though, mostly to prevent it from modifying the tokenizer state by accident which it used to do ("src" and "currentCharacter()" happened to be the same pointer so it "worked(tm)".) I think it's doing fine as a static member function, but I can make it a file level internal function though if you think that matches the style better. Julien - ping for review 2014/03/21 17:56:26 On 2014/03/21 15:14:39, Daniel Bratell wrote: > On 2014/03/20 20:57:39, Julien Chaffraix - PST wrote: > > This function should now have static linkage. > > Hmm, not sure I understand. A function can have internal or external linkage. > This has external linkage since it's part of a class. > > I did make it a static method in the class in this patch though, mostly to > prevent it from modifying the tokenizer state by accident which it used to do > ("src" and "currentCharacter()" happened to be the same pointer so it > "worked(tm)".) > > I think it's doing fine as a static member function, but I can make it a file > level internal function though if you think that matches the style better. I was talking about file level internal function as you pointed out. All in all, it's not really blocking and it works either way.
390 {	390 {

391 ASSERT(*src == '\\' && isCSSEscape(src[1]));	391 ASSERT(*src == '\\' && isCSSEscape(src[1]));

392	392

393 unsigned unicode = 0;	393 unsigned unicode = 0;

394	394

395 ++src;	395 ++src;

396 if (isASCIIHexDigit(*src)) {	396 if (isASCIIHexDigit(*src)) {

397	397

398 int length = 6;	398 int length = 6;

399	399

400 do {	400 do {

401 unicode = (unicode << 4) + toASCIIHexValue(*src++);	401 unicode = (unicode << 4) + toASCIIHexValue(*src++);

402 } while (--length && isASCIIHexDigit(*src));	402 } while (--length && isASCIIHexDigit(*src));

403	403

404 // Characters above 0x10ffff are not handled.	404 // Characters above 0x10ffff are not handled.

405 if (unicode > 0x10ffff)	405 if (unicode > 0x10ffff)

406 unicode = 0xfffd;	406 unicode = 0xfffd;

407	407

408 // Optional space after the escape sequence.	408 // Optional space after the escape sequence.

409 if (isHTMLSpace<CharacterType>(*src))	409 if (isHTMLSpace<CharacterType>(*src))

410 ++src;	410 ++src;

411	411

412 return unicode;	412 return unicode;

413 }	413 }

414	414

415 return *currentCharacter<CharacterType>()++;	415 return *src++;

416 }	416 }

417	417

418 template <>	418 template <>

419 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode )	419 inline void CSSTokenizer::UnicodeToChars<LChar>(LChar*& result, unsigned unicode )

420 {	420 {

421 ASSERT(unicode <= 0xff);	421 ASSERT(unicode <= 0xff);

422 *result = unicode;	422 *result = unicode;

423	423

424 ++result;	424 ++result;

425 }	425 }

426	426

427 template <>	427 template <>

428 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode )	428 inline void CSSTokenizer::UnicodeToChars<UChar>(UChar*& result, unsigned unicode )

429 {	429 {

430 // Replace unicode with a surrogate pairs when it is bigger than 0xffff	430 // Replace unicode with a surrogate pairs when it is bigger than 0xffff

431 if (U16_LENGTH(unicode) == 2) {	431 if (U16_LENGTH(unicode) == 2) {

432 *result++ = U16_LEAD(unicode);	432 *result++ = U16_LEAD(unicode);

433 *result = U16_TRAIL(unicode);	433 *result = U16_TRAIL(unicode);

434 } else {	434 } else {

435 *result = unicode;	435 *result = unicode;

436 }	436 }

437	437

438 ++result;	438 ++result;

439 }	439 }

440	440

	441 template <typename SrcCharacterType>

	442 size_t CSSTokenizer::peekMaxIdentifierLen(SrcCharacterType* src)

	443 {

	444 // The decoded form of an identifier (after resolving escape

	445 // sequences) will not contain more characters (ASCII or UTF-16

	446 // codepoints) than the input. This code can therefore ignore

	447 // escape sequences completely.

	448 SrcCharacterType* start = src;

	449 do {

	450 if (LIKELY(*src != '\\'))

	451 src++;

	452 else

	453 parseEscape<SrcCharacterType>(src);

	454 } while (isCSSLetter(src[0]) || (src[0] == '\\' && isCSSEscape(src[1])));

	455

	456 return src - start;

	457 }

	458

441 template <typename SrcCharacterType, typename DestCharacterType>	459 template <typename SrcCharacterType, typename DestCharacterType>

442 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)	460 inline bool CSSTokenizer::parseIdentifierInternal(SrcCharacterType*& src, DestCharacterType*& result, bool& hasEscape)

443 {	461 {

444 hasEscape = false;	462 hasEscape = false;

445 do {	463 do {

446 if (LIKELY(*src != '\\')) {	464 if (LIKELY(*src != '\\')) {

447 *result++ = *src++;	465 *result++ = *src++;

448 } else {	466 } else {

449 hasEscape = true;	467 hasEscape = true;

450 SrcCharacterType* savedEscapeStart = src;	468 SrcCharacterType* savedEscapeStart = src;

(...skipping 13 matching lines...) Expand all Loading...
464 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin g& resultString, bool& hasEscape)	482 inline void CSSTokenizer::parseIdentifier(CharacterType*& result, CSSParserStrin g& resultString, bool& hasEscape)

465 {	483 {

466 // If a valid identifier start is found, we can safely	484 // If a valid identifier start is found, we can safely

467 // parse the identifier until the next invalid character.	485 // parse the identifier until the next invalid character.

468 ASSERT(isIdentifierStart<CharacterType>());	486 ASSERT(isIdentifierStart<CharacterType>());

469	487

470 CharacterType* start = currentCharacter<CharacterType>();	488 CharacterType* start = currentCharacter<CharacterType>();

471 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), res ult, hasEscape))) {	489 if (UNLIKELY(!parseIdentifierInternal(currentCharacter<CharacterType>(), res ult, hasEscape))) {

472 // Found an escape we couldn't handle with 8 bits, copy what has been re cognized and continue	490 // Found an escape we couldn't handle with 8 bits, copy what has been re cognized and continue

473 ASSERT(is8BitSource());	491 ASSERT(is8BitSource());

474 UChar*& result16 = currentCharacter16();	492 UChar* result16 = getStringBuffer16((result - start) + peekMaxIdentifier Len(result));

475 UChar* start16 = result16;	493 UChar* start16 = result16;

476 int i = 0;	494 int i = 0;

477 for (; i < result - start; i++)	495 for (; i < result - start; i++)

478 result16[i] = start[i];	496 result16[i] = start[i];

479	497

480 result16 += i;	498 result16 += i;

481	499

482 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, has Escape);	500 parseIdentifierInternal(currentCharacter<CharacterType>(), result16, has Escape);

483	501

484 resultString.init(start16, result16 - start16);	502 resultString.init(start16, result16 - start16);

485	503

486 return;	504 return;

487 }	505 }

488	506

489 resultString.init(start, result - start);	507 resultString.init(start, result - start);

490 }	508 }

491	509

	510 template <typename SrcCharacterType>

	511 size_t CSSTokenizer::peekMaxStringLen(SrcCharacterType* src, UChar quote)

	512 {

	513 // The decoded form of a CSS string (after resolving escape

	514 // sequences) will not contain more characters (ASCII or UTF-16

	515 // codepoints) than the input. This code can therefore ignore

	516 // escape sequences completely.

	517 SrcCharacterType* start = src;

	518 while (true) {

	519 if (UNLIKELY(*src == quote)) {

	520 // String parsing is done.

	521 ++src;

	522 break;

	523 }

	524 if (UNLIKELY(!*src)) {

	525 // String parsing is done, but don't advance pointer if at the end o f input.

	526 break;

	527 }

	528 ASSERT(*src > '\r' || (*src < '\n' && *src) || *src == '\v');

	529

	530 if (LIKELY(src[0] != '\\'))

	531 src++;

	532 else if (src[1] == '\n' \|\| src[1] == '\f')

	533 src += 2;

	534 else if (src[1] == '\r')

	535 src += src[2] == '\n' ? 3 : 2;
	Julien - ping for review 2014/03/20 20:57:39 This looks awfully like checkAndSkipString, maybe we could share some code?
	536 else

	537 parseEscape<SrcCharacterType>(src);

	538 }

	539

	540 return src - start;

	541 }

	542

492 template <typename SrcCharacterType, typename DestCharacterType>	543 template <typename SrcCharacterType, typename DestCharacterType>

493 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)	544 inline bool CSSTokenizer::parseStringInternal(SrcCharacterType*& src, DestCharacterType*& result, UChar quote)

494 {	545 {

495 while (true) {	546 while (true) {

496 if (UNLIKELY(*src == quote)) {	547 if (UNLIKELY(*src == quote)) {

497 // String parsing is done.	548 // String parsing is done.

498 ++src;	549 ++src;

499 return true;	550 return true;

500 }	551 }

501 if (UNLIKELY(!*src)) {	552 if (UNLIKELY(!*src)) {

(...skipping 23 matching lines...) Expand all Loading...
525 }	576 }

526	577

527 template <typename CharacterType>	578 template <typename CharacterType>

528 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r esultString, UChar quote)	579 inline void CSSTokenizer::parseString(CharacterType*& result, CSSParserString& r esultString, UChar quote)

529 {	580 {

530 CharacterType* start = currentCharacter<CharacterType>();	581 CharacterType* start = currentCharacter<CharacterType>();

531	582

532 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {	583 if (UNLIKELY(!parseStringInternal(currentCharacter<CharacterType>(), result, quote))) {

533 // Found an escape we couldn't handle with 8 bits, copy what has been re cognized and continue	584 // Found an escape we couldn't handle with 8 bits, copy what has been re cognized and continue

534 ASSERT(is8BitSource());	585 ASSERT(is8BitSource());

535 UChar*& result16 = currentCharacter16();	586 UChar* result16 = getStringBuffer16((result - start) + peekMaxStringLen( result, quote));

536 UChar* start16 = result16;	587 UChar* start16 = result16;

537 int i = 0;	588 int i = 0;

538 for (; i < result - start; i++)	589 for (; i < result - start; i++)

539 result16[i] = start[i];	590 result16[i] = start[i];

540	591

541 result16 += i;	592 result16 += i;

542	593

543 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);	594 parseStringInternal(currentCharacter<CharacterType>(), result16, quote);

544	595

545 resultString.init(start16, result16 - start16);	596 resultString.init(start16, result16 - start16);

(...skipping 27 matching lines...) Expand all Loading...
573 }	624 }

574 }	625 }

575	626

576 end = skipWhiteSpace(end);	627 end = skipWhiteSpace(end);

577 if (*end != ')')	628 if (*end != ')')

578 return false;	629 return false;

579	630

580 return true;	631 return true;

581 }	632 }

582	633

	634 template <typename SrcCharacterType>

	635 inline size_t CSSTokenizer::peekMaxURILen(SrcCharacterType* src, UChar quote)

	636 {

	637 // The decoded form of a URI (after resolving escape sequences)

	638 // will not contain more characters (ASCII or UTF-16 codepoints)

	639 // than the input. This code can therefore ignore escape sequences

	640 // completely.

	641 SrcCharacterType* start = src;

	642 if (quote) {

	643 ASSERT(quote == '"' || quote == '\'');

	644 return peekMaxStringLen(src, quote);

	645 }

	646

	647 while (isURILetter(*src)) {

	648 if (LIKELY(*src != '\\'))

	649 src++;

	650 else

	651 parseEscape<SrcCharacterType>(src);

	652 }

	653

	654 return src - start;

	655 }

	656

583 template <typename SrcCharacterType, typename DestCharacterType>	657 template <typename SrcCharacterType, typename DestCharacterType>

584 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)	658 inline bool CSSTokenizer::parseURIInternal(SrcCharacterType*& src, DestCharacterType*& dest, UChar quote)

585 {	659 {

586 if (quote) {	660 if (quote) {

587 ASSERT(quote == '"' || quote == '\'');	661 ASSERT(quote == '"' || quote == '\'');

588 return parseStringInternal(src, dest, quote);	662 return parseStringInternal(src, dest, quote);

589 }	663 }

590	664

591 while (isURILetter(*src)) {	665 while (isURILetter(*src)) {

592 if (LIKELY(*src != '\\')) {	666 if (LIKELY(*src != '\\')) {

593 *dest++ = *src++;	667 *dest++ = *src++;

594 } else {	668 } else {

595 unsigned unicode = parseEscape<SrcCharacterType>(src);	669 unsigned unicode = parseEscape<SrcCharacterType>(src);

596 if (unicode > 0xff && sizeof(SrcCharacterType) == 1)	670 if (unicode > 0xff && sizeof(DestCharacterType) == 1)

597 return false;	671 return false;

598 UnicodeToChars(dest, unicode);	672 UnicodeToChars(dest, unicode);

599 }	673 }

600 }	674 }

601	675

602 return true;	676 return true;

603 }	677 }

604	678

605 template <typename CharacterType>	679 template <typename CharacterType>

606 inline void CSSTokenizer::parseURI(CSSParserString& string)	680 inline void CSSTokenizer::parseURI(CSSParserString& string)

607 {	681 {

608 CharacterType* uriStart;	682 CharacterType* uriStart;

609 CharacterType* uriEnd;	683 CharacterType* uriEnd;

610 UChar quote;	684 UChar quote;

611 if (!findURI(uriStart, uriEnd, quote))	685 if (!findURI(uriStart, uriEnd, quote))

612 return;	686 return;

613	687

614 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;	688 CharacterType* dest = currentCharacter<CharacterType>() = uriStart;

615 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote)) ) {	689 if (LIKELY(parseURIInternal(currentCharacter<CharacterType>(), dest, quote)) ) {

616 string.init(uriStart, dest - uriStart);	690 string.init(uriStart, dest - uriStart);

617 } else {	691 } else {

618 // An escape sequence was encountered that can't be stored in 8 bits.	692 // An escape sequence was encountered that can't be stored in 8 bits.

619 // Reset the current character to the start of the URI and re-parse with	693 // Reset the current character to the start of the URI and re-parse with

620 // a 16-bit destination.	694 // a 16-bit destination.

621 ASSERT(is8BitSource());	695 ASSERT(is8BitSource());

622 UChar* uriStart16 = currentCharacter16();	696 UChar* result16 = getStringBuffer16(peekMaxURILen(uriStart, quote));

	697 UChar* uriStart16 = result16;

623 currentCharacter<CharacterType>() = uriStart;	698 currentCharacter<CharacterType>() = uriStart;

624 bool result = parseURIInternal(currentCharacter<CharacterType>(), curren tCharacter16(), quote);	699 bool result = parseURIInternal(currentCharacter<CharacterType>(), result 16, quote);

625 ASSERT_UNUSED(result, result);	700 ASSERT_UNUSED(result, result);

626 string.init(uriStart16, currentCharacter16() - uriStart16);	701 string.init(uriStart16, result16 - uriStart16);

627 }	702 }

628	703

629 currentCharacter<CharacterType>() = uriEnd + 1;	704 currentCharacter<CharacterType>() = uriEnd + 1;

630 m_token = URI;	705 m_token = URI;

631 }	706 }

632	707

633 template <typename CharacterType>	708 template <typename CharacterType>

634 inline bool CSSTokenizer::parseUnicodeRange()	709 inline bool CSSTokenizer::parseUnicodeRange()

635 {	710 {

636 CharacterType* character = currentCharacter<CharacterType>() + 1;	711 CharacterType* character = currentCharacter<CharacterType>() + 1;

(...skipping 889 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1526 m_dataStart16[length - 1] = 0;	1601 m_dataStart16[length - 1] = 0;

1527	1602

1528 m_is8BitSource = false;	1603 m_is8BitSource = false;

1529 m_currentCharacter8 = 0;	1604 m_currentCharacter8 = 0;

1530 m_currentCharacter16 = m_dataStart16.get();	1605 m_currentCharacter16 = m_dataStart16.get();

1531 setTokenStart<UChar>(m_currentCharacter16);	1606 setTokenStart<UChar>(m_currentCharacter16);

1532 m_lexFunc = &CSSTokenizer::realLex<UChar>;	1607 m_lexFunc = &CSSTokenizer::realLex<UChar>;

1533 }	1608 }

1534	1609

1535 } // namespace WebCore	1610 } // namespace WebCore

OLD	NEW

« Source/core/css/CSSTokenizer.h ('K') | « Source/core/css/CSSTokenizer.h ('k') | no next file » | no next file with comments »