sky/engine/core/html/parser/HTMLEntityParser.cpp - Issue 678073002: Parse Sky entities according to the spec

Unified Diff: sky/engine/core/html/parser/HTMLEntityParser.cpp

Issue 678073002: Parse Sky entities according to the spec (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: sky/engine/core/html/parser/HTMLEntityParser.cpp

diff --git a/sky/engine/core/html/parser/HTMLEntityParser.cpp b/sky/engine/core/html/parser/HTMLEntityParser.cpp

index dc30c83715f4f72ba2b92a598ad4d987b51370c8..6d90746bbe62d43be202489f887aa0f010c27cc7 100644

--- a/sky/engine/core/html/parser/HTMLEntityParser.cpp

+++ b/sky/engine/core/html/parser/HTMLEntityParser.cpp

@@ -1,305 +1,171 @@

-/*

- *

- * Redistribution and use in source and binary forms, with or without

- * modification, are permitted provided that the following conditions

- * are met:

- * 1. Redistributions of source code must retain the above copyright

- * notice, this list of conditions and the following disclaimer.

- * 2. Redistributions in binary form must reproduce the above copyright

- * notice, this list of conditions and the following disclaimer in the

- * documentation and/or other materials provided with the distribution.

- *

- * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY

- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR

- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

- */

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

#include "config.h"

#include "core/html/parser/HTMLEntityParser.h"

-#include "core/html/parser/HTMLEntitySearch.h"

-#include "core/html/parser/HTMLEntityTable.h"

-#include "wtf/text/StringBuilder.h"

+#include "wtf/unicode/CharacterNames.h"

using namespace WTF;

namespace blink {

-static const UChar windowsLatin1ExtensionArray[32] = {

- 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87

- 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F

- 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97

- 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F

-};

-static bool isAlphaNumeric(UChar cc)

-{

- return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');

-}

+static const UChar32 kInvalidUnicode = -1;

-static UChar adjustEntity(UChar32 value)

+static UChar asHexDigit(UChar cc)

{

- if ((value & ~0x1F) != 0x0080)

- return value;

- return windowsLatin1ExtensionArray[value - 0x80];

+ if (cc >= '0' && cc <= '9')

+ return cc - '0';

+ if (cc >= 'a' && cc <= 'f')

+ return 10 + cc - 'a';

+ if (cc >= 'A' && cc <= 'F')

+ return 10 + cc - 'A';

+ ASSERT_NOT_REACHED();

+ return 0;

}

-static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)

+static bool isAlphaNumeric(UChar cc)

{

- // FIXME: A number of specific entity values generate parse errors.

- if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {

- decodedEntity.append(0xFFFD);

- return;

- }

- if (U_IS_BMP(c)) {

- decodedEntity.append(adjustEntity(c));

- return;

- }

- decodedEntity.append(c);

+ return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');

}

-static const UChar32 kInvalidUnicode = -1;

// Reports whether |cc| is an ASCII hexadecimal digit, i.e. one of
// '0'-'9', 'a'-'f' or 'A'-'F'. Any other code unit yields false.
static bool isHexDigit(UChar cc)
{
    if (cc >= '0' && cc <= '9')
        return true;
    if (cc >= 'a' && cc <= 'f')
        return true;
    return cc >= 'A' && cc <= 'F';
}

-static UChar asHexDigit(UChar cc)

+static UChar decodeEntity(HTMLEntityParser::OutputBuffer buffer)

{

- if (cc >= '0' && cc <= '9')

- return cc - '0';

- if (cc >= 'a' && cc <= 'z')

- return 10 + cc - 'a';

- if (cc >= 'A' && cc <= 'Z')

- return 10 + cc - 'A';

- ASSERT_NOT_REACHED();

- return 0;

+ if (equalIgnoringNullity(buffer, "&amp"))

+ return '&';

+ if (equalIgnoringNullity(buffer, "&apos"))

+ return '\'';

+ if (equalIgnoringNullity(buffer, "&gt"))

+ return '>';

+ if (equalIgnoringNullity(buffer, "&lt"))

+ return '<';

+ if (equalIgnoringNullity(buffer, "&quot"))

+ return '"';

+ return replacementCharacter;

}

-typedef Vector<UChar, 64> ConsumedCharacterBuffer;

-static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)

+HTMLEntityParser::HTMLEntityParser()

{

- if (consumedCharacters.size() == 1)

- source.push(consumedCharacters[0]);

- else if (consumedCharacters.size() == 2) {

- source.push(consumedCharacters[0]);

- source.push(consumedCharacters[1]);

- } else

- source.prepend(SegmentedString(String(consumedCharacters)));

}

-static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)

+HTMLEntityParser::~HTMLEntityParser()

{

- ConsumedCharacterBuffer consumedCharacters;

- HTMLEntitySearch entitySearch;

- while (!source.isEmpty()) {

- cc = source.currentChar();

- entitySearch.advance(cc);

- if (!entitySearch.isEntityPrefix())

- break;

- consumedCharacters.append(cc);

- source.advanceAndASSERT(cc);

- }

- notEnoughCharacters = source.isEmpty();

- if (notEnoughCharacters) {

- // We can't decide on an entity because there might be a longer entity

- // that we could match if we had more data.

- unconsumeCharacters(source, consumedCharacters);

- return false;

- }

- if (!entitySearch.mostRecentMatch()) {

- unconsumeCharacters(source, consumedCharacters);

- return false;

- }

- if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {

- // We've consumed too many characters. We need to walk the

- // source back to the point at which we had consumed an

- // actual entity.

- unconsumeCharacters(source, consumedCharacters);

- consumedCharacters.clear();

- const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();

- const int length = mostRecent->length;

- const LChar* reference = HTMLEntityTable::entityString(*mostRecent);

- for (int i = 0; i < length; ++i) {

- cc = source.currentChar();

- ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));

- consumedCharacters.append(cc);

- source.advanceAndASSERT(cc);

- ASSERT(!source.isEmpty());

- }

- cc = source.currentChar();

- }

- if (entitySearch.mostRecentMatch()->lastCharacter() == ';'

- || !additionalAllowedCharacter

- || !(isAlphaNumeric(cc) || cc == '=')) {

- decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);

- if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)

- decodedEntity.append(second);

- return true;

- }

- unconsumeCharacters(source, consumedCharacters);

- return false;

}

-bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)

+void HTMLEntityParser::reset()

{

- ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');

- ASSERT(!notEnoughCharacters);

- ASSERT(decodedEntity.isEmpty());

- enum EntityState {

- Initial,

- Number,

- MaybeHexLowerCaseX,

- MaybeHexUpperCaseX,

- Hex,

- Decimal,

- Named

- };

- EntityState entityState = Initial;

- UChar32 result = 0;

- ConsumedCharacterBuffer consumedCharacters;

+ m_state = Initial;

+ m_result = '\0';

+ m_buffer.clear();

+ m_buffer.append('&');

+}

+bool HTMLEntityParser::parse(SegmentedString& source)

+{

while (!source.isEmpty()) {

UChar cc = source.currentChar();

- switch (entityState) {

+ switch (m_state) {

case Initial: {

- if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')

- return false;

- if (additionalAllowedCharacter && cc == additionalAllowedCharacter)

- return false;

if (cc == '#') {

- entityState = Number;

+ m_state = Numeric;

break;

}

- if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {

- entityState = Named;

+ if (isAlphaNumeric(cc)) {

+ m_state = Named;

continue;

}

- return false;

+ return true;

}

- case Number: {

- if (cc == 'x') {

- entityState = MaybeHexLowerCaseX;

- break;

- }

- if (cc == 'X') {

- entityState = MaybeHexUpperCaseX;

+ case Numeric: {

+ if (cc == 'x' || cc == 'X') {

+ m_state = PossiblyHex;

break;

}

if (cc >= '0' && cc <= '9') {

- entityState = Decimal;

+ m_state = Decimal;

continue;

}

- source.push('#');

- return false;

+ return true;

}

- case MaybeHexLowerCaseX: {

+ case PossiblyHex: {

if (isHexDigit(cc)) {

- entityState = Hex;

+ m_state = Hex;

continue;

}

- source.push('#');

- source.push('x');

- return false;

- }

- case MaybeHexUpperCaseX: {

- if (isHexDigit(cc)) {

- entityState = Hex;

- continue;

- }

- source.push('#');

- source.push('X');

- return false;

+ return true;

}

case Hex: {

if (isHexDigit(cc)) {

- if (result != kInvalidUnicode)

- result = result * 16 + asHexDigit(cc);

- } else if (cc == ';') {

+ if (m_result != kInvalidUnicode)

+ m_result = m_result * 16 + asHexDigit(cc);

+ break;

+ }

+ if (cc == ';') {

source.advanceAndASSERT(cc);

- appendLegalEntityFor(result, decodedEntity);

- return true;

- } else {

- appendLegalEntityFor(result, decodedEntity);

+ finalizeNumericEntity();

return true;

}

- break;

+ return true;

}

case Decimal: {

if (cc >= '0' && cc <= '9') {

- if (result != kInvalidUnicode)

- result = result * 10 + cc - '0';

- } else if (cc == ';') {

+ if (m_result != kInvalidUnicode)

+ m_result = m_result * 10 + cc - '0';

+ break;

+ }

+ if (cc == ';') {

source.advanceAndASSERT(cc);

- appendLegalEntityFor(result, decodedEntity);

- return true;

- } else {

- appendLegalEntityFor(result, decodedEntity);

+ finalizeNumericEntity();

return true;

}

- break;

+ return true;

}

case Named: {

- return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);

+ if (isAlphaNumeric(cc))

+ break;

+ if (cc == ';') {

+ source.advanceAndASSERT(cc);

+ finalizeNamedEntity();

+ return true;

+ }

+ return true;

}

}

- if (result > UCHAR_MAX_VALUE)

- result = kInvalidUnicode;

+ if (m_result > UCHAR_MAX_VALUE)

+ m_result = kInvalidUnicode;

- consumedCharacters.append(cc);

+ m_buffer.append(cc);

source.advanceAndASSERT(cc);

}

ASSERT(source.isEmpty());

- notEnoughCharacters = true;

- unconsumeCharacters(source, consumedCharacters);

return false;

}

-static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)

+void HTMLEntityParser::finalizeNumericEntity()

{

- if (U_IS_BMP(value)) {

- UChar character = static_cast<UChar>(value);

- ASSERT(character == value);

- result[0] = character;

- return 1;

+ m_buffer.clear();

+ if (m_result <= 0 || m_result > 0x10FFFF || (m_result >= 0xD800 && m_result <= 0xDFFF)) {

+ m_buffer.append(replacementCharacter);

+ } else if (U_IS_BMP(m_result)) {

+ m_buffer.append(m_result);

+ } else {

+ m_buffer.append(U16_LEAD(m_result));

+ m_buffer.append(U16_TRAIL(m_result));

}

- result[0] = U16_LEAD(value);

- result[1] = U16_TRAIL(value);

- return 2;

}

-size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])

+void HTMLEntityParser::finalizeNamedEntity()

{

- HTMLEntitySearch search;

- while (*name) {

- search.advance(*name++);

- if (!search.isEntityPrefix())

- return 0;

- }

- search.advance(';');

- if (!search.isEntityPrefix())

- return 0;

- size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);

- if (!search.mostRecentMatch()->secondValue)

- return numberOfCodePoints;

- return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);

+ UChar decodedEntity = decodeEntity(m_buffer);

+ m_buffer.clear();

+ m_buffer.append(decodedEntity);

}

} // namespace blink

« no previous file with comments | « sky/engine/core/html/parser/HTMLEntityParser.h ('k') | sky/engine/core/html/parser/HTMLTokenizer.h » ('j') | no next file with comments »