| Index: sky/engine/core/html/parser/HTMLEntityParser.cpp
|
| diff --git a/sky/engine/core/html/parser/HTMLEntityParser.cpp b/sky/engine/core/html/parser/HTMLEntityParser.cpp
|
| index dc30c83715f4f72ba2b92a598ad4d987b51370c8..6d90746bbe62d43be202489f887aa0f010c27cc7 100644
|
| --- a/sky/engine/core/html/parser/HTMLEntityParser.cpp
|
| +++ b/sky/engine/core/html/parser/HTMLEntityParser.cpp
|
| @@ -1,305 +1,171 @@
|
| -/*
|
| - * Copyright (C) 2008 Apple Inc. All Rights Reserved.
|
| - * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
|
| - * Copyright (C) 2010 Google, Inc. All Rights Reserved.
|
| - *
|
| - * Redistribution and use in source and binary forms, with or without
|
| - * modification, are permitted provided that the following conditions
|
| - * are met:
|
| - * 1. Redistributions of source code must retain the above copyright
|
| - * notice, this list of conditions and the following disclaimer.
|
| - * 2. Redistributions in binary form must reproduce the above copyright
|
| - * notice, this list of conditions and the following disclaimer in the
|
| - * documentation and/or other materials provided with the distribution.
|
| - *
|
| - * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
|
| - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
| - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
|
| - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
| - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
| - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
| - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
|
| - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
| - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
| - */
|
| +// Copyright 2014 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
|
|
| #include "config.h"
|
| #include "core/html/parser/HTMLEntityParser.h"
|
|
|
| -#include "core/html/parser/HTMLEntitySearch.h"
|
| -#include "core/html/parser/HTMLEntityTable.h"
|
| -#include "wtf/text/StringBuilder.h"
|
| +#include "wtf/unicode/CharacterNames.h"
|
|
|
| using namespace WTF;
|
|
|
| namespace blink {
|
|
|
| -static const UChar windowsLatin1ExtensionArray[32] = {
|
| - 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
|
| - 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
|
| - 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
|
| - 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
|
| -};
|
| -
|
| -static bool isAlphaNumeric(UChar cc)
|
| -{
|
| - return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
|
| -}
|
| +static const UChar32 kInvalidUnicode = -1;
|
|
|
| -static UChar adjustEntity(UChar32 value)
|
| +static UChar asHexDigit(UChar cc)
|
| {
|
| - if ((value & ~0x1F) != 0x0080)
|
| - return value;
|
| - return windowsLatin1ExtensionArray[value - 0x80];
|
| + if (cc >= '0' && cc <= '9')
|
| + return cc - '0';
|
| + if (cc >= 'a' && cc <= 'f')
|
| + return 10 + cc - 'a';
|
| + if (cc >= 'A' && cc <= 'F')
|
| + return 10 + cc - 'A';
|
| + ASSERT_NOT_REACHED();
|
| + return 0;
|
| }
|
|
|
| -static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
|
| +static bool isAlphaNumeric(UChar cc)
|
| {
|
| - // FIXME: A number of specific entity values generate parse errors.
|
| - if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
|
| - decodedEntity.append(0xFFFD);
|
| - return;
|
| - }
|
| - if (U_IS_BMP(c)) {
|
| - decodedEntity.append(adjustEntity(c));
|
| - return;
|
| - }
|
| - decodedEntity.append(c);
|
| + return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
|
| }
|
|
|
| -static const UChar32 kInvalidUnicode = -1;
|
| -
|
| static bool isHexDigit(UChar cc)
|
| {
|
| return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
|
| }
|
|
|
| -static UChar asHexDigit(UChar cc)
|
| +static UChar decodeEntity(HTMLEntityParser::OutputBuffer buffer)
|
| {
|
| - if (cc >= '0' && cc <= '9')
|
| - return cc - '0';
|
| - if (cc >= 'a' && cc <= 'z')
|
| - return 10 + cc - 'a';
|
| - if (cc >= 'A' && cc <= 'Z')
|
| - return 10 + cc - 'A';
|
| - ASSERT_NOT_REACHED();
|
| - return 0;
|
| + if (equalIgnoringNullity(buffer, "&"))
|
| + return '&';
|
| + if (equalIgnoringNullity(buffer, "&apos"))
|
| + return '\'';
|
| + if (equalIgnoringNullity(buffer, ">"))
|
| + return '>';
|
| + if (equalIgnoringNullity(buffer, "<"))
|
| + return '<';
|
| + if (equalIgnoringNullity(buffer, """))
|
| + return '"';
|
| + return replacementCharacter;
|
| }
|
|
|
| -typedef Vector<UChar, 64> ConsumedCharacterBuffer;
|
| -
|
| -static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
|
| +HTMLEntityParser::HTMLEntityParser()
|
| {
|
| - if (consumedCharacters.size() == 1)
|
| - source.push(consumedCharacters[0]);
|
| - else if (consumedCharacters.size() == 2) {
|
| - source.push(consumedCharacters[0]);
|
| - source.push(consumedCharacters[1]);
|
| - } else
|
| - source.prepend(SegmentedString(String(consumedCharacters)));
|
| }
|
|
|
| -static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
|
| +HTMLEntityParser::~HTMLEntityParser()
|
| {
|
| - ConsumedCharacterBuffer consumedCharacters;
|
| - HTMLEntitySearch entitySearch;
|
| - while (!source.isEmpty()) {
|
| - cc = source.currentChar();
|
| - entitySearch.advance(cc);
|
| - if (!entitySearch.isEntityPrefix())
|
| - break;
|
| - consumedCharacters.append(cc);
|
| - source.advanceAndASSERT(cc);
|
| - }
|
| - notEnoughCharacters = source.isEmpty();
|
| - if (notEnoughCharacters) {
|
| - // We can't decide on an entity because there might be a longer entity
|
| - // that we could match if we had more data.
|
| - unconsumeCharacters(source, consumedCharacters);
|
| - return false;
|
| - }
|
| - if (!entitySearch.mostRecentMatch()) {
|
| - unconsumeCharacters(source, consumedCharacters);
|
| - return false;
|
| - }
|
| - if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
|
| - // We've consumed too many characters. We need to walk the
|
| - // source back to the point at which we had consumed an
|
| - // actual entity.
|
| - unconsumeCharacters(source, consumedCharacters);
|
| - consumedCharacters.clear();
|
| - const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
|
| - const int length = mostRecent->length;
|
| - const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
|
| - for (int i = 0; i < length; ++i) {
|
| - cc = source.currentChar();
|
| - ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));
|
| - consumedCharacters.append(cc);
|
| - source.advanceAndASSERT(cc);
|
| - ASSERT(!source.isEmpty());
|
| - }
|
| - cc = source.currentChar();
|
| - }
|
| - if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
|
| - || !additionalAllowedCharacter
|
| - || !(isAlphaNumeric(cc) || cc == '=')) {
|
| - decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
|
| - if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
|
| - decodedEntity.append(second);
|
| - return true;
|
| - }
|
| - unconsumeCharacters(source, consumedCharacters);
|
| - return false;
|
| }
|
|
|
| -bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
|
| +void HTMLEntityParser::reset()
|
| {
|
| - ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
|
| - ASSERT(!notEnoughCharacters);
|
| - ASSERT(decodedEntity.isEmpty());
|
| -
|
| - enum EntityState {
|
| - Initial,
|
| - Number,
|
| - MaybeHexLowerCaseX,
|
| - MaybeHexUpperCaseX,
|
| - Hex,
|
| - Decimal,
|
| - Named
|
| - };
|
| - EntityState entityState = Initial;
|
| - UChar32 result = 0;
|
| - ConsumedCharacterBuffer consumedCharacters;
|
| + m_state = Initial;
|
| + m_result = '\0';
|
| + m_buffer.clear();
|
| + m_buffer.append('&');
|
| +}
|
|
|
| +bool HTMLEntityParser::parse(SegmentedString& source)
|
| +{
|
| while (!source.isEmpty()) {
|
| UChar cc = source.currentChar();
|
| - switch (entityState) {
|
| + switch (m_state) {
|
| case Initial: {
|
| - if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
|
| - return false;
|
| - if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
|
| - return false;
|
| if (cc == '#') {
|
| - entityState = Number;
|
| + m_state = Numeric;
|
| break;
|
| }
|
| - if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
|
| - entityState = Named;
|
| + if (isAlphaNumeric(cc)) {
|
| + m_state = Named;
|
| continue;
|
| }
|
| - return false;
|
| + return true;
|
| }
|
| - case Number: {
|
| - if (cc == 'x') {
|
| - entityState = MaybeHexLowerCaseX;
|
| - break;
|
| - }
|
| - if (cc == 'X') {
|
| - entityState = MaybeHexUpperCaseX;
|
| + case Numeric: {
|
| + if (cc == 'x' || cc == 'X') {
|
| + m_state = PossiblyHex;
|
| break;
|
| }
|
| if (cc >= '0' && cc <= '9') {
|
| - entityState = Decimal;
|
| + m_state = Decimal;
|
| continue;
|
| }
|
| - source.push('#');
|
| - return false;
|
| + return true;
|
| }
|
| - case MaybeHexLowerCaseX: {
|
| + case PossiblyHex: {
|
| if (isHexDigit(cc)) {
|
| - entityState = Hex;
|
| + m_state = Hex;
|
| continue;
|
| }
|
| - source.push('#');
|
| - source.push('x');
|
| - return false;
|
| - }
|
| - case MaybeHexUpperCaseX: {
|
| - if (isHexDigit(cc)) {
|
| - entityState = Hex;
|
| - continue;
|
| - }
|
| - source.push('#');
|
| - source.push('X');
|
| - return false;
|
| + return true;
|
| }
|
| case Hex: {
|
| if (isHexDigit(cc)) {
|
| - if (result != kInvalidUnicode)
|
| - result = result * 16 + asHexDigit(cc);
|
| - } else if (cc == ';') {
|
| + if (m_result != kInvalidUnicode)
|
| + m_result = m_result * 16 + asHexDigit(cc);
|
| + break;
|
| + }
|
| + if (cc == ';') {
|
| source.advanceAndASSERT(cc);
|
| - appendLegalEntityFor(result, decodedEntity);
|
| - return true;
|
| - } else {
|
| - appendLegalEntityFor(result, decodedEntity);
|
| + finalizeNumericEntity();
|
| return true;
|
| }
|
| - break;
|
| + return true;
|
| }
|
| case Decimal: {
|
| if (cc >= '0' && cc <= '9') {
|
| - if (result != kInvalidUnicode)
|
| - result = result * 10 + cc - '0';
|
| - } else if (cc == ';') {
|
| + if (m_result != kInvalidUnicode)
|
| + m_result = m_result * 10 + cc - '0';
|
| + break;
|
| + }
|
| + if (cc == ';') {
|
| source.advanceAndASSERT(cc);
|
| - appendLegalEntityFor(result, decodedEntity);
|
| - return true;
|
| - } else {
|
| - appendLegalEntityFor(result, decodedEntity);
|
| + finalizeNumericEntity();
|
| return true;
|
| }
|
| - break;
|
| + return true;
|
| }
|
| case Named: {
|
| - return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
|
| + if (isAlphaNumeric(cc))
|
| + break;
|
| + if (cc == ';') {
|
| + source.advanceAndASSERT(cc);
|
| + finalizeNamedEntity();
|
| + return true;
|
| + }
|
| + return true;
|
| }
|
| }
|
|
|
| - if (result > UCHAR_MAX_VALUE)
|
| - result = kInvalidUnicode;
|
| + if (m_result > UCHAR_MAX_VALUE)
|
| + m_result = kInvalidUnicode;
|
|
|
| - consumedCharacters.append(cc);
|
| + m_buffer.append(cc);
|
| source.advanceAndASSERT(cc);
|
| }
|
| ASSERT(source.isEmpty());
|
| - notEnoughCharacters = true;
|
| - unconsumeCharacters(source, consumedCharacters);
|
| return false;
|
| }
|
|
|
| -static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
|
| +void HTMLEntityParser::finalizeNumericEntity()
|
| {
|
| - if (U_IS_BMP(value)) {
|
| - UChar character = static_cast<UChar>(value);
|
| - ASSERT(character == value);
|
| - result[0] = character;
|
| - return 1;
|
| + m_buffer.clear();
|
| + if (m_result <= 0 || m_result > 0x10FFFF || (m_result >= 0xD800 && m_result <= 0xDFFF)) {
|
| + m_buffer.append(replacementCharacter);
|
| + } else if (U_IS_BMP(m_result)) {
|
| + m_buffer.append(m_result);
|
| + } else {
|
| + m_buffer.append(U16_LEAD(m_result));
|
| + m_buffer.append(U16_TRAIL(m_result));
|
| }
|
| -
|
| - result[0] = U16_LEAD(value);
|
| - result[1] = U16_TRAIL(value);
|
| - return 2;
|
| }
|
|
|
| -size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
|
| +void HTMLEntityParser::finalizeNamedEntity()
|
| {
|
| - HTMLEntitySearch search;
|
| - while (*name) {
|
| - search.advance(*name++);
|
| - if (!search.isEntityPrefix())
|
| - return 0;
|
| - }
|
| - search.advance(';');
|
| - if (!search.isEntityPrefix())
|
| - return 0;
|
| -
|
| - size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
|
| - if (!search.mostRecentMatch()->secondValue)
|
| - return numberOfCodePoints;
|
| - return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
|
| + UChar decodedEntity = decodeEntity(m_buffer);
|
| + m_buffer.clear();
|
| + m_buffer.append(decodedEntity);
|
| }
|
|
|
| } // namespace blink
|
|
|