Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(354)

Unified Diff: sky/engine/core/html/parser/HTMLEntityParser.cpp

Issue 678073002: Parse Sky entities according to the spec (Closed) Base URL: git@github.com:domokit/mojo.git@master
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « sky/engine/core/html/parser/HTMLEntityParser.h ('k') | sky/engine/core/html/parser/HTMLTokenizer.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: sky/engine/core/html/parser/HTMLEntityParser.cpp
diff --git a/sky/engine/core/html/parser/HTMLEntityParser.cpp b/sky/engine/core/html/parser/HTMLEntityParser.cpp
index dc30c83715f4f72ba2b92a598ad4d987b51370c8..6d90746bbe62d43be202489f887aa0f010c27cc7 100644
--- a/sky/engine/core/html/parser/HTMLEntityParser.cpp
+++ b/sky/engine/core/html/parser/HTMLEntityParser.cpp
@@ -1,305 +1,171 @@
-/*
- * Copyright (C) 2008 Apple Inc. All Rights Reserved.
- * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
- * Copyright (C) 2010 Google, Inc. All Rights Reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
+// Copyright 2014 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
#include "config.h"
#include "core/html/parser/HTMLEntityParser.h"
-#include "core/html/parser/HTMLEntitySearch.h"
-#include "core/html/parser/HTMLEntityTable.h"
-#include "wtf/text/StringBuilder.h"
+#include "wtf/unicode/CharacterNames.h"
using namespace WTF;
namespace blink {
-static const UChar windowsLatin1ExtensionArray[32] = {
- 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87
- 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F
- 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97
- 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F
-};
-
-static bool isAlphaNumeric(UChar cc)
-{
- return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
-}
+static const UChar32 kInvalidUnicode = -1;
-static UChar adjustEntity(UChar32 value)
+static UChar asHexDigit(UChar cc)
{
- if ((value & ~0x1F) != 0x0080)
- return value;
- return windowsLatin1ExtensionArray[value - 0x80];
+ if (cc >= '0' && cc <= '9')
+ return cc - '0';
+ if (cc >= 'a' && cc <= 'f')
+ return 10 + cc - 'a';
+ if (cc >= 'A' && cc <= 'F')
+ return 10 + cc - 'A';
+ ASSERT_NOT_REACHED();
+ return 0;
}
-static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
+static bool isAlphaNumeric(UChar cc)
{
- // FIXME: A number of specific entity values generate parse errors.
- if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
- decodedEntity.append(0xFFFD);
- return;
- }
- if (U_IS_BMP(c)) {
- decodedEntity.append(adjustEntity(c));
- return;
- }
- decodedEntity.append(c);
+ return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
}
-static const UChar32 kInvalidUnicode = -1;
-
static bool isHexDigit(UChar cc)
{
return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
}
-static UChar asHexDigit(UChar cc)
+static UChar decodeEntity(HTMLEntityParser::OutputBuffer buffer)
{
- if (cc >= '0' && cc <= '9')
- return cc - '0';
- if (cc >= 'a' && cc <= 'z')
- return 10 + cc - 'a';
- if (cc >= 'A' && cc <= 'Z')
- return 10 + cc - 'A';
- ASSERT_NOT_REACHED();
- return 0;
+ if (equalIgnoringNullity(buffer, "&amp"))
+ return '&';
+ if (equalIgnoringNullity(buffer, "&apos"))
+ return '\'';
+ if (equalIgnoringNullity(buffer, "&gt"))
+ return '>';
+ if (equalIgnoringNullity(buffer, "&lt"))
+ return '<';
+ if (equalIgnoringNullity(buffer, "&quot"))
+ return '"';
+ return replacementCharacter;
}
-typedef Vector<UChar, 64> ConsumedCharacterBuffer;
-
-static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)
+HTMLEntityParser::HTMLEntityParser()
{
- if (consumedCharacters.size() == 1)
- source.push(consumedCharacters[0]);
- else if (consumedCharacters.size() == 2) {
- source.push(consumedCharacters[0]);
- source.push(consumedCharacters[1]);
- } else
- source.prepend(SegmentedString(String(consumedCharacters)));
}
-static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
+HTMLEntityParser::~HTMLEntityParser()
{
- ConsumedCharacterBuffer consumedCharacters;
- HTMLEntitySearch entitySearch;
- while (!source.isEmpty()) {
- cc = source.currentChar();
- entitySearch.advance(cc);
- if (!entitySearch.isEntityPrefix())
- break;
- consumedCharacters.append(cc);
- source.advanceAndASSERT(cc);
- }
- notEnoughCharacters = source.isEmpty();
- if (notEnoughCharacters) {
- // We can't decide on an entity because there might be a longer entity
- // that we could match if we had more data.
- unconsumeCharacters(source, consumedCharacters);
- return false;
- }
- if (!entitySearch.mostRecentMatch()) {
- unconsumeCharacters(source, consumedCharacters);
- return false;
- }
- if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
- // We've consumed too many characters. We need to walk the
- // source back to the point at which we had consumed an
- // actual entity.
- unconsumeCharacters(source, consumedCharacters);
- consumedCharacters.clear();
- const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
- const int length = mostRecent->length;
- const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
- for (int i = 0; i < length; ++i) {
- cc = source.currentChar();
- ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));
- consumedCharacters.append(cc);
- source.advanceAndASSERT(cc);
- ASSERT(!source.isEmpty());
- }
- cc = source.currentChar();
- }
- if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
- || !additionalAllowedCharacter
- || !(isAlphaNumeric(cc) || cc == '=')) {
- decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
- if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
- decodedEntity.append(second);
- return true;
- }
- unconsumeCharacters(source, consumedCharacters);
- return false;
}
-bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
+void HTMLEntityParser::reset()
{
- ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
- ASSERT(!notEnoughCharacters);
- ASSERT(decodedEntity.isEmpty());
-
- enum EntityState {
- Initial,
- Number,
- MaybeHexLowerCaseX,
- MaybeHexUpperCaseX,
- Hex,
- Decimal,
- Named
- };
- EntityState entityState = Initial;
- UChar32 result = 0;
- ConsumedCharacterBuffer consumedCharacters;
+ m_state = Initial;
+ m_result = '\0';
+ m_buffer.clear();
+ m_buffer.append('&');
+}
+bool HTMLEntityParser::parse(SegmentedString& source)
+{
while (!source.isEmpty()) {
UChar cc = source.currentChar();
- switch (entityState) {
+ switch (m_state) {
case Initial: {
- if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
- return false;
- if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
- return false;
if (cc == '#') {
- entityState = Number;
+ m_state = Numeric;
break;
}
- if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
- entityState = Named;
+ if (isAlphaNumeric(cc)) {
+ m_state = Named;
continue;
}
- return false;
+ return true;
}
- case Number: {
- if (cc == 'x') {
- entityState = MaybeHexLowerCaseX;
- break;
- }
- if (cc == 'X') {
- entityState = MaybeHexUpperCaseX;
+ case Numeric: {
+ if (cc == 'x' || cc == 'X') {
+ m_state = PossiblyHex;
break;
}
if (cc >= '0' && cc <= '9') {
- entityState = Decimal;
+ m_state = Decimal;
continue;
}
- source.push('#');
- return false;
+ return true;
}
- case MaybeHexLowerCaseX: {
+ case PossiblyHex: {
if (isHexDigit(cc)) {
- entityState = Hex;
+ m_state = Hex;
continue;
}
- source.push('#');
- source.push('x');
- return false;
- }
- case MaybeHexUpperCaseX: {
- if (isHexDigit(cc)) {
- entityState = Hex;
- continue;
- }
- source.push('#');
- source.push('X');
- return false;
+ return true;
}
case Hex: {
if (isHexDigit(cc)) {
- if (result != kInvalidUnicode)
- result = result * 16 + asHexDigit(cc);
- } else if (cc == ';') {
+ if (m_result != kInvalidUnicode)
+ m_result = m_result * 16 + asHexDigit(cc);
+ break;
+ }
+ if (cc == ';') {
source.advanceAndASSERT(cc);
- appendLegalEntityFor(result, decodedEntity);
- return true;
- } else {
- appendLegalEntityFor(result, decodedEntity);
+ finalizeNumericEntity();
return true;
}
- break;
+ return true;
}
case Decimal: {
if (cc >= '0' && cc <= '9') {
- if (result != kInvalidUnicode)
- result = result * 10 + cc - '0';
- } else if (cc == ';') {
+ if (m_result != kInvalidUnicode)
+ m_result = m_result * 10 + cc - '0';
+ break;
+ }
+ if (cc == ';') {
source.advanceAndASSERT(cc);
- appendLegalEntityFor(result, decodedEntity);
- return true;
- } else {
- appendLegalEntityFor(result, decodedEntity);
+ finalizeNumericEntity();
return true;
}
- break;
+ return true;
}
case Named: {
- return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
+ if (isAlphaNumeric(cc))
+ break;
+ if (cc == ';') {
+ source.advanceAndASSERT(cc);
+ finalizeNamedEntity();
+ return true;
+ }
+ return true;
}
}
- if (result > UCHAR_MAX_VALUE)
- result = kInvalidUnicode;
+ if (m_result > UCHAR_MAX_VALUE)
+ m_result = kInvalidUnicode;
- consumedCharacters.append(cc);
+ m_buffer.append(cc);
source.advanceAndASSERT(cc);
}
ASSERT(source.isEmpty());
- notEnoughCharacters = true;
- unconsumeCharacters(source, consumedCharacters);
return false;
}
-static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)
+void HTMLEntityParser::finalizeNumericEntity()
{
- if (U_IS_BMP(value)) {
- UChar character = static_cast<UChar>(value);
- ASSERT(character == value);
- result[0] = character;
- return 1;
+ m_buffer.clear();
+ if (m_result <= 0 || m_result > 0x10FFFF || (m_result >= 0xD800 && m_result <= 0xDFFF)) {
+ m_buffer.append(replacementCharacter);
+ } else if (U_IS_BMP(m_result)) {
+ m_buffer.append(m_result);
+ } else {
+ m_buffer.append(U16_LEAD(m_result));
+ m_buffer.append(U16_TRAIL(m_result));
}
-
- result[0] = U16_LEAD(value);
- result[1] = U16_TRAIL(value);
- return 2;
}
-size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
+void HTMLEntityParser::finalizeNamedEntity()
{
- HTMLEntitySearch search;
- while (*name) {
- search.advance(*name++);
- if (!search.isEntityPrefix())
- return 0;
- }
- search.advance(';');
- if (!search.isEntityPrefix())
- return 0;
-
- size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
- if (!search.mostRecentMatch()->secondValue)
- return numberOfCodePoints;
- return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
+ UChar decodedEntity = decodeEntity(m_buffer);
+ m_buffer.clear();
+ m_buffer.append(decodedEntity);
}
} // namespace blink
« no previous file with comments | « sky/engine/core/html/parser/HTMLEntityParser.h ('k') | sky/engine/core/html/parser/HTMLTokenizer.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698