| OLD | NEW |
| 1 /* | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ | 3 // found in the LICENSE file. |
| 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. | |
| 5 * | |
| 6 * Redistribution and use in source and binary forms, with or without | |
| 7 * modification, are permitted provided that the following conditions | |
| 8 * are met: | |
| 9 * 1. Redistributions of source code must retain the above copyright | |
| 10 * notice, this list of conditions and the following disclaimer. | |
| 11 * 2. Redistributions in binary form must reproduce the above copyright | |
| 12 * notice, this list of conditions and the following disclaimer in the | |
| 13 * documentation and/or other materials provided with the distribution. | |
| 14 * | |
| 15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
| 16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
| 18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
| 19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
| 20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
| 21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
| 22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
| 23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 26 */ | |
| 27 | 4 |
| 28 #include "config.h" | 5 #include "config.h" |
| 29 #include "core/html/parser/HTMLEntityParser.h" | 6 #include "core/html/parser/HTMLEntityParser.h" |
| 30 | 7 |
| 31 #include "core/html/parser/HTMLEntitySearch.h" | 8 #include "wtf/unicode/CharacterNames.h" |
| 32 #include "core/html/parser/HTMLEntityTable.h" | |
| 33 #include "wtf/text/StringBuilder.h" | |
| 34 | 9 |
| 35 using namespace WTF; | 10 using namespace WTF; |
| 36 | 11 |
| 37 namespace blink { | 12 namespace blink { |
| 38 | 13 |
| 39 static const UChar windowsLatin1ExtensionArray[32] = { | 14 static const UChar32 kInvalidUnicode = -1; |
| 40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 | 15 |
| 41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F | 16 static UChar asHexDigit(UChar cc) |
| 42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 | 17 { |
| 43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F | 18 if (cc >= '0' && cc <= '9') |
| 44 }; | 19 return cc - '0'; |
| 20 if (cc >= 'a' && cc <= 'f') |
| 21 return 10 + cc - 'a'; |
| 22 if (cc >= 'A' && cc <= 'F') |
| 23 return 10 + cc - 'A'; |
| 24 ASSERT_NOT_REACHED(); |
| 25 return 0; |
| 26 } |
| 45 | 27 |
| 46 static bool isAlphaNumeric(UChar cc) | 28 static bool isAlphaNumeric(UChar cc) |
| 47 { | 29 { |
| 48 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' &&
cc <= 'Z'); | 30 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' &&
cc <= 'Z'); |
| 49 } | 31 } |
| 50 | 32 |
| 51 static UChar adjustEntity(UChar32 value) | |
| 52 { | |
| 53 if ((value & ~0x1F) != 0x0080) | |
| 54 return value; | |
| 55 return windowsLatin1ExtensionArray[value - 0x80]; | |
| 56 } | |
| 57 | |
| 58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity) | |
| 59 { | |
| 60 // FIXME: A number of specific entity values generate parse errors. | |
| 61 if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) { | |
| 62 decodedEntity.append(0xFFFD); | |
| 63 return; | |
| 64 } | |
| 65 if (U_IS_BMP(c)) { | |
| 66 decodedEntity.append(adjustEntity(c)); | |
| 67 return; | |
| 68 } | |
| 69 decodedEntity.append(c); | |
| 70 } | |
| 71 | |
| 72 static const UChar32 kInvalidUnicode = -1; | |
| 73 | |
| 74 static bool isHexDigit(UChar cc) | 33 static bool isHexDigit(UChar cc) |
| 75 { | 34 { |
| 76 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' &&
cc <= 'F'); | 35 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' &&
cc <= 'F'); |
| 77 } | 36 } |
| 78 | 37 |
| 79 static UChar asHexDigit(UChar cc) | 38 static UChar decodeEntity(HTMLEntityParser::OutputBuffer buffer) |
| 80 { | 39 { |
| 81 if (cc >= '0' && cc <= '9') | 40 if (equalIgnoringNullity(buffer, "&amp")) |
| 82 return cc - '0'; | 41 return '&'; |
| 83 if (cc >= 'a' && cc <= 'z') | 42 if (equalIgnoringNullity(buffer, "&apos")) |
| 84 return 10 + cc - 'a'; | 43 return '\''; |
| 85 if (cc >= 'A' && cc <= 'Z') | 44 if (equalIgnoringNullity(buffer, "&gt")) |
| 86 return 10 + cc - 'A'; | 45 return '>'; |
| 87 ASSERT_NOT_REACHED(); | 46 if (equalIgnoringNullity(buffer, "&lt")) |
| 88 return 0; | 47 return '<'; |
| 48 if (equalIgnoringNullity(buffer, "&quot")) |
| 49 return '"'; |
| 50 return replacementCharacter; |
| 89 } | 51 } |
| 90 | 52 |
| 91 typedef Vector<UChar, 64> ConsumedCharacterBuffer; | 53 HTMLEntityParser::HTMLEntityParser() |
| 92 | |
| 93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer
& consumedCharacters) | |
| 94 { | 54 { |
| 95 if (consumedCharacters.size() == 1) | |
| 96 source.push(consumedCharacters[0]); | |
| 97 else if (consumedCharacters.size() == 2) { | |
| 98 source.push(consumedCharacters[0]); | |
| 99 source.push(consumedCharacters[1]); | |
| 100 } else | |
| 101 source.prepend(SegmentedString(String(consumedCharacters))); | |
| 102 } | 55 } |
| 103 | 56 |
| 104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc) | 57 HTMLEntityParser::~HTMLEntityParser() |
| 105 { | 58 { |
| 106 ConsumedCharacterBuffer consumedCharacters; | 59 } |
| 107 HTMLEntitySearch entitySearch; | 60 |
| 61 void HTMLEntityParser::reset() |
| 62 { |
| 63 m_state = Initial; |
| 64 m_result = '\0'; |
| 65 m_buffer.clear(); |
| 66 m_buffer.append('&'); |
| 67 } |
| 68 |
| 69 bool HTMLEntityParser::parse(SegmentedString& source) |
| 70 { |
| 108 while (!source.isEmpty()) { | 71 while (!source.isEmpty()) { |
| 109 cc = source.currentChar(); | 72 UChar cc = source.currentChar(); |
| 110 entitySearch.advance(cc); | 73 switch (m_state) { |
| 111 if (!entitySearch.isEntityPrefix()) | 74 case Initial: { |
| 112 break; | 75 if (cc == '#') { |
| 113 consumedCharacters.append(cc); | 76 m_state = Numeric; |
| 77 break; |
| 78 } |
| 79 if (isAlphaNumeric(cc)) { |
| 80 m_state = Named; |
| 81 continue; |
| 82 } |
| 83 return true; |
| 84 } |
| 85 case Numeric: { |
| 86 if (cc == 'x' || cc == 'X') { |
| 87 m_state = PossiblyHex; |
| 88 break; |
| 89 } |
| 90 if (cc >= '0' && cc <= '9') { |
| 91 m_state = Decimal; |
| 92 continue; |
| 93 } |
| 94 return true; |
| 95 } |
| 96 case PossiblyHex: { |
| 97 if (isHexDigit(cc)) { |
| 98 m_state = Hex; |
| 99 continue; |
| 100 } |
| 101 return true; |
| 102 } |
| 103 case Hex: { |
| 104 if (isHexDigit(cc)) { |
| 105 if (m_result != kInvalidUnicode) |
| 106 m_result = m_result * 16 + asHexDigit(cc); |
| 107 break; |
| 108 } |
| 109 if (cc == ';') { |
| 110 source.advanceAndASSERT(cc); |
| 111 finalizeNumericEntity(); |
| 112 return true; |
| 113 } |
| 114 return true; |
| 115 } |
| 116 case Decimal: { |
| 117 if (cc >= '0' && cc <= '9') { |
| 118 if (m_result != kInvalidUnicode) |
| 119 m_result = m_result * 10 + cc - '0'; |
| 120 break; |
| 121 } |
| 122 if (cc == ';') { |
| 123 source.advanceAndASSERT(cc); |
| 124 finalizeNumericEntity(); |
| 125 return true; |
| 126 } |
| 127 return true; |
| 128 } |
| 129 case Named: { |
| 130 if (isAlphaNumeric(cc)) |
| 131 break; |
| 132 if (cc == ';') { |
| 133 source.advanceAndASSERT(cc); |
| 134 finalizeNamedEntity(); |
| 135 return true; |
| 136 } |
| 137 return true; |
| 138 } |
| 139 } |
| 140 |
| 141 if (m_result > UCHAR_MAX_VALUE) |
| 142 m_result = kInvalidUnicode; |
| 143 |
| 144 m_buffer.append(cc); |
| 114 source.advanceAndASSERT(cc); | 145 source.advanceAndASSERT(cc); |
| 115 } | 146 } |
| 116 notEnoughCharacters = source.isEmpty(); | 147 ASSERT(source.isEmpty()); |
| 117 if (notEnoughCharacters) { | |
| 118 // We can't decide on an entity because there might be a longer entity | |
| 119 // that we could match if we had more data. | |
| 120 unconsumeCharacters(source, consumedCharacters); | |
| 121 return false; | |
| 122 } | |
| 123 if (!entitySearch.mostRecentMatch()) { | |
| 124 unconsumeCharacters(source, consumedCharacters); | |
| 125 return false; | |
| 126 } | |
| 127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength())
{ | |
| 128 // We've consumed too many characters. We need to walk the | |
| 129 // source back to the point at which we had consumed an | |
| 130 // actual entity. | |
| 131 unconsumeCharacters(source, consumedCharacters); | |
| 132 consumedCharacters.clear(); | |
| 133 const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch(); | |
| 134 const int length = mostRecent->length; | |
| 135 const LChar* reference = HTMLEntityTable::entityString(*mostRecent); | |
| 136 for (int i = 0; i < length; ++i) { | |
| 137 cc = source.currentChar(); | |
| 138 ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++)); | |
| 139 consumedCharacters.append(cc); | |
| 140 source.advanceAndASSERT(cc); | |
| 141 ASSERT(!source.isEmpty()); | |
| 142 } | |
| 143 cc = source.currentChar(); | |
| 144 } | |
| 145 if (entitySearch.mostRecentMatch()->lastCharacter() == ';' | |
| 146 || !additionalAllowedCharacter | |
| 147 || !(isAlphaNumeric(cc) || cc == '=')) { | |
| 148 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue); | |
| 149 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue) | |
| 150 decodedEntity.append(second); | |
| 151 return true; | |
| 152 } | |
| 153 unconsumeCharacters(source, consumedCharacters); | |
| 154 return false; | 148 return false; |
| 155 } | 149 } |
| 156 | 150 |
| 157 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity
, bool& notEnoughCharacters, UChar additionalAllowedCharacter) | 151 void HTMLEntityParser::finalizeNumericEntity() |
| 158 { | 152 { |
| 159 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); | 153 m_buffer.clear(); |
| 160 ASSERT(!notEnoughCharacters); | 154 if (m_result <= 0 || m_result > 0x10FFFF || (m_result >= 0xD800 && m_result
<= 0xDFFF)) { |
| 161 ASSERT(decodedEntity.isEmpty()); | 155 m_buffer.append(replacementCharacter); |
| 162 | 156 } else if (U_IS_BMP(m_result)) { |
| 163 enum EntityState { | 157 m_buffer.append(m_result); |
| 164 Initial, | 158 } else { |
| 165 Number, | 159 m_buffer.append(U16_LEAD(m_result)); |
| 166 MaybeHexLowerCaseX, | 160 m_buffer.append(U16_TRAIL(m_result)); |
| 167 MaybeHexUpperCaseX, | |
| 168 Hex, | |
| 169 Decimal, | |
| 170 Named | |
| 171 }; | |
| 172 EntityState entityState = Initial; | |
| 173 UChar32 result = 0; | |
| 174 ConsumedCharacterBuffer consumedCharacters; | |
| 175 | |
| 176 while (!source.isEmpty()) { | |
| 177 UChar cc = source.currentChar(); | |
| 178 switch (entityState) { | |
| 179 case Initial: { | |
| 180 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc
== '<' || cc == '&') | |
| 181 return false; | |
| 182 if (additionalAllowedCharacter && cc == additionalAllowedCharacter) | |
| 183 return false; | |
| 184 if (cc == '#') { | |
| 185 entityState = Number; | |
| 186 break; | |
| 187 } | |
| 188 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { | |
| 189 entityState = Named; | |
| 190 continue; | |
| 191 } | |
| 192 return false; | |
| 193 } | |
| 194 case Number: { | |
| 195 if (cc == 'x') { | |
| 196 entityState = MaybeHexLowerCaseX; | |
| 197 break; | |
| 198 } | |
| 199 if (cc == 'X') { | |
| 200 entityState = MaybeHexUpperCaseX; | |
| 201 break; | |
| 202 } | |
| 203 if (cc >= '0' && cc <= '9') { | |
| 204 entityState = Decimal; | |
| 205 continue; | |
| 206 } | |
| 207 source.push('#'); | |
| 208 return false; | |
| 209 } | |
| 210 case MaybeHexLowerCaseX: { | |
| 211 if (isHexDigit(cc)) { | |
| 212 entityState = Hex; | |
| 213 continue; | |
| 214 } | |
| 215 source.push('#'); | |
| 216 source.push('x'); | |
| 217 return false; | |
| 218 } | |
| 219 case MaybeHexUpperCaseX: { | |
| 220 if (isHexDigit(cc)) { | |
| 221 entityState = Hex; | |
| 222 continue; | |
| 223 } | |
| 224 source.push('#'); | |
| 225 source.push('X'); | |
| 226 return false; | |
| 227 } | |
| 228 case Hex: { | |
| 229 if (isHexDigit(cc)) { | |
| 230 if (result != kInvalidUnicode) | |
| 231 result = result * 16 + asHexDigit(cc); | |
| 232 } else if (cc == ';') { | |
| 233 source.advanceAndASSERT(cc); | |
| 234 appendLegalEntityFor(result, decodedEntity); | |
| 235 return true; | |
| 236 } else { | |
| 237 appendLegalEntityFor(result, decodedEntity); | |
| 238 return true; | |
| 239 } | |
| 240 break; | |
| 241 } | |
| 242 case Decimal: { | |
| 243 if (cc >= '0' && cc <= '9') { | |
| 244 if (result != kInvalidUnicode) | |
| 245 result = result * 10 + cc - '0'; | |
| 246 } else if (cc == ';') { | |
| 247 source.advanceAndASSERT(cc); | |
| 248 appendLegalEntityFor(result, decodedEntity); | |
| 249 return true; | |
| 250 } else { | |
| 251 appendLegalEntityFor(result, decodedEntity); | |
| 252 return true; | |
| 253 } | |
| 254 break; | |
| 255 } | |
| 256 case Named: { | |
| 257 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters
, additionalAllowedCharacter, cc); | |
| 258 } | |
| 259 } | |
| 260 | |
| 261 if (result > UCHAR_MAX_VALUE) | |
| 262 result = kInvalidUnicode; | |
| 263 | |
| 264 consumedCharacters.append(cc); | |
| 265 source.advanceAndASSERT(cc); | |
| 266 } | 161 } |
| 267 ASSERT(source.isEmpty()); | |
| 268 notEnoughCharacters = true; | |
| 269 unconsumeCharacters(source, consumedCharacters); | |
| 270 return false; | |
| 271 } | 162 } |
| 272 | 163 |
| 273 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result) | 164 void HTMLEntityParser::finalizeNamedEntity() |
| 274 { | 165 { |
| 275 if (U_IS_BMP(value)) { | 166 UChar decodedEntity = decodeEntity(m_buffer); |
| 276 UChar character = static_cast<UChar>(value); | 167 m_buffer.clear(); |
| 277 ASSERT(character == value); | 168 m_buffer.append(decodedEntity); |
| 278 result[0] = character; | |
| 279 return 1; | |
| 280 } | |
| 281 | |
| 282 result[0] = U16_LEAD(value); | |
| 283 result[1] = U16_TRAIL(value); | |
| 284 return 2; | |
| 285 } | |
| 286 | |
| 287 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4]) | |
| 288 { | |
| 289 HTMLEntitySearch search; | |
| 290 while (*name) { | |
| 291 search.advance(*name++); | |
| 292 if (!search.isEntityPrefix()) | |
| 293 return 0; | |
| 294 } | |
| 295 search.advance(';'); | |
| 296 if (!search.isEntityPrefix()) | |
| 297 return 0; | |
| 298 | |
| 299 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch
()->firstValue, result); | |
| 300 if (!search.mostRecentMatch()->secondValue) | |
| 301 return numberOfCodePoints; | |
| 302 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch
()->secondValue, result + numberOfCodePoints); | |
| 303 } | 169 } |
| 304 | 170 |
| 305 } // namespace blink | 171 } // namespace blink |
| OLD | NEW |