Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(65)

Side by Side Diff: sky/engine/core/html/parser/HTMLEntityParser.cpp

Issue 678073002: Parse Sky entities according to the spec (Closed) Base URL: git@github.com:domokit/mojo.git@master
Patch Set: Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 /* 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 2 // Use of this source code is governed by a BSD-style license that can be
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 3 // found in the LICENSE file.
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27 4
28 #include "config.h" 5 #include "config.h"
29 #include "core/html/parser/HTMLEntityParser.h" 6 #include "core/html/parser/HTMLEntityParser.h"
30 7
31 #include "core/html/parser/HTMLEntitySearch.h" 8 #include "wtf/unicode/CharacterNames.h"
32 #include "core/html/parser/HTMLEntityTable.h"
33 #include "wtf/text/StringBuilder.h"
34 9
35 using namespace WTF; 10 using namespace WTF;
36 11
37 namespace blink { 12 namespace blink {
38 13
39 static const UChar windowsLatin1ExtensionArray[32] = { 14 static const UChar32 kInvalidUnicode = -1;
40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 15
41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F 16 static UChar asHexDigit(UChar cc)
42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 17 {
43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F 18 if (cc >= '0' && cc <= '9')
44 }; 19 return cc - '0';
20 if (cc >= 'a' && cc <= 'f')
21 return 10 + cc - 'a';
22 if (cc >= 'A' && cc <= 'F')
23 return 10 + cc - 'A';
24 ASSERT_NOT_REACHED();
25 return 0;
26 }
45 27
46 static bool isAlphaNumeric(UChar cc) 28 static bool isAlphaNumeric(UChar cc)
47 { 29 {
48 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z'); 30 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z');
49 } 31 }
50 32
51 static UChar adjustEntity(UChar32 value)
52 {
53 if ((value & ~0x1F) != 0x0080)
54 return value;
55 return windowsLatin1ExtensionArray[value - 0x80];
56 }
57
58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)
59 {
60 // FIXME: A number of specific entity values generate parse errors.
61 if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) {
62 decodedEntity.append(0xFFFD);
63 return;
64 }
65 if (U_IS_BMP(c)) {
66 decodedEntity.append(adjustEntity(c));
67 return;
68 }
69 decodedEntity.append(c);
70 }
71
72 static const UChar32 kInvalidUnicode = -1;
73
74 static bool isHexDigit(UChar cc) 33 static bool isHexDigit(UChar cc)
75 { 34 {
76 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'); 35 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F');
77 } 36 }
78 37
79 static UChar asHexDigit(UChar cc) 38 static UChar decodeEntity(HTMLEntityParser::OutputBuffer buffer)
80 { 39 {
81 if (cc >= '0' && cc <= '9') 40 if (equalIgnoringNullity(buffer, "&amp"))
82 return cc - '0'; 41 return '&';
83 if (cc >= 'a' && cc <= 'z') 42 if (equalIgnoringNullity(buffer, "&apos"))
84 return 10 + cc - 'a'; 43 return '\'';
85 if (cc >= 'A' && cc <= 'Z') 44 if (equalIgnoringNullity(buffer, "&gt"))
86 return 10 + cc - 'A'; 45 return '>';
87 ASSERT_NOT_REACHED(); 46 if (equalIgnoringNullity(buffer, "&lt"))
88 return 0; 47 return '<';
48 if (equalIgnoringNullity(buffer, "&quot"))
49 return '"';
50 return replacementCharacter;
89 } 51 }
90 52
91 typedef Vector<UChar, 64> ConsumedCharacterBuffer; 53 HTMLEntityParser::HTMLEntityParser()
92
93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer & consumedCharacters)
94 { 54 {
95 if (consumedCharacters.size() == 1)
96 source.push(consumedCharacters[0]);
97 else if (consumedCharacters.size() == 2) {
98 source.push(consumedCharacters[0]);
99 source.push(consumedCharacters[1]);
100 } else
101 source.prepend(SegmentedString(String(consumedCharacters)));
102 } 55 }
103 56
104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc) 57 HTMLEntityParser::~HTMLEntityParser()
105 { 58 {
106 ConsumedCharacterBuffer consumedCharacters; 59 }
107 HTMLEntitySearch entitySearch; 60
61 void HTMLEntityParser::reset()
62 {
63 m_state = Initial;
64 m_result = '\0';
65 m_buffer.clear();
66 m_buffer.append('&');
67 }
68
69 bool HTMLEntityParser::parse(SegmentedString& source)
70 {
108 while (!source.isEmpty()) { 71 while (!source.isEmpty()) {
109 cc = source.currentChar(); 72 UChar cc = source.currentChar();
110 entitySearch.advance(cc); 73 switch (m_state) {
111 if (!entitySearch.isEntityPrefix()) 74 case Initial: {
112 break; 75 if (cc == '#') {
113 consumedCharacters.append(cc); 76 m_state = Numeric;
77 break;
78 }
79 if (isAlphaNumeric(cc)) {
80 m_state = Named;
81 continue;
82 }
83 return true;
84 }
85 case Numeric: {
86 if (cc == 'x' || cc == 'X') {
87 m_state = PossiblyHex;
88 break;
89 }
90 if (cc >= '0' && cc <= '9') {
91 m_state = Decimal;
92 continue;
93 }
94 return true;
95 }
96 case PossiblyHex: {
97 if (isHexDigit(cc)) {
98 m_state = Hex;
99 continue;
100 }
101 return true;
102 }
103 case Hex: {
104 if (isHexDigit(cc)) {
105 if (m_result != kInvalidUnicode)
106 m_result = m_result * 16 + asHexDigit(cc);
107 break;
108 }
109 if (cc == ';') {
110 source.advanceAndASSERT(cc);
111 finalizeNumericEntity();
112 return true;
113 }
114 return true;
115 }
116 case Decimal: {
117 if (cc >= '0' && cc <= '9') {
118 if (m_result != kInvalidUnicode)
119 m_result = m_result * 10 + cc - '0';
120 break;
121 }
122 if (cc == ';') {
123 source.advanceAndASSERT(cc);
124 finalizeNumericEntity();
125 return true;
126 }
127 return true;
128 }
129 case Named: {
130 if (isAlphaNumeric(cc))
131 break;
132 if (cc == ';') {
133 source.advanceAndASSERT(cc);
134 finalizeNamedEntity();
135 return true;
136 }
137 return true;
138 }
139 }
140
141 if (m_result > UCHAR_MAX_VALUE)
142 m_result = kInvalidUnicode;
143
144 m_buffer.append(cc);
114 source.advanceAndASSERT(cc); 145 source.advanceAndASSERT(cc);
115 } 146 }
116 notEnoughCharacters = source.isEmpty(); 147 ASSERT(source.isEmpty());
117 if (notEnoughCharacters) {
118 // We can't decide on an entity because there might be a longer entity
119 // that we could match if we had more data.
120 unconsumeCharacters(source, consumedCharacters);
121 return false;
122 }
123 if (!entitySearch.mostRecentMatch()) {
124 unconsumeCharacters(source, consumedCharacters);
125 return false;
126 }
127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
128 // We've consumed too many characters. We need to walk the
129 // source back to the point at which we had consumed an
130 // actual entity.
131 unconsumeCharacters(source, consumedCharacters);
132 consumedCharacters.clear();
133 const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();
134 const int length = mostRecent->length;
135 const LChar* reference = HTMLEntityTable::entityString(*mostRecent);
136 for (int i = 0; i < length; ++i) {
137 cc = source.currentChar();
138 ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));
139 consumedCharacters.append(cc);
140 source.advanceAndASSERT(cc);
141 ASSERT(!source.isEmpty());
142 }
143 cc = source.currentChar();
144 }
145 if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
146 || !additionalAllowedCharacter
147 || !(isAlphaNumeric(cc) || cc == '=')) {
148 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
149 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)
150 decodedEntity.append(second);
151 return true;
152 }
153 unconsumeCharacters(source, consumedCharacters);
154 return false; 148 return false;
155 } 149 }
156 150
157 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) 151 void HTMLEntityParser::finalizeNumericEntity()
158 { 152 {
159 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); 153 m_buffer.clear();
160 ASSERT(!notEnoughCharacters); 154 if (m_result <= 0 || m_result > 0x10FFFF || (m_result >= 0xD800 && m_result <= 0xDFFF)) {
161 ASSERT(decodedEntity.isEmpty()); 155 m_buffer.append(replacementCharacter);
162 156 } else if (U_IS_BMP(m_result)) {
163 enum EntityState { 157 m_buffer.append(m_result);
164 Initial, 158 } else {
165 Number, 159 m_buffer.append(U16_LEAD(m_result));
166 MaybeHexLowerCaseX, 160 m_buffer.append(U16_TRAIL(m_result));
167 MaybeHexUpperCaseX,
168 Hex,
169 Decimal,
170 Named
171 };
172 EntityState entityState = Initial;
173 UChar32 result = 0;
174 ConsumedCharacterBuffer consumedCharacters;
175
176 while (!source.isEmpty()) {
177 UChar cc = source.currentChar();
178 switch (entityState) {
179 case Initial: {
180 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
181 return false;
182 if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
183 return false;
184 if (cc == '#') {
185 entityState = Number;
186 break;
187 }
188 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
189 entityState = Named;
190 continue;
191 }
192 return false;
193 }
194 case Number: {
195 if (cc == 'x') {
196 entityState = MaybeHexLowerCaseX;
197 break;
198 }
199 if (cc == 'X') {
200 entityState = MaybeHexUpperCaseX;
201 break;
202 }
203 if (cc >= '0' && cc <= '9') {
204 entityState = Decimal;
205 continue;
206 }
207 source.push('#');
208 return false;
209 }
210 case MaybeHexLowerCaseX: {
211 if (isHexDigit(cc)) {
212 entityState = Hex;
213 continue;
214 }
215 source.push('#');
216 source.push('x');
217 return false;
218 }
219 case MaybeHexUpperCaseX: {
220 if (isHexDigit(cc)) {
221 entityState = Hex;
222 continue;
223 }
224 source.push('#');
225 source.push('X');
226 return false;
227 }
228 case Hex: {
229 if (isHexDigit(cc)) {
230 if (result != kInvalidUnicode)
231 result = result * 16 + asHexDigit(cc);
232 } else if (cc == ';') {
233 source.advanceAndASSERT(cc);
234 appendLegalEntityFor(result, decodedEntity);
235 return true;
236 } else {
237 appendLegalEntityFor(result, decodedEntity);
238 return true;
239 }
240 break;
241 }
242 case Decimal: {
243 if (cc >= '0' && cc <= '9') {
244 if (result != kInvalidUnicode)
245 result = result * 10 + cc - '0';
246 } else if (cc == ';') {
247 source.advanceAndASSERT(cc);
248 appendLegalEntityFor(result, decodedEntity);
249 return true;
250 } else {
251 appendLegalEntityFor(result, decodedEntity);
252 return true;
253 }
254 break;
255 }
256 case Named: {
257 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);
258 }
259 }
260
261 if (result > UCHAR_MAX_VALUE)
262 result = kInvalidUnicode;
263
264 consumedCharacters.append(cc);
265 source.advanceAndASSERT(cc);
266 } 161 }
267 ASSERT(source.isEmpty());
268 notEnoughCharacters = true;
269 unconsumeCharacters(source, consumedCharacters);
270 return false;
271 } 162 }
272 163
273 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result) 164 void HTMLEntityParser::finalizeNamedEntity()
274 { 165 {
275 if (U_IS_BMP(value)) { 166 UChar decodedEntity = decodeEntity(m_buffer);
276 UChar character = static_cast<UChar>(value); 167 m_buffer.clear();
277 ASSERT(character == value); 168 m_buffer.append(decodedEntity);
278 result[0] = character;
279 return 1;
280 }
281
282 result[0] = U16_LEAD(value);
283 result[1] = U16_TRAIL(value);
284 return 2;
285 }
286
287 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
288 {
289 HTMLEntitySearch search;
290 while (*name) {
291 search.advance(*name++);
292 if (!search.isEntityPrefix())
293 return 0;
294 }
295 search.advance(';');
296 if (!search.isEntityPrefix())
297 return 0;
298
299 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
300 if (!search.mostRecentMatch()->secondValue)
301 return numberOfCodePoints;
302 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
303 } 169 }
304 170
305 } // namespace blink 171 } // namespace blink
OLDNEW
« no previous file with comments | « sky/engine/core/html/parser/HTMLEntityParser.h ('k') | sky/engine/core/html/parser/HTMLTokenizer.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698