OLD | NEW |
1 /* | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. | 2 // Use of this source code is governed by a BSD-style license that can be |
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ | 3 // found in the LICENSE file. |
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. | |
5 * | |
6 * Redistribution and use in source and binary forms, with or without | |
7 * modification, are permitted provided that the following conditions | |
8 * are met: | |
9 * 1. Redistributions of source code must retain the above copyright | |
10 * notice, this list of conditions and the following disclaimer. | |
11 * 2. Redistributions in binary form must reproduce the above copyright | |
12 * notice, this list of conditions and the following disclaimer in the | |
13 * documentation and/or other materials provided with the distribution. | |
14 * | |
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY | |
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR | |
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR | |
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, | |
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, | |
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR | |
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY | |
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
26 */ | |
27 | 4 |
28 #include "config.h" | 5 #include "config.h" |
29 #include "core/html/parser/HTMLEntityParser.h" | 6 #include "core/html/parser/HTMLEntityParser.h" |
30 | 7 |
31 #include "core/html/parser/HTMLEntitySearch.h" | 8 #include "wtf/unicode/CharacterNames.h" |
32 #include "core/html/parser/HTMLEntityTable.h" | |
33 #include "wtf/text/StringBuilder.h" | |
34 | 9 |
35 using namespace WTF; | 10 using namespace WTF; |
36 | 11 |
37 namespace blink { | 12 namespace blink { |
38 | 13 |
39 static const UChar windowsLatin1ExtensionArray[32] = { | 14 static const UChar32 kInvalidUnicode = -1; |
40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87 | 15 |
41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F | 16 static UChar asHexDigit(UChar cc) |
42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97 | 17 { |
43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F | 18 if (cc >= '0' && cc <= '9') |
44 }; | 19 return cc - '0'; |
| 20 if (cc >= 'a' && cc <= 'f') |
| 21 return 10 + cc - 'a'; |
| 22 if (cc >= 'A' && cc <= 'F') |
| 23 return 10 + cc - 'A'; |
| 24 ASSERT_NOT_REACHED(); |
| 25 return 0; |
| 26 } |
45 | 27 |
46 static bool isAlphaNumeric(UChar cc) | 28 static bool isAlphaNumeric(UChar cc) |
47 { | 29 { |
48 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' &&
cc <= 'Z'); | 30 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'z') || (cc >= 'A' &&
cc <= 'Z'); |
49 } | 31 } |
50 | 32 |
51 static UChar adjustEntity(UChar32 value) | |
52 { | |
53 if ((value & ~0x1F) != 0x0080) | |
54 return value; | |
55 return windowsLatin1ExtensionArray[value - 0x80]; | |
56 } | |
57 | |
58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity) | |
59 { | |
60 // FIXME: A number of specific entity values generate parse errors. | |
61 if (c <= 0 || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) { | |
62 decodedEntity.append(0xFFFD); | |
63 return; | |
64 } | |
65 if (U_IS_BMP(c)) { | |
66 decodedEntity.append(adjustEntity(c)); | |
67 return; | |
68 } | |
69 decodedEntity.append(c); | |
70 } | |
71 | |
72 static const UChar32 kInvalidUnicode = -1; | |
73 | |
74 static bool isHexDigit(UChar cc) | 33 static bool isHexDigit(UChar cc) |
75 { | 34 { |
76 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' &&
cc <= 'F'); | 35 return (cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' &&
cc <= 'F'); |
77 } | 36 } |
78 | 37 |
79 static UChar asHexDigit(UChar cc) | 38 static UChar decodeEntity(HTMLEntityParser::OutputBuffer buffer) |
80 { | 39 { |
81 if (cc >= '0' && cc <= '9') | 40 if (equalIgnoringNullity(buffer, "&amp")) |
82 return cc - '0'; | 41 return '&'; |
83 if (cc >= 'a' && cc <= 'z') | 42 if (equalIgnoringNullity(buffer, "&apos")) |
84 return 10 + cc - 'a'; | 43 return '\''; |
85 if (cc >= 'A' && cc <= 'Z') | 44 if (equalIgnoringNullity(buffer, "&gt")) |
86 return 10 + cc - 'A'; | 45 return '>'; |
87 ASSERT_NOT_REACHED(); | 46 if (equalIgnoringNullity(buffer, "&lt")) |
88 return 0; | 47 return '<'; |
| 48 if (equalIgnoringNullity(buffer, "&quot")) |
| 49 return '"'; |
| 50 return replacementCharacter; |
89 } | 51 } |
90 | 52 |
91 typedef Vector<UChar, 64> ConsumedCharacterBuffer; | 53 HTMLEntityParser::HTMLEntityParser() |
92 | |
93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer
& consumedCharacters) | |
94 { | 54 { |
95 if (consumedCharacters.size() == 1) | |
96 source.push(consumedCharacters[0]); | |
97 else if (consumedCharacters.size() == 2) { | |
98 source.push(consumedCharacters[0]); | |
99 source.push(consumedCharacters[1]); | |
100 } else | |
101 source.prepend(SegmentedString(String(consumedCharacters))); | |
102 } | 55 } |
103 | 56 |
104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decod
edEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc
) | 57 HTMLEntityParser::~HTMLEntityParser() |
105 { | 58 { |
106 ConsumedCharacterBuffer consumedCharacters; | 59 } |
107 HTMLEntitySearch entitySearch; | 60 |
| 61 void HTMLEntityParser::reset() |
| 62 { |
| 63 m_state = Initial; |
| 64 m_result = '\0'; |
| 65 m_buffer.clear(); |
| 66 m_buffer.append('&'); |
| 67 } |
| 68 |
| 69 bool HTMLEntityParser::parse(SegmentedString& source) |
| 70 { |
108 while (!source.isEmpty()) { | 71 while (!source.isEmpty()) { |
109 cc = source.currentChar(); | 72 UChar cc = source.currentChar(); |
110 entitySearch.advance(cc); | 73 switch (m_state) { |
111 if (!entitySearch.isEntityPrefix()) | 74 case Initial: { |
112 break; | 75 if (cc == '#') { |
113 consumedCharacters.append(cc); | 76 m_state = Numeric; |
| 77 break; |
| 78 } |
| 79 if (isAlphaNumeric(cc)) { |
| 80 m_state = Named; |
| 81 continue; |
| 82 } |
| 83 return true; |
| 84 } |
| 85 case Numeric: { |
| 86 if (cc == 'x' || cc == 'X') { |
| 87 m_state = PossiblyHex; |
| 88 break; |
| 89 } |
| 90 if (cc >= '0' && cc <= '9') { |
| 91 m_state = Decimal; |
| 92 continue; |
| 93 } |
| 94 return true; |
| 95 } |
| 96 case PossiblyHex: { |
| 97 if (isHexDigit(cc)) { |
| 98 m_state = Hex; |
| 99 continue; |
| 100 } |
| 101 return true; |
| 102 } |
| 103 case Hex: { |
| 104 if (isHexDigit(cc)) { |
| 105 if (m_result != kInvalidUnicode) |
| 106 m_result = m_result * 16 + asHexDigit(cc); |
| 107 break; |
| 108 } |
| 109 if (cc == ';') { |
| 110 source.advanceAndASSERT(cc); |
| 111 finalizeNumericEntity(); |
| 112 return true; |
| 113 } |
| 114 return true; |
| 115 } |
| 116 case Decimal: { |
| 117 if (cc >= '0' && cc <= '9') { |
| 118 if (m_result != kInvalidUnicode) |
| 119 m_result = m_result * 10 + cc - '0'; |
| 120 break; |
| 121 } |
| 122 if (cc == ';') { |
| 123 source.advanceAndASSERT(cc); |
| 124 finalizeNumericEntity(); |
| 125 return true; |
| 126 } |
| 127 return true; |
| 128 } |
| 129 case Named: { |
| 130 if (isAlphaNumeric(cc)) |
| 131 break; |
| 132 if (cc == ';') { |
| 133 source.advanceAndASSERT(cc); |
| 134 finalizeNamedEntity(); |
| 135 return true; |
| 136 } |
| 137 return true; |
| 138 } |
| 139 } |
| 140 |
| 141 if (m_result > UCHAR_MAX_VALUE) |
| 142 m_result = kInvalidUnicode; |
| 143 |
| 144 m_buffer.append(cc); |
114 source.advanceAndASSERT(cc); | 145 source.advanceAndASSERT(cc); |
115 } | 146 } |
116 notEnoughCharacters = source.isEmpty(); | 147 ASSERT(source.isEmpty()); |
117 if (notEnoughCharacters) { | |
118 // We can't decide on an entity because there might be a longer entity | |
119 // that we could match if we had more data. | |
120 unconsumeCharacters(source, consumedCharacters); | |
121 return false; | |
122 } | |
123 if (!entitySearch.mostRecentMatch()) { | |
124 unconsumeCharacters(source, consumedCharacters); | |
125 return false; | |
126 } | |
127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength())
{ | |
128 // We've consumed too many characters. We need to walk the | |
129 // source back to the point at which we had consumed an | |
130 // actual entity. | |
131 unconsumeCharacters(source, consumedCharacters); | |
132 consumedCharacters.clear(); | |
133 const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch(); | |
134 const int length = mostRecent->length; | |
135 const LChar* reference = HTMLEntityTable::entityString(*mostRecent); | |
136 for (int i = 0; i < length; ++i) { | |
137 cc = source.currentChar(); | |
138 ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++)); | |
139 consumedCharacters.append(cc); | |
140 source.advanceAndASSERT(cc); | |
141 ASSERT(!source.isEmpty()); | |
142 } | |
143 cc = source.currentChar(); | |
144 } | |
145 if (entitySearch.mostRecentMatch()->lastCharacter() == ';' | |
146 || !additionalAllowedCharacter | |
147 || !(isAlphaNumeric(cc) || cc == '=')) { | |
148 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue); | |
149 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue) | |
150 decodedEntity.append(second); | |
151 return true; | |
152 } | |
153 unconsumeCharacters(source, consumedCharacters); | |
154 return false; | 148 return false; |
155 } | 149 } |
156 | 150 |
157 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity
, bool& notEnoughCharacters, UChar additionalAllowedCharacter) | 151 void HTMLEntityParser::finalizeNumericEntity() |
158 { | 152 { |
159 ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || a
dditionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); | 153 m_buffer.clear(); |
160 ASSERT(!notEnoughCharacters); | 154 if (m_result <= 0 || m_result > 0x10FFFF || (m_result >= 0xD800 && m_result
<= 0xDFFF)) { |
161 ASSERT(decodedEntity.isEmpty()); | 155 m_buffer.append(replacementCharacter); |
162 | 156 } else if (U_IS_BMP(m_result)) { |
163 enum EntityState { | 157 m_buffer.append(m_result); |
164 Initial, | 158 } else { |
165 Number, | 159 m_buffer.append(U16_LEAD(m_result)); |
166 MaybeHexLowerCaseX, | 160 m_buffer.append(U16_TRAIL(m_result)); |
167 MaybeHexUpperCaseX, | |
168 Hex, | |
169 Decimal, | |
170 Named | |
171 }; | |
172 EntityState entityState = Initial; | |
173 UChar32 result = 0; | |
174 ConsumedCharacterBuffer consumedCharacters; | |
175 | |
176 while (!source.isEmpty()) { | |
177 UChar cc = source.currentChar(); | |
178 switch (entityState) { | |
179 case Initial: { | |
180 if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc
== '<' || cc == '&') | |
181 return false; | |
182 if (additionalAllowedCharacter && cc == additionalAllowedCharacter) | |
183 return false; | |
184 if (cc == '#') { | |
185 entityState = Number; | |
186 break; | |
187 } | |
188 if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { | |
189 entityState = Named; | |
190 continue; | |
191 } | |
192 return false; | |
193 } | |
194 case Number: { | |
195 if (cc == 'x') { | |
196 entityState = MaybeHexLowerCaseX; | |
197 break; | |
198 } | |
199 if (cc == 'X') { | |
200 entityState = MaybeHexUpperCaseX; | |
201 break; | |
202 } | |
203 if (cc >= '0' && cc <= '9') { | |
204 entityState = Decimal; | |
205 continue; | |
206 } | |
207 source.push('#'); | |
208 return false; | |
209 } | |
210 case MaybeHexLowerCaseX: { | |
211 if (isHexDigit(cc)) { | |
212 entityState = Hex; | |
213 continue; | |
214 } | |
215 source.push('#'); | |
216 source.push('x'); | |
217 return false; | |
218 } | |
219 case MaybeHexUpperCaseX: { | |
220 if (isHexDigit(cc)) { | |
221 entityState = Hex; | |
222 continue; | |
223 } | |
224 source.push('#'); | |
225 source.push('X'); | |
226 return false; | |
227 } | |
228 case Hex: { | |
229 if (isHexDigit(cc)) { | |
230 if (result != kInvalidUnicode) | |
231 result = result * 16 + asHexDigit(cc); | |
232 } else if (cc == ';') { | |
233 source.advanceAndASSERT(cc); | |
234 appendLegalEntityFor(result, decodedEntity); | |
235 return true; | |
236 } else { | |
237 appendLegalEntityFor(result, decodedEntity); | |
238 return true; | |
239 } | |
240 break; | |
241 } | |
242 case Decimal: { | |
243 if (cc >= '0' && cc <= '9') { | |
244 if (result != kInvalidUnicode) | |
245 result = result * 10 + cc - '0'; | |
246 } else if (cc == ';') { | |
247 source.advanceAndASSERT(cc); | |
248 appendLegalEntityFor(result, decodedEntity); | |
249 return true; | |
250 } else { | |
251 appendLegalEntityFor(result, decodedEntity); | |
252 return true; | |
253 } | |
254 break; | |
255 } | |
256 case Named: { | |
257 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters
, additionalAllowedCharacter, cc); | |
258 } | |
259 } | |
260 | |
261 if (result > UCHAR_MAX_VALUE) | |
262 result = kInvalidUnicode; | |
263 | |
264 consumedCharacters.append(cc); | |
265 source.advanceAndASSERT(cc); | |
266 } | 161 } |
267 ASSERT(source.isEmpty()); | |
268 notEnoughCharacters = true; | |
269 unconsumeCharacters(source, consumedCharacters); | |
270 return false; | |
271 } | 162 } |
272 | 163 |
273 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result) | 164 void HTMLEntityParser::finalizeNamedEntity() |
274 { | 165 { |
275 if (U_IS_BMP(value)) { | 166 UChar decodedEntity = decodeEntity(m_buffer); |
276 UChar character = static_cast<UChar>(value); | 167 m_buffer.clear(); |
277 ASSERT(character == value); | 168 m_buffer.append(decodedEntity); |
278 result[0] = character; | |
279 return 1; | |
280 } | |
281 | |
282 result[0] = U16_LEAD(value); | |
283 result[1] = U16_TRAIL(value); | |
284 return 2; | |
285 } | |
286 | |
287 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4]) | |
288 { | |
289 HTMLEntitySearch search; | |
290 while (*name) { | |
291 search.advance(*name++); | |
292 if (!search.isEntityPrefix()) | |
293 return 0; | |
294 } | |
295 search.advance(';'); | |
296 if (!search.isEntityPrefix()) | |
297 return 0; | |
298 | |
299 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch
()->firstValue, result); | |
300 if (!search.mostRecentMatch()->secondValue) | |
301 return numberOfCodePoints; | |
302 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch
()->secondValue, result + numberOfCodePoints); | |
303 } | 169 } |
304 | 170 |
305 } // namespace blink | 171 } // namespace blink |
OLD | NEW |