sky/engine/core/html/parser/HTMLEntityParser.cpp - Issue 678073002: Parse Sky entities according to the spec

Side by Side Diff: sky/engine/core/html/parser/HTMLEntityParser.cpp

Issue 678073002: Parse Sky entities according to the spec (Closed) Base URL: git@github.com:domokit/mojo.git@master

Patch Set: Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 /*	1 // Copyright 2014 The Chromium Authors. All rights reserved.

2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.	2 // Use of this source code is governed by a BSD-style license that can be

3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/	3 // found in the LICENSE file.

4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.

5 *

6 * Redistribution and use in source and binary forms, with or without

7 * modification, are permitted provided that the following conditions

8 * are met:

9 * 1. Redistributions of source code must retain the above copyright

10 * notice, this list of conditions and the following disclaimer.

11 * 2. Redistributions in binary form must reproduce the above copyright

12 * notice, this list of conditions and the following disclaimer in the

13 * documentation and/or other materials provided with the distribution.

14 *

15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY

16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE

17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR

18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR

19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,

20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,

21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR

22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY

23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

26 */

27	4

28 #include "config.h"	5 #include "config.h"

29 #include "core/html/parser/HTMLEntityParser.h"	6 #include "core/html/parser/HTMLEntityParser.h"

30	7

31 #include "core/html/parser/HTMLEntitySearch.h"	8 #include "wtf/unicode/CharacterNames.h"

32 #include "core/html/parser/HTMLEntityTable.h"

33 #include "wtf/text/StringBuilder.h"

34	9

35 using namespace WTF;	10 using namespace WTF;

36	11

37 namespace blink {	12 namespace blink {

38	13

39 static const UChar windowsLatin1ExtensionArray[32] = {	14 static const UChar32 kInvalidUnicode = -1;

40 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87	15

41 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F	16 static UChar asHexDigit(UChar cc)

42 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97	17 {

43 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178, // 98-9F	18 if (cc >= '0' && cc <= '9')

44 };	19 return cc - '0';

	20 if (cc >= 'a' && cc <= 'f')

	21 return 10 + cc - 'a';

	22 if (cc >= 'A' && cc <= 'F')

	23 return 10 + cc - 'A';

	24 ASSERT_NOT_REACHED();

	25 return 0;

	26 }

45	27

46 static bool isAlphaNumeric(UChar cc)	28 static bool isAlphaNumeric(UChar cc)

47 {	29 {

48 return (cc >= '0' && cc <= '9') \|\| (cc >= 'a' && cc <= 'z') \|\| (cc >= 'A' && cc <= 'Z');	30 return (cc >= '0' && cc <= '9') \|\| (cc >= 'a' && cc <= 'z') \|\| (cc >= 'A' && cc <= 'Z');

49 }	31 }

50	32

51 static UChar adjustEntity(UChar32 value)

52 {

53 if ((value & ~0x1F) != 0x0080)

54 return value;

55 return windowsLatin1ExtensionArray[value - 0x80];

56 }

57

58 static void appendLegalEntityFor(UChar32 c, DecodedHTMLEntity& decodedEntity)

59 {

60 // FIXME: A number of specific entity values generate parse errors.

61 if (c <= 0 \|\| c > 0x10FFFF \|\| (c >= 0xD800 && c <= 0xDFFF)) {

62 decodedEntity.append(0xFFFD);

63 return;

64 }

65 if (U_IS_BMP(c)) {

66 decodedEntity.append(adjustEntity(c));

67 return;

68 }

69 decodedEntity.append(c);

70 }

71

72 static const UChar32 kInvalidUnicode = -1;

73

74 static bool isHexDigit(UChar cc)	33 static bool isHexDigit(UChar cc)

75 {	34 {

76 return (cc >= '0' && cc <= '9') \|\| (cc >= 'a' && cc <= 'f') \|\| (cc >= 'A' && cc <= 'F');	35 return (cc >= '0' && cc <= '9') \|\| (cc >= 'a' && cc <= 'f') \|\| (cc >= 'A' && cc <= 'F');

77 }	36 }

78	37

79 static UChar asHexDigit(UChar cc)	38 static UChar decodeEntity(HTMLEntityParser::OutputBuffer buffer)

80 {	39 {

81 if (cc >= '0' && cc <= '9')	40 if (equalIgnoringNullity(buffer, "&amp"))

82 return cc - '0';	41 return '&';

83 if (cc >= 'a' && cc <= 'z')	42 if (equalIgnoringNullity(buffer, "&apos"))

84 return 10 + cc - 'a';	43 return '\'';

85 if (cc >= 'A' && cc <= 'Z')	44 if (equalIgnoringNullity(buffer, "&gt"))

86 return 10 + cc - 'A';	45 return '>';

87 ASSERT_NOT_REACHED();	46 if (equalIgnoringNullity(buffer, "&lt"))

88 return 0;	47 return '<';

	48 if (equalIgnoringNullity(buffer, "&quot"))

	49 return '"';

	50 return replacementCharacter;

89 }	51 }

90	52

91 typedef Vector<UChar, 64> ConsumedCharacterBuffer;	53 HTMLEntityParser::HTMLEntityParser()

92

93 static void unconsumeCharacters(SegmentedString& source, ConsumedCharacterBuffer& consumedCharacters)

94 {	54 {

95 if (consumedCharacters.size() == 1)

96 source.push(consumedCharacters[0]);

97 else if (consumedCharacters.size() == 2) {

98 source.push(consumedCharacters[0]);

99 source.push(consumedCharacters[1]);

100 } else

101 source.prepend(SegmentedString(String(consumedCharacters)));

102 }	55 }

103	56

104 static bool consumeNamedEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)	57 HTMLEntityParser::~HTMLEntityParser()

105 {	58 {

106 ConsumedCharacterBuffer consumedCharacters;	59 }

107 HTMLEntitySearch entitySearch;	60

	61 void HTMLEntityParser::reset()

	62 {

	63 m_state = Initial;

	64 m_result = '\0';

	65 m_buffer.clear();

	66 m_buffer.append('&');

	67 }

	68

	69 bool HTMLEntityParser::parse(SegmentedString& source)

	70 {

108 while (!source.isEmpty()) {	71 while (!source.isEmpty()) {

109 cc = source.currentChar();	72 UChar cc = source.currentChar();

110 entitySearch.advance(cc);	73 switch (m_state) {

111 if (!entitySearch.isEntityPrefix())	74 case Initial: {

112 break;	75 if (cc == '#') {

113 consumedCharacters.append(cc);	76 m_state = Numeric;

	77 break;

	78 }

	79 if (isAlphaNumeric(cc)) {

	80 m_state = Named;

	81 continue;

	82 }

	83 return true;

	84 }

	85 case Numeric: {

	86 if (cc == 'x' \|\| cc == 'X') {

	87 m_state = PossiblyHex;

	88 break;

	89 }

	90 if (cc >= '0' && cc <= '9') {

	91 m_state = Decimal;

	92 continue;

	93 }

	94 return true;

	95 }

	96 case PossiblyHex: {

	97 if (isHexDigit(cc)) {

	98 m_state = Hex;

	99 continue;

	100 }

	101 return true;

	102 }

	103 case Hex: {

	104 if (isHexDigit(cc)) {

	105 if (m_result != kInvalidUnicode)

	106 m_result = m_result * 16 + asHexDigit(cc);

	107 break;

	108 }

	109 if (cc == ';') {

	110 source.advanceAndASSERT(cc);

	111 finalizeNumericEntity();

	112 return true;

	113 }

	114 return true;

	115 }

	116 case Decimal: {

	117 if (cc >= '0' && cc <= '9') {

	118 if (m_result != kInvalidUnicode)

	119 m_result = m_result * 10 + cc - '0';

	120 break;

	121 }

	122 if (cc == ';') {

	123 source.advanceAndASSERT(cc);

	124 finalizeNumericEntity();

	125 return true;

	126 }

	127 return true;

	128 }

	129 case Named: {

	130 if (isAlphaNumeric(cc))

	131 break;

	132 if (cc == ';') {

	133 source.advanceAndASSERT(cc);

	134 finalizeNamedEntity();

	135 return true;

	136 }

	137 return true;

	138 }

	139 }

	140

	141 if (m_result > UCHAR_MAX_VALUE)

	142 m_result = kInvalidUnicode;

	143

	144 m_buffer.append(cc);

114 source.advanceAndASSERT(cc);	145 source.advanceAndASSERT(cc);

115 }	146 }

116 notEnoughCharacters = source.isEmpty();	147 ASSERT(source.isEmpty());

117 if (notEnoughCharacters) {

118 // We can't decide on an entity because there might be a longer entity

119 // that we could match if we had more data.

120 unconsumeCharacters(source, consumedCharacters);

121 return false;

122 }

123 if (!entitySearch.mostRecentMatch()) {

124 unconsumeCharacters(source, consumedCharacters);

125 return false;

126 }

127 if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {

128 // We've consumed too many characters. We need to walk the

129 // source back to the point at which we had consumed an

130 // actual entity.

131 unconsumeCharacters(source, consumedCharacters);

132 consumedCharacters.clear();

133 const HTMLEntityTableEntry* mostRecent = entitySearch.mostRecentMatch();

134 const int length = mostRecent->length;

135 const LChar* reference = HTMLEntityTable::entityString(*mostRecent);

136 for (int i = 0; i < length; ++i) {

137 cc = source.currentChar();

138 ASSERT_UNUSED(reference, cc == static_cast<UChar>(*reference++));

139 consumedCharacters.append(cc);

140 source.advanceAndASSERT(cc);

141 ASSERT(!source.isEmpty());

142 }

143 cc = source.currentChar();

144 }

145 if (entitySearch.mostRecentMatch()->lastCharacter() == ';'

146 \|\| !additionalAllowedCharacter

147 \|\| !(isAlphaNumeric(cc) \|\| cc == '=')) {

148 decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);

149 if (UChar32 second = entitySearch.mostRecentMatch()->secondValue)

150 decodedEntity.append(second);

151 return true;

152 }

153 unconsumeCharacters(source, consumedCharacters);

154 return false;	148 return false;

155 }	149 }

156	150

157 bool consumeHTMLEntity(SegmentedString& source, DecodedHTMLEntity& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)	151 void HTMLEntityParser::finalizeNumericEntity()

158 {	152 {

159 ASSERT(!additionalAllowedCharacter \|\| additionalAllowedCharacter == '"' \|\| additionalAllowedCharacter == '\'' \|\| additionalAllowedCharacter == '>');	153 m_buffer.clear();

160 ASSERT(!notEnoughCharacters);	154 if (m_result <= 0 \|\| m_result > 0x10FFFF \|\| (m_result >= 0xD800 && m_result <= 0xDFFF)) {

161 ASSERT(decodedEntity.isEmpty());	155 m_buffer.append(replacementCharacter);

162	156 } else if (U_IS_BMP(m_result)) {

163 enum EntityState {	157 m_buffer.append(m_result);

164 Initial,	158 } else {

165 Number,	159 m_buffer.append(U16_LEAD(m_result));

166 MaybeHexLowerCaseX,	160 m_buffer.append(U16_TRAIL(m_result));

167 MaybeHexUpperCaseX,

168 Hex,

169 Decimal,

170 Named

171 };

172 EntityState entityState = Initial;

173 UChar32 result = 0;

174 ConsumedCharacterBuffer consumedCharacters;

175

176 while (!source.isEmpty()) {

177 UChar cc = source.currentChar();

178 switch (entityState) {

179 case Initial: {

180 if (cc == '\x09' \|\| cc == '\x0A' \|\| cc == '\x0C' \|\| cc == ' ' \|\| cc == '<' \|\| cc == '&')

181 return false;

182 if (additionalAllowedCharacter && cc == additionalAllowedCharacter)

183 return false;

184 if (cc == '#') {

185 entityState = Number;

186 break;

187 }

188 if ((cc >= 'a' && cc <= 'z') \|\| (cc >= 'A' && cc <= 'Z')) {

189 entityState = Named;

190 continue;

191 }

192 return false;

193 }

194 case Number: {

195 if (cc == 'x') {

196 entityState = MaybeHexLowerCaseX;

197 break;

198 }

199 if (cc == 'X') {

200 entityState = MaybeHexUpperCaseX;

201 break;

202 }

203 if (cc >= '0' && cc <= '9') {

204 entityState = Decimal;

205 continue;

206 }

207 source.push('#');

208 return false;

209 }

210 case MaybeHexLowerCaseX: {

211 if (isHexDigit(cc)) {

212 entityState = Hex;

213 continue;

214 }

215 source.push('#');

216 source.push('x');

217 return false;

218 }

219 case MaybeHexUpperCaseX: {

220 if (isHexDigit(cc)) {

221 entityState = Hex;

222 continue;

223 }

224 source.push('#');

225 source.push('X');

226 return false;

227 }

228 case Hex: {

229 if (isHexDigit(cc)) {

230 if (result != kInvalidUnicode)

231 result = result * 16 + asHexDigit(cc);

232 } else if (cc == ';') {

233 source.advanceAndASSERT(cc);

234 appendLegalEntityFor(result, decodedEntity);

235 return true;

236 } else {

237 appendLegalEntityFor(result, decodedEntity);

238 return true;

239 }

240 break;

241 }

242 case Decimal: {

243 if (cc >= '0' && cc <= '9') {

244 if (result != kInvalidUnicode)

245 result = result * 10 + cc - '0';

246 } else if (cc == ';') {

247 source.advanceAndASSERT(cc);

248 appendLegalEntityFor(result, decodedEntity);

249 return true;

250 } else {

251 appendLegalEntityFor(result, decodedEntity);

252 return true;

253 }

254 break;

255 }

256 case Named: {

257 return consumeNamedEntity(source, decodedEntity, notEnoughCharacters, additionalAllowedCharacter, cc);

258 }

259 }

260

261 if (result > UCHAR_MAX_VALUE)

262 result = kInvalidUnicode;

263

264 consumedCharacters.append(cc);

265 source.advanceAndASSERT(cc);

266 }	161 }

267 ASSERT(source.isEmpty());

268 notEnoughCharacters = true;

269 unconsumeCharacters(source, consumedCharacters);

270 return false;

271 }	162 }

272	163

273 static size_t appendUChar32ToUCharArray(UChar32 value, UChar* result)	164 void HTMLEntityParser::finalizeNamedEntity()

274 {	165 {

275 if (U_IS_BMP(value)) {	166 UChar decodedEntity = decodeEntity(m_buffer);

276 UChar character = static_cast<UChar>(value);	167 m_buffer.clear();

277 ASSERT(character == value);	168 m_buffer.append(decodedEntity);

278 result[0] = character;

279 return 1;

280 }

281

282 result[0] = U16_LEAD(value);

283 result[1] = U16_TRAIL(value);

284 return 2;

285 }

286

287 size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])

288 {

289 HTMLEntitySearch search;

290 while (*name) {

291 search.advance(*name++);

292 if (!search.isEntityPrefix())

293 return 0;

294 }

295 search.advance(';');

296 if (!search.isEntityPrefix())

297 return 0;

298

299 size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);

300 if (!search.mostRecentMatch()->secondValue)

301 return numberOfCodePoints;

302 return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);

303 }	169 }

304	170

305 } // namespace blink	171 } // namespace blink

OLD	NEW

« no previous file with comments | « sky/engine/core/html/parser/HTMLEntityParser.h ('k') | sky/engine/core/html/parser/HTMLTokenizer.h » ('j') | no next file with comments »