Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(328)

Side by Side Diff: sky/engine/core/html/parser/HTMLTokenizer.cpp

Issue 1215103007: Remove remaining HTML elements (Closed) Base URL: git@github.com:domokit/mojo.git@master
Patch Set: Created 5 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
14 *
15 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
16 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
19 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 */
27
28 #include "sky/engine/core/html/parser/HTMLTokenizer.h"
29
30 #include "gen/sky/core/HTMLNames.h"
31 #include "sky/engine/core/html/parser/AtomicHTMLToken.h"
32 #include "sky/engine/core/html/parser/HTMLEntityParser.h"
33 #include "sky/engine/core/html/parser/HTMLParserIdioms.h"
34 #include "sky/engine/core/html/parser/HTMLTreeBuilder.h"
35 #include "sky/engine/core/html/parser/MarkupTokenizerInlines.h"
36 #include "sky/engine/platform/NotImplemented.h"
37 #include "sky/engine/wtf/ASCIICType.h"
38 #include "sky/engine/wtf/text/AtomicString.h"
39 #include "sky/engine/wtf/unicode/Unicode.h"
40
41 // Please don't use DEFINE_STATIC_LOCAL in this file. The HTMLTokenizer is used
42 // from multiple threads and DEFINE_STATIC_LOCAL isn't threadsafe.
43 #undef DEFINE_STATIC_LOCAL
44
45 namespace blink {
46
47 // This has to go in a .cpp file, as the linker doesn't like it being included more than once.
48 // We don't have an HTMLToken.cpp though, so this is the next best place.
49 QualifiedName AtomicHTMLToken::nameForAttribute(const HTMLToken::Attribute& attr ibute) const
50 {
51 return QualifiedName(AtomicString(attribute.name));
52 }
53
54 bool AtomicHTMLToken::usesName() const
55 {
56 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
57 }
58
59 bool AtomicHTMLToken::usesAttributes() const
60 {
61 return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
62 }
63
64 static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
65 {
66 return state == HTMLTokenizer::RawDataEndTagOpenState || state == HTMLTokeni zer::RawDataEndTagNameState;
67 }
68
// Shorthands that bind the generic tokenizer state-machine macros
// (BEGIN_STATE, ADVANCE_TO, SWITCH_TO, RECONSUME_IN) to the HTMLTokenizer
// class so that the state bodies below only have to name the state.
#define HTML_ADVANCE_TO(stateName) ADVANCE_TO(HTMLTokenizer, stateName)
#define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName)
#define HTML_RECONSUME_IN(stateName) RECONSUME_IN(HTMLTokenizer, stateName)
#define HTML_SWITCH_TO(stateName) SWITCH_TO(HTMLTokenizer, stateName)
73
74 HTMLTokenizer::HTMLTokenizer()
75 : m_inputStreamPreprocessor(this)
76 {
77 reset();
78 }
79
80 HTMLTokenizer::~HTMLTokenizer()
81 {
82 }
83
84 void HTMLTokenizer::reset()
85 {
86 m_state = HTMLTokenizer::DataState;
87 m_token = 0;
88 }
89
90 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
91 {
92 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLTok en::Uninitialized);
93 source.advanceAndUpdateLineNumber();
94 if (m_token->type() == HTMLToken::Character)
95 return true;
96 m_token->beginEndTag(m_temporaryBuffer);
97 m_appropriateEndTagName.clear();
98 m_temporaryBuffer.clear();
99 return false;
100 }
101
// Used inside nextToken(): records |stateName| as the current state and
// calls flushBufferedEndTag(). If that reports true, nextToken() returns
// true immediately. Otherwise, when more input can be peeked, the next
// character is loaded into |cc| and control jumps to the |stateName| label;
// when no input is available, nextToken() returns
// haveBufferedCharacterToken().
#define FLUSH_AND_ADVANCE_TO(stateName)                                     \
    do {                                                                    \
        m_state = HTMLTokenizer::stateName;                                 \
        if (flushBufferedEndTag(source))                                    \
            return true;                                                    \
        if (!source.isEmpty() && m_inputStreamPreprocessor.peek(source)) {  \
            cc = m_inputStreamPreprocessor.nextInputCharacter();            \
            goto stateName;                                                 \
        }                                                                   \
        return haveBufferedCharacterToken();                                \
    } while (false)
113
114 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer: :State state)
115 {
116 m_state = state;
117 flushBufferedEndTag(source);
118 return true;
119 }
120
121 bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
122 {
123 // If we have a token in progress, then we're supposed to be called back
124 // with the same token so we can finish it.
125 ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitial ized);
126 m_token = &token;
127
128 if (!m_temporaryBuffer.isEmpty() && !isEndTagBufferingState(m_state)) {
129 // FIXME: This should call flushBufferedEndTag().
130 // We started an end tag during our last iteration.
131 m_token->beginEndTag(m_temporaryBuffer);
132 m_appropriateEndTagName.clear();
133 m_temporaryBuffer.clear();
134 if (m_state == HTMLTokenizer::DataState) {
135 // We're back in the data state, so we must be done with the tag.
136 return true;
137 }
138 }
139
140 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
141 return haveBufferedCharacterToken();
142 UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
143
144 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
145 switch (m_state) {
146 HTML_BEGIN_STATE(DataState) {
147 if (cc == '&') {
148 m_returnState = DataState;
149 m_entityParser.reset();
150 HTML_ADVANCE_TO(CharacterReferenceInDataState);
151 } else if (cc == '<') {
152 if (m_token->type() == HTMLToken::Character) {
153 // We have a bunch of character tokens queued up that we
154 // are emitting lazily here.
155 return true;
156 }
157 HTML_ADVANCE_TO(TagOpenState);
158 } else if (cc == kEndOfFileMarker) {
159 return emitEndOfFile(source);
160 } else {
161 bufferCharacter(cc);
162 HTML_ADVANCE_TO(DataState);
163 }
164 }
165 END_STATE()
166
167 HTML_BEGIN_STATE(CharacterReferenceInDataState) {
168 if (!m_entityParser.parse(source))
169 return haveBufferedCharacterToken();
170 for (const UChar& entityCharacter : m_entityParser.result())
171 bufferCharacter(entityCharacter);
172 cc = m_inputStreamPreprocessor.nextInputCharacter();
173 ASSERT(m_returnState == m_returnState);
174 HTML_SWITCH_TO(DataState);
175 }
176 END_STATE()
177
178 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
179 if (!m_entityParser.parse(source))
180 return haveBufferedCharacterToken();
181 for (const UChar& entityCharacter : m_entityParser.result())
182 m_token->appendToAttributeValue(entityCharacter);
183 cc = m_inputStreamPreprocessor.nextInputCharacter();
184
185 if (m_returnState == AttributeValueDoubleQuotedState)
186 HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
187 else if (m_returnState == AttributeValueSingleQuotedState)
188 HTML_SWITCH_TO(AttributeValueSingleQuotedState);
189 else if (m_returnState == AttributeValueUnquotedState)
190 HTML_SWITCH_TO(AttributeValueUnquotedState);
191 else
192 ASSERT_NOT_REACHED();
193 }
194 END_STATE()
195
196 HTML_BEGIN_STATE(RawDataState) {
197 if (cc == '<') {
198 HTML_ADVANCE_TO(RawDataLessThanSignState);
199 } else {
200 bufferCharacter(cc);
201 HTML_ADVANCE_TO(RawDataState);
202 }
203 }
204 END_STATE()
205
206 HTML_BEGIN_STATE(RawDataLessThanSignState) {
207 if (cc == '/') {
208 m_temporaryBuffer.clear();
209 HTML_ADVANCE_TO(RawDataEndTagOpenState);
210 } else {
211 bufferCharacter('<');
212 HTML_RECONSUME_IN(RawDataState);
213 }
214 }
215 END_STATE()
216
217 HTML_BEGIN_STATE(RawDataEndTagOpenState) {
218 if (isASCIILower(cc)) {
219 m_temporaryBuffer.append(static_cast<LChar>(cc));
220 HTML_ADVANCE_TO(RawDataEndTagNameState);
221 } else {
222 bufferCharacter('<');
223 bufferCharacter('/');
224 HTML_RECONSUME_IN(RawDataState);
225 }
226 }
227 END_STATE()
228
229 HTML_BEGIN_STATE(RawDataEndTagNameState) {
230 if (isASCIILower(cc)) {
231 m_temporaryBuffer.append(static_cast<LChar>(cc));
232 HTML_ADVANCE_TO(RawDataEndTagNameState);
233 } else {
234 if (isTokenizerWhitespace(cc)) {
235 if (isAppropriateEndTag())
236 FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
237 } else if (cc == '/') {
238 if (isAppropriateEndTag())
239 FLUSH_AND_ADVANCE_TO(VoidTagState);
240 } else if (cc == '>') {
241 if (isAppropriateEndTag())
242 return flushEmitAndResumeIn(source, HTMLTokenizer::DataState );
243 }
244 bufferCharacter('<');
245 bufferCharacter('/');
246 m_token->appendToCharacter(m_temporaryBuffer);
247 m_temporaryBuffer.clear();
248 HTML_RECONSUME_IN(RawDataState);
249 }
250 }
251 END_STATE()
252
253 HTML_BEGIN_STATE(TagOpenState) {
254 if (cc == '!') {
255 HTML_ADVANCE_TO(CommentStart1State);
256 } else if (cc == '/') {
257 HTML_ADVANCE_TO(CloseTagState);
258 } else if (isTokenizerTagName(cc)) {
259 m_token->beginStartTag(static_cast<LChar>(cc));
260 HTML_ADVANCE_TO(TagNameState);
261 } else {
262 bufferCharacter('<');
263 HTML_RECONSUME_IN(DataState);
264 }
265 }
266 END_STATE()
267
268 HTML_BEGIN_STATE(CloseTagState) {
269 if (isTokenizerTagName(cc)) {
270 m_token->beginEndTag(static_cast<LChar>(cc));
271 HTML_ADVANCE_TO(TagNameState);
272 } else if (cc == '>') {
273 bufferCharacter('<');
274 bufferCharacter('/');
275 bufferCharacter('>');
276 HTML_ADVANCE_TO(DataState);
277 } else {
278 bufferCharacter('<');
279 bufferCharacter('/');
280 HTML_RECONSUME_IN(DataState);
281 }
282 }
283 END_STATE()
284
285 HTML_BEGIN_STATE(TagNameState) {
286 if (isTokenizerWhitespace(cc)) {
287 HTML_ADVANCE_TO(BeforeAttributeNameState);
288 } else if (cc == '/') {
289 HTML_ADVANCE_TO(VoidTagState);
290 } else if (cc == '>') {
291 return emitAndResumeIn(source, HTMLTokenizer::DataState);
292 } else {
293 m_token->appendToName(cc);
294 HTML_ADVANCE_TO(TagNameState);
295 }
296 }
297 END_STATE()
298
299 HTML_BEGIN_STATE(BeforeAttributeNameState) {
300 if (isTokenizerWhitespace(cc)) {
301 HTML_ADVANCE_TO(BeforeAttributeNameState);
302 } else if (cc == '/') {
303 HTML_ADVANCE_TO(VoidTagState);
304 } else if (cc == '>') {
305 return emitAndResumeIn(source, HTMLTokenizer::DataState);
306 } else {
307 m_token->addNewAttribute();
308 m_token->beginAttributeName(source.numberOfCharactersConsumed());
309 m_token->appendToAttributeName(cc);
310 HTML_ADVANCE_TO(AttributeNameState);
311 }
312 }
313 END_STATE()
314
315 HTML_BEGIN_STATE(AttributeNameState) {
316 if (isTokenizerWhitespace(cc)) {
317 m_token->endAttributeName(source.numberOfCharactersConsumed());
318 HTML_ADVANCE_TO(AfterAttributeNameState);
319 } else if (cc == '/') {
320 m_token->endAttributeName(source.numberOfCharactersConsumed());
321 HTML_ADVANCE_TO(VoidTagState);
322 } else if (cc == '=') {
323 m_token->endAttributeName(source.numberOfCharactersConsumed());
324 HTML_ADVANCE_TO(BeforeAttributeValueState);
325 } else if (cc == '>') {
326 m_token->endAttributeName(source.numberOfCharactersConsumed());
327 return emitAndResumeIn(source, HTMLTokenizer::DataState);
328 } else {
329 m_token->appendToAttributeName(cc);
330 HTML_ADVANCE_TO(AttributeNameState);
331 }
332 }
333 END_STATE()
334
335 HTML_BEGIN_STATE(AfterAttributeNameState) {
336 if (isTokenizerWhitespace(cc)) {
337 HTML_ADVANCE_TO(AfterAttributeNameState);
338 } else if (cc == '/') {
339 HTML_ADVANCE_TO(VoidTagState);
340 } else if (cc == '=') {
341 HTML_ADVANCE_TO(BeforeAttributeValueState);
342 } else if (cc == '>') {
343 return emitAndResumeIn(source, HTMLTokenizer::DataState);
344 } else {
345 m_token->addNewAttribute();
346 m_token->beginAttributeName(source.numberOfCharactersConsumed());
347 m_token->appendToAttributeName(cc);
348 HTML_ADVANCE_TO(AttributeNameState);
349 }
350 }
351 END_STATE()
352
353 HTML_BEGIN_STATE(BeforeAttributeValueState) {
354 if (isTokenizerWhitespace(cc))
355 HTML_ADVANCE_TO(BeforeAttributeValueState);
356 else if (cc == '"') {
357 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1 );
358 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
359 } else if (cc == '&') {
360 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
361 HTML_RECONSUME_IN(AttributeValueUnquotedState);
362 } else if (cc == '\'') {
363 m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1 );
364 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
365 } else if (cc == '>') {
366 return emitAndResumeIn(source, HTMLTokenizer::DataState);
367 } else {
368 m_token->beginAttributeValue(source.numberOfCharactersConsumed());
369 m_token->appendToAttributeValue(cc);
370 HTML_ADVANCE_TO(AttributeValueUnquotedState);
371 }
372 }
373 END_STATE()
374
375 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
376 if (cc == '"') {
377 m_token->endAttributeValue(source.numberOfCharactersConsumed());
378 HTML_ADVANCE_TO(BeforeAttributeNameState);
379 } else if (cc == '&') {
380 m_returnState = AttributeValueDoubleQuotedState;
381 m_entityParser.reset();
382 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
383 } else {
384 m_token->appendToAttributeValue(cc);
385 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
386 }
387 }
388 END_STATE()
389
390 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
391 if (cc == '\'') {
392 m_token->endAttributeValue(source.numberOfCharactersConsumed());
393 HTML_ADVANCE_TO(BeforeAttributeNameState);
394 } else if (cc == '&') {
395 m_returnState = AttributeValueSingleQuotedState;
396 m_entityParser.reset();
397 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
398 } else {
399 m_token->appendToAttributeValue(cc);
400 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
401 }
402 }
403 END_STATE()
404
405 HTML_BEGIN_STATE(AttributeValueUnquotedState) {
406 if (isTokenizerWhitespace(cc)) {
407 m_token->endAttributeValue(source.numberOfCharactersConsumed());
408 HTML_ADVANCE_TO(BeforeAttributeNameState);
409 } else if (cc == '&') {
410 m_returnState = AttributeValueUnquotedState;
411 m_entityParser.reset();
412 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
413 } else if (cc == '>') {
414 m_token->endAttributeValue(source.numberOfCharactersConsumed());
415 return emitAndResumeIn(source, HTMLTokenizer::DataState);
416 } else {
417 m_token->appendToAttributeValue(cc);
418 HTML_ADVANCE_TO(AttributeValueUnquotedState);
419 }
420 }
421 END_STATE()
422
423 HTML_BEGIN_STATE(VoidTagState) {
424 if (cc == '>') {
425 m_token->setSelfClosing();
426 return emitAndResumeIn(source, HTMLTokenizer::DataState);
427 } else {
428 HTML_RECONSUME_IN(BeforeAttributeNameState);
429 }
430 }
431 END_STATE()
432
433 HTML_BEGIN_STATE(CommentStart1State) {
434 if (cc == '-') {
435 HTML_ADVANCE_TO(CommentStart2State);
436 } else {
437 bufferCharacter('<');
438 bufferCharacter('!');
439 HTML_RECONSUME_IN(DataState);
440 }
441 }
442 END_STATE()
443
444 HTML_BEGIN_STATE(CommentStart2State) {
445 if (cc == '-') {
446 HTML_ADVANCE_TO(CommentState);
447 } else {
448 bufferCharacter('<');
449 bufferCharacter('!');
450 bufferCharacter('-');
451 HTML_RECONSUME_IN(DataState);
452 }
453 }
454 END_STATE()
455
456 HTML_BEGIN_STATE(CommentState) {
457 if (cc == '-')
458 HTML_ADVANCE_TO(CommentEnd1State);
459 else
460 HTML_ADVANCE_TO(CommentState);
461 }
462 END_STATE()
463
464 HTML_BEGIN_STATE(CommentEnd1State) {
465 if (cc == '-')
466 HTML_ADVANCE_TO(CommentEnd2State);
467 else
468 HTML_ADVANCE_TO(CommentState);
469 }
470 END_STATE()
471
472 HTML_BEGIN_STATE(CommentEnd2State) {
473 if (cc == '-')
474 HTML_ADVANCE_TO(CommentEnd2State);
475 else if (cc == '>')
476 HTML_ADVANCE_TO(DataState);
477 else
478 HTML_ADVANCE_TO(CommentState);
479 }
480 END_STATE()
481 }
482
483 ASSERT_NOT_REACHED();
484 return false;
485 }
486
487 inline bool HTMLTokenizer::isAppropriateEndTag()
488 {
489 if (m_temporaryBuffer.size() != m_appropriateEndTagName.size())
490 return false;
491
492 size_t numCharacters = m_temporaryBuffer.size();
493
494 for (size_t i = 0; i < numCharacters; i++) {
495 if (m_temporaryBuffer[i] != m_appropriateEndTagName[i])
496 return false;
497 }
498
499 return true;
500 }
501
502 inline void HTMLTokenizer::parseError()
503 {
504 notImplemented();
505 }
506
507 }
OLDNEW
« no previous file with comments | « sky/engine/core/html/parser/HTMLTokenizer.h ('k') | sky/engine/core/html/parser/HTMLTreeBuilder.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698