Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(409)

Side by Side Diff: sky/engine/core/html/parser/HTMLTokenizer.cpp

Issue 678073002: Parse Sky entities according to the spec (Closed) Base URL: git@github.com:domokit/mojo.git@master
Patch Set: Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « sky/engine/core/html/parser/HTMLTokenizer.h ('k') | sky/tests/parser/entity.html » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved.
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
5 * 5 *
6 * Redistribution and use in source and binary forms, with or without 6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions 7 * modification, are permitted provided that the following conditions
8 * are met: 8 * are met:
9 * 1. Redistributions of source code must retain the above copyright 9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer. 10 * notice, this list of conditions and the following disclaimer.
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after
104 } 104 }
105 105
106 HTMLTokenizer::~HTMLTokenizer() 106 HTMLTokenizer::~HTMLTokenizer()
107 { 107 {
108 } 108 }
109 109
110 void HTMLTokenizer::reset() 110 void HTMLTokenizer::reset()
111 { 111 {
112 m_state = HTMLTokenizer::DataState; 112 m_state = HTMLTokenizer::DataState;
113 m_token = 0; 113 m_token = 0;
114 m_additionalAllowedCharacter = '\0';
115 }
116
117 inline bool HTMLTokenizer::processEntity(SegmentedString& source)
118 {
119 bool notEnoughCharacters = false;
120 DecodedHTMLEntity decodedEntity;
121 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters);
122 if (notEnoughCharacters)
123 return false;
124 if (!success) {
125 ASSERT(decodedEntity.isEmpty());
126 bufferCharacter('&');
127 } else {
128 for (unsigned i = 0; i < decodedEntity.length; ++i)
129 bufferCharacter(decodedEntity.data[i]);
130 }
131 return true;
132 } 114 }
133 115
134 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) 116 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
135 { 117 {
136 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); 118 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized);
137 source.advanceAndUpdateLineNumber(); 119 source.advanceAndUpdateLineNumber();
138 if (m_token->type() == HTMLToken::Character) 120 if (m_token->type() == HTMLToken::Character)
139 return true; 121 return true;
140 m_token->beginEndTag(m_bufferedEndTagName); 122 m_token->beginEndTag(m_bufferedEndTagName);
141 m_bufferedEndTagName.clear(); 123 m_bufferedEndTagName.clear();
142 m_appropriateEndTagName.clear(); 124 m_appropriateEndTagName.clear();
143 m_temporaryBuffer.clear(); 125 m_temporaryBuffer.clear();
144 return false; 126 return false;
145 } 127 }
146 128
147 #define FLUSH_AND_ADVANCE_TO(stateName) \ 129 #define FLUSH_AND_ADVANCE_TO(stateName) \
148 do { \ 130 do { \
149 m_state = HTMLTokenizer::stateName; \ 131 m_state = HTMLTokenizer::stateName; \
150 if (flushBufferedEndTag(source)) \ 132 if (flushBufferedEndTag(source)) \
151 return true; \ 133 return true; \
152 if (source.isEmpty() \ 134 if (source.isEmpty() \
153 || !m_inputStreamPreprocessor.peek(source)) \ 135 || !m_inputStreamPreprocessor.peek(source)) \
154 return haveBufferedCharacterToken(); \ 136 return haveBufferedCharacterToken(); \
155 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ 137 cc = m_inputStreamPreprocessor.nextInputCharacter(); \
156 goto stateName; \ 138 goto stateName; \
157 } while (false) 139 } while (false)
158 140
159 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) 141 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state)
(...skipping 23 matching lines...) Expand all
183 } 165 }
184 } 166 }
185 167
186 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) 168 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source))
187 return haveBufferedCharacterToken(); 169 return haveBufferedCharacterToken();
188 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); 170 UChar cc = m_inputStreamPreprocessor.nextInputCharacter();
189 171
190 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 172 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0
191 switch (m_state) { 173 switch (m_state) {
192 HTML_BEGIN_STATE(DataState) { 174 HTML_BEGIN_STATE(DataState) {
193 if (cc == '&') 175 if (cc == '&') {
176 m_returnState = DataState;
177 m_entityParser.reset();
194 HTML_ADVANCE_TO(CharacterReferenceInDataState); 178 HTML_ADVANCE_TO(CharacterReferenceInDataState);
195 else if (cc == '<') { 179 } else if (cc == '<') {
196 if (m_token->type() == HTMLToken::Character) { 180 if (m_token->type() == HTMLToken::Character) {
197 // We have a bunch of character tokens queued up that we 181 // We have a bunch of character tokens queued up that we
198 // are emitting lazily here. 182 // are emitting lazily here.
199 return true; 183 return true;
200 } 184 }
201 HTML_ADVANCE_TO(TagOpenState); 185 HTML_ADVANCE_TO(TagOpenState);
202 } else if (cc == kEndOfFileMarker) 186 } else if (cc == kEndOfFileMarker)
203 return emitEndOfFile(source); 187 return emitEndOfFile(source);
204 else { 188 else {
205 bufferCharacter(cc); 189 bufferCharacter(cc);
206 HTML_ADVANCE_TO(DataState); 190 HTML_ADVANCE_TO(DataState);
207 } 191 }
208 } 192 }
209 END_STATE() 193 END_STATE()
210 194
211 HTML_BEGIN_STATE(CharacterReferenceInDataState) { 195 HTML_BEGIN_STATE(CharacterReferenceInDataState) {
212 if (!processEntity(source)) 196 if (!m_entityParser.parse(source))
213 return haveBufferedCharacterToken(); 197 return haveBufferedCharacterToken();
198 for (const UChar& entityCharacter : m_entityParser.result())
199 bufferCharacter(entityCharacter);
200 cc = m_inputStreamPreprocessor.nextInputCharacter();
201 ASSERT(m_returnState == m_returnState);
214 HTML_SWITCH_TO(DataState); 202 HTML_SWITCH_TO(DataState);
215 } 203 }
216 END_STATE() 204 END_STATE()
217 205
206 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
207 if (!m_entityParser.parse(source))
208 return haveBufferedCharacterToken();
209 for (const UChar& entityCharacter : m_entityParser.result())
210 m_token->appendToAttributeValue(entityCharacter);
211 cc = m_inputStreamPreprocessor.nextInputCharacter();
212
213 if (m_returnState == AttributeValueDoubleQuotedState)
214 HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
215 else if (m_returnState == AttributeValueSingleQuotedState)
216 HTML_SWITCH_TO(AttributeValueSingleQuotedState);
217 else if (m_returnState == AttributeValueUnquotedState)
218 HTML_SWITCH_TO(AttributeValueUnquotedState);
219 else
220 ASSERT_NOT_REACHED();
221 }
222 END_STATE()
223
218 HTML_BEGIN_STATE(RAWTEXTState) { 224 HTML_BEGIN_STATE(RAWTEXTState) {
219 if (cc == '<') 225 if (cc == '<')
220 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); 226 HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
221 else if (cc == kEndOfFileMarker) 227 else if (cc == kEndOfFileMarker)
222 return emitEndOfFile(source); 228 return emitEndOfFile(source);
223 else { 229 else {
224 bufferCharacter(cc); 230 bufferCharacter(cc);
225 HTML_ADVANCE_TO(RAWTEXTState); 231 HTML_ADVANCE_TO(RAWTEXTState);
226 } 232 }
227 } 233 }
(...skipping 242 matching lines...) Expand 10 before | Expand all | Expand 10 after
470 HTML_ADVANCE_TO(AttributeValueUnquotedState); 476 HTML_ADVANCE_TO(AttributeValueUnquotedState);
471 } 477 }
472 } 478 }
473 END_STATE() 479 END_STATE()
474 480
475 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { 481 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
476 if (cc == '"') { 482 if (cc == '"') {
477 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 483 m_token->endAttributeValue(source.numberOfCharactersConsumed());
478 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 484 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
479 } else if (cc == '&') { 485 } else if (cc == '&') {
480 m_additionalAllowedCharacter = '"'; 486 m_returnState = AttributeValueDoubleQuotedState;
487 m_entityParser.reset();
481 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 488 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
482 } else if (cc == kEndOfFileMarker) { 489 } else if (cc == kEndOfFileMarker) {
483 parseError(); 490 parseError();
484 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 491 m_token->endAttributeValue(source.numberOfCharactersConsumed());
485 HTML_RECONSUME_IN(DataState); 492 HTML_RECONSUME_IN(DataState);
486 } else { 493 } else {
487 m_token->appendToAttributeValue(cc); 494 m_token->appendToAttributeValue(cc);
488 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); 495 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
489 } 496 }
490 } 497 }
491 END_STATE() 498 END_STATE()
492 499
493 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { 500 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
494 if (cc == '\'') { 501 if (cc == '\'') {
495 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 502 m_token->endAttributeValue(source.numberOfCharactersConsumed());
496 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); 503 HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
497 } else if (cc == '&') { 504 } else if (cc == '&') {
498 m_additionalAllowedCharacter = '\''; 505 m_returnState = AttributeValueSingleQuotedState;
506 m_entityParser.reset();
499 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 507 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
500 } else if (cc == kEndOfFileMarker) { 508 } else if (cc == kEndOfFileMarker) {
501 parseError(); 509 parseError();
502 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 510 m_token->endAttributeValue(source.numberOfCharactersConsumed());
503 HTML_RECONSUME_IN(DataState); 511 HTML_RECONSUME_IN(DataState);
504 } else { 512 } else {
505 m_token->appendToAttributeValue(cc); 513 m_token->appendToAttributeValue(cc);
506 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); 514 HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
507 } 515 }
508 } 516 }
509 END_STATE() 517 END_STATE()
510 518
511 HTML_BEGIN_STATE(AttributeValueUnquotedState) { 519 HTML_BEGIN_STATE(AttributeValueUnquotedState) {
512 if (isTokenizerWhitespace(cc)) { 520 if (isTokenizerWhitespace(cc)) {
513 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 521 m_token->endAttributeValue(source.numberOfCharactersConsumed());
514 HTML_ADVANCE_TO(BeforeAttributeNameState); 522 HTML_ADVANCE_TO(BeforeAttributeNameState);
515 } else if (cc == '&') { 523 } else if (cc == '&') {
516 m_additionalAllowedCharacter = '>'; 524 m_returnState = AttributeValueUnquotedState;
525 m_entityParser.reset();
517 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); 526 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
518 } else if (cc == '>') { 527 } else if (cc == '>') {
519 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 528 m_token->endAttributeValue(source.numberOfCharactersConsumed());
520 return emitAndResumeIn(source, HTMLTokenizer::DataState); 529 return emitAndResumeIn(source, HTMLTokenizer::DataState);
521 } else if (cc == kEndOfFileMarker) { 530 } else if (cc == kEndOfFileMarker) {
522 parseError(); 531 parseError();
523 m_token->endAttributeValue(source.numberOfCharactersConsumed()); 532 m_token->endAttributeValue(source.numberOfCharactersConsumed());
524 HTML_RECONSUME_IN(DataState); 533 HTML_RECONSUME_IN(DataState);
525 } else { 534 } else {
526 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') 535 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
527 parseError(); 536 parseError();
528 m_token->appendToAttributeValue(cc); 537 m_token->appendToAttributeValue(cc);
529 HTML_ADVANCE_TO(AttributeValueUnquotedState); 538 HTML_ADVANCE_TO(AttributeValueUnquotedState);
530 } 539 }
531 } 540 }
532 END_STATE() 541 END_STATE()
533 542
534 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) {
535 bool notEnoughCharacters = false;
536 DecodedHTMLEntity decodedEntity;
537 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter);
538 if (notEnoughCharacters)
539 return haveBufferedCharacterToken();
540 if (!success) {
541 ASSERT(decodedEntity.isEmpty());
542 m_token->appendToAttributeValue('&');
543 } else {
544 for (unsigned i = 0; i < decodedEntity.length; ++i)
545 m_token->appendToAttributeValue(decodedEntity.data[i]);
546 }
547 // We're supposed to switch back to the attribute value state that
548 // we were in when we were switched into this state. Rather than
549 // keeping track of this explicitly, we observe that the previous
550 // state can be determined by m_additionalAllowedCharacter.
551 if (m_additionalAllowedCharacter == '"')
552 HTML_SWITCH_TO(AttributeValueDoubleQuotedState);
553 else if (m_additionalAllowedCharacter == '\'')
554 HTML_SWITCH_TO(AttributeValueSingleQuotedState);
555 else if (m_additionalAllowedCharacter == '>')
556 HTML_SWITCH_TO(AttributeValueUnquotedState);
557 else
558 ASSERT_NOT_REACHED();
559 }
560 END_STATE()
561
562 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { 543 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
563 if (isTokenizerWhitespace(cc)) 544 if (isTokenizerWhitespace(cc))
564 HTML_ADVANCE_TO(BeforeAttributeNameState); 545 HTML_ADVANCE_TO(BeforeAttributeNameState);
565 else if (cc == '/') 546 else if (cc == '/')
566 HTML_ADVANCE_TO(SelfClosingStartTagState); 547 HTML_ADVANCE_TO(SelfClosingStartTagState);
567 else if (cc == '>') 548 else if (cc == '>')
568 return emitAndResumeIn(source, HTMLTokenizer::DataState); 549 return emitAndResumeIn(source, HTMLTokenizer::DataState);
569 else if (cc == kEndOfFileMarker) { 550 else if (cc == kEndOfFileMarker) {
570 parseError(); 551 parseError();
571 HTML_RECONSUME_IN(DataState); 552 HTML_RECONSUME_IN(DataState);
(...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after
759 740
760 return true; 741 return true;
761 } 742 }
762 743
763 inline void HTMLTokenizer::parseError() 744 inline void HTMLTokenizer::parseError()
764 { 745 {
765 notImplemented(); 746 notImplemented();
766 } 747 }
767 748
768 } 749 }
OLDNEW
« no previous file with comments | « sky/engine/core/html/parser/HTMLTokenizer.h ('k') | sky/tests/parser/entity.html » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698