OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. | 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. |
3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ | 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ |
4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. | 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
5 * | 5 * |
6 * Redistribution and use in source and binary forms, with or without | 6 * Redistribution and use in source and binary forms, with or without |
7 * modification, are permitted provided that the following conditions | 7 * modification, are permitted provided that the following conditions |
8 * are met: | 8 * are met: |
9 * 1. Redistributions of source code must retain the above copyright | 9 * 1. Redistributions of source code must retain the above copyright |
10 * notice, this list of conditions and the following disclaimer. | 10 * notice, this list of conditions and the following disclaimer. |
(...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
104 } | 104 } |
105 | 105 |
106 HTMLTokenizer::~HTMLTokenizer() | 106 HTMLTokenizer::~HTMLTokenizer() |
107 { | 107 { |
108 } | 108 } |
109 | 109 |
110 void HTMLTokenizer::reset() | 110 void HTMLTokenizer::reset() |
111 { | 111 { |
112 m_state = HTMLTokenizer::DataState; | 112 m_state = HTMLTokenizer::DataState; |
113 m_token = 0; | 113 m_token = 0; |
114 m_additionalAllowedCharacter = '\0'; | |
115 } | |
116 | |
117 inline bool HTMLTokenizer::processEntity(SegmentedString& source) | |
118 { | |
119 bool notEnoughCharacters = false; | |
120 DecodedHTMLEntity decodedEntity; | |
121 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters)
; | |
122 if (notEnoughCharacters) | |
123 return false; | |
124 if (!success) { | |
125 ASSERT(decodedEntity.isEmpty()); | |
126 bufferCharacter('&'); | |
127 } else { | |
128 for (unsigned i = 0; i < decodedEntity.length; ++i) | |
129 bufferCharacter(decodedEntity.data[i]); | |
130 } | |
131 return true; | |
132 } | 114 } |
133 | 115 |
134 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) | 116 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) |
135 { | 117 { |
136 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLTok
en::Uninitialized); | 118 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLTok
en::Uninitialized); |
137 source.advanceAndUpdateLineNumber(); | 119 source.advanceAndUpdateLineNumber(); |
138 if (m_token->type() == HTMLToken::Character) | 120 if (m_token->type() == HTMLToken::Character) |
139 return true; | 121 return true; |
140 m_token->beginEndTag(m_bufferedEndTagName); | 122 m_token->beginEndTag(m_bufferedEndTagName); |
141 m_bufferedEndTagName.clear(); | 123 m_bufferedEndTagName.clear(); |
142 m_appropriateEndTagName.clear(); | 124 m_appropriateEndTagName.clear(); |
143 m_temporaryBuffer.clear(); | 125 m_temporaryBuffer.clear(); |
144 return false; | 126 return false; |
145 } | 127 } |
146 | 128 |
147 #define FLUSH_AND_ADVANCE_TO(stateName) \ | 129 #define FLUSH_AND_ADVANCE_TO(stateName) \ |
148 do { \ | 130 do { \ |
149 m_state = HTMLTokenizer::stateName; \ | 131 m_state = HTMLTokenizer::stateName; \ |
150 if (flushBufferedEndTag(source)) \ | 132 if (flushBufferedEndTag(source)) \ |
151 return true; \ | 133 return true; \ |
152 if (source.isEmpty() \ | 134 if (source.isEmpty() \ |
153 || !m_inputStreamPreprocessor.peek(source)) \ | 135 || !m_inputStreamPreprocessor.peek(source)) \ |
154 return haveBufferedCharacterToken(); \ | 136 return haveBufferedCharacterToken(); \ |
155 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ | 137 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ |
156 goto stateName; \ | 138 goto stateName; \ |
157 } while (false) | 139 } while (false) |
158 | 140 |
159 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer:
:State state) | 141 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer:
:State state) |
(...skipping 23 matching lines...) Expand all Loading... |
183 } | 165 } |
184 } | 166 } |
185 | 167 |
186 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) | 168 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) |
187 return haveBufferedCharacterToken(); | 169 return haveBufferedCharacterToken(); |
188 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); | 170 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); |
189 | 171 |
190 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 | 172 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 |
191 switch (m_state) { | 173 switch (m_state) { |
192 HTML_BEGIN_STATE(DataState) { | 174 HTML_BEGIN_STATE(DataState) { |
193 if (cc == '&') | 175 if (cc == '&') { |
| 176 m_returnState = DataState; |
| 177 m_entityParser.reset(); |
194 HTML_ADVANCE_TO(CharacterReferenceInDataState); | 178 HTML_ADVANCE_TO(CharacterReferenceInDataState); |
195 else if (cc == '<') { | 179 } else if (cc == '<') { |
196 if (m_token->type() == HTMLToken::Character) { | 180 if (m_token->type() == HTMLToken::Character) { |
197 // We have a bunch of character tokens queued up that we | 181 // We have a bunch of character tokens queued up that we |
198 // are emitting lazily here. | 182 // are emitting lazily here. |
199 return true; | 183 return true; |
200 } | 184 } |
201 HTML_ADVANCE_TO(TagOpenState); | 185 HTML_ADVANCE_TO(TagOpenState); |
202 } else if (cc == kEndOfFileMarker) | 186 } else if (cc == kEndOfFileMarker) |
203 return emitEndOfFile(source); | 187 return emitEndOfFile(source); |
204 else { | 188 else { |
205 bufferCharacter(cc); | 189 bufferCharacter(cc); |
206 HTML_ADVANCE_TO(DataState); | 190 HTML_ADVANCE_TO(DataState); |
207 } | 191 } |
208 } | 192 } |
209 END_STATE() | 193 END_STATE() |
210 | 194 |
211 HTML_BEGIN_STATE(CharacterReferenceInDataState) { | 195 HTML_BEGIN_STATE(CharacterReferenceInDataState) { |
212 if (!processEntity(source)) | 196 if (!m_entityParser.parse(source)) |
213 return haveBufferedCharacterToken(); | 197 return haveBufferedCharacterToken(); |
| 198 for (const UChar& entityCharacter : m_entityParser.result()) |
| 199 bufferCharacter(entityCharacter); |
| 200 cc = m_inputStreamPreprocessor.nextInputCharacter(); |
| 201 ASSERT(m_returnState == m_returnState); |
214 HTML_SWITCH_TO(DataState); | 202 HTML_SWITCH_TO(DataState); |
215 } | 203 } |
216 END_STATE() | 204 END_STATE() |
217 | 205 |
| 206 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { |
| 207 if (!m_entityParser.parse(source)) |
| 208 return haveBufferedCharacterToken(); |
| 209 for (const UChar& entityCharacter : m_entityParser.result()) |
| 210 m_token->appendToAttributeValue(entityCharacter); |
| 211 cc = m_inputStreamPreprocessor.nextInputCharacter(); |
| 212 |
| 213 if (m_returnState == AttributeValueDoubleQuotedState) |
| 214 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); |
| 215 else if (m_returnState == AttributeValueSingleQuotedState) |
| 216 HTML_SWITCH_TO(AttributeValueSingleQuotedState); |
| 217 else if (m_returnState == AttributeValueUnquotedState) |
| 218 HTML_SWITCH_TO(AttributeValueUnquotedState); |
| 219 else |
| 220 ASSERT_NOT_REACHED(); |
| 221 } |
| 222 END_STATE() |
| 223 |
218 HTML_BEGIN_STATE(RAWTEXTState) { | 224 HTML_BEGIN_STATE(RAWTEXTState) { |
219 if (cc == '<') | 225 if (cc == '<') |
220 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); | 226 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); |
221 else if (cc == kEndOfFileMarker) | 227 else if (cc == kEndOfFileMarker) |
222 return emitEndOfFile(source); | 228 return emitEndOfFile(source); |
223 else { | 229 else { |
224 bufferCharacter(cc); | 230 bufferCharacter(cc); |
225 HTML_ADVANCE_TO(RAWTEXTState); | 231 HTML_ADVANCE_TO(RAWTEXTState); |
226 } | 232 } |
227 } | 233 } |
(...skipping 242 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
470 HTML_ADVANCE_TO(AttributeValueUnquotedState); | 476 HTML_ADVANCE_TO(AttributeValueUnquotedState); |
471 } | 477 } |
472 } | 478 } |
473 END_STATE() | 479 END_STATE() |
474 | 480 |
475 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { | 481 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { |
476 if (cc == '"') { | 482 if (cc == '"') { |
477 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 483 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
478 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); | 484 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); |
479 } else if (cc == '&') { | 485 } else if (cc == '&') { |
480 m_additionalAllowedCharacter = '"'; | 486 m_returnState = AttributeValueDoubleQuotedState; |
| 487 m_entityParser.reset(); |
481 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); | 488 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
482 } else if (cc == kEndOfFileMarker) { | 489 } else if (cc == kEndOfFileMarker) { |
483 parseError(); | 490 parseError(); |
484 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 491 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
485 HTML_RECONSUME_IN(DataState); | 492 HTML_RECONSUME_IN(DataState); |
486 } else { | 493 } else { |
487 m_token->appendToAttributeValue(cc); | 494 m_token->appendToAttributeValue(cc); |
488 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); | 495 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); |
489 } | 496 } |
490 } | 497 } |
491 END_STATE() | 498 END_STATE() |
492 | 499 |
493 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { | 500 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { |
494 if (cc == '\'') { | 501 if (cc == '\'') { |
495 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 502 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
496 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); | 503 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); |
497 } else if (cc == '&') { | 504 } else if (cc == '&') { |
498 m_additionalAllowedCharacter = '\''; | 505 m_returnState = AttributeValueSingleQuotedState; |
| 506 m_entityParser.reset(); |
499 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); | 507 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
500 } else if (cc == kEndOfFileMarker) { | 508 } else if (cc == kEndOfFileMarker) { |
501 parseError(); | 509 parseError(); |
502 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 510 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
503 HTML_RECONSUME_IN(DataState); | 511 HTML_RECONSUME_IN(DataState); |
504 } else { | 512 } else { |
505 m_token->appendToAttributeValue(cc); | 513 m_token->appendToAttributeValue(cc); |
506 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); | 514 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); |
507 } | 515 } |
508 } | 516 } |
509 END_STATE() | 517 END_STATE() |
510 | 518 |
511 HTML_BEGIN_STATE(AttributeValueUnquotedState) { | 519 HTML_BEGIN_STATE(AttributeValueUnquotedState) { |
512 if (isTokenizerWhitespace(cc)) { | 520 if (isTokenizerWhitespace(cc)) { |
513 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 521 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
514 HTML_ADVANCE_TO(BeforeAttributeNameState); | 522 HTML_ADVANCE_TO(BeforeAttributeNameState); |
515 } else if (cc == '&') { | 523 } else if (cc == '&') { |
516 m_additionalAllowedCharacter = '>'; | 524 m_returnState = AttributeValueUnquotedState; |
| 525 m_entityParser.reset(); |
517 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); | 526 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
518 } else if (cc == '>') { | 527 } else if (cc == '>') { |
519 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 528 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
520 return emitAndResumeIn(source, HTMLTokenizer::DataState); | 529 return emitAndResumeIn(source, HTMLTokenizer::DataState); |
521 } else if (cc == kEndOfFileMarker) { | 530 } else if (cc == kEndOfFileMarker) { |
522 parseError(); | 531 parseError(); |
523 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 532 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
524 HTML_RECONSUME_IN(DataState); | 533 HTML_RECONSUME_IN(DataState); |
525 } else { | 534 } else { |
526 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') | 535 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') |
527 parseError(); | 536 parseError(); |
528 m_token->appendToAttributeValue(cc); | 537 m_token->appendToAttributeValue(cc); |
529 HTML_ADVANCE_TO(AttributeValueUnquotedState); | 538 HTML_ADVANCE_TO(AttributeValueUnquotedState); |
530 } | 539 } |
531 } | 540 } |
532 END_STATE() | 541 END_STATE() |
533 | 542 |
534 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { | |
535 bool notEnoughCharacters = false; | |
536 DecodedHTMLEntity decodedEntity; | |
537 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharact
ers, m_additionalAllowedCharacter); | |
538 if (notEnoughCharacters) | |
539 return haveBufferedCharacterToken(); | |
540 if (!success) { | |
541 ASSERT(decodedEntity.isEmpty()); | |
542 m_token->appendToAttributeValue('&'); | |
543 } else { | |
544 for (unsigned i = 0; i < decodedEntity.length; ++i) | |
545 m_token->appendToAttributeValue(decodedEntity.data[i]); | |
546 } | |
547 // We're supposed to switch back to the attribute value state that | |
548 // we were in when we were switched into this state. Rather than | |
549 // keeping track of this explictly, we observe that the previous | |
550 // state can be determined by m_additionalAllowedCharacter. | |
551 if (m_additionalAllowedCharacter == '"') | |
552 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); | |
553 else if (m_additionalAllowedCharacter == '\'') | |
554 HTML_SWITCH_TO(AttributeValueSingleQuotedState); | |
555 else if (m_additionalAllowedCharacter == '>') | |
556 HTML_SWITCH_TO(AttributeValueUnquotedState); | |
557 else | |
558 ASSERT_NOT_REACHED(); | |
559 } | |
560 END_STATE() | |
561 | |
562 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { | 543 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { |
563 if (isTokenizerWhitespace(cc)) | 544 if (isTokenizerWhitespace(cc)) |
564 HTML_ADVANCE_TO(BeforeAttributeNameState); | 545 HTML_ADVANCE_TO(BeforeAttributeNameState); |
565 else if (cc == '/') | 546 else if (cc == '/') |
566 HTML_ADVANCE_TO(SelfClosingStartTagState); | 547 HTML_ADVANCE_TO(SelfClosingStartTagState); |
567 else if (cc == '>') | 548 else if (cc == '>') |
568 return emitAndResumeIn(source, HTMLTokenizer::DataState); | 549 return emitAndResumeIn(source, HTMLTokenizer::DataState); |
569 else if (cc == kEndOfFileMarker) { | 550 else if (cc == kEndOfFileMarker) { |
570 parseError(); | 551 parseError(); |
571 HTML_RECONSUME_IN(DataState); | 552 HTML_RECONSUME_IN(DataState); |
(...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
759 | 740 |
760 return true; | 741 return true; |
761 } | 742 } |
762 | 743 |
763 inline void HTMLTokenizer::parseError() | 744 inline void HTMLTokenizer::parseError() |
764 { | 745 { |
765 notImplemented(); | 746 notImplemented(); |
766 } | 747 } |
767 | 748 |
768 } | 749 } |
OLD | NEW |