| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. | 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. |
| 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ | 3 * Copyright (C) 2009 Torch Mobile, Inc. http://www.torchmobile.com/ |
| 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. | 4 * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
| 5 * | 5 * |
| 6 * Redistribution and use in source and binary forms, with or without | 6 * Redistribution and use in source and binary forms, with or without |
| 7 * modification, are permitted provided that the following conditions | 7 * modification, are permitted provided that the following conditions |
| 8 * are met: | 8 * are met: |
| 9 * 1. Redistributions of source code must retain the above copyright | 9 * 1. Redistributions of source code must retain the above copyright |
| 10 * notice, this list of conditions and the following disclaimer. | 10 * notice, this list of conditions and the following disclaimer. |
| (...skipping 93 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 104 } | 104 } |
| 105 | 105 |
| 106 HTMLTokenizer::~HTMLTokenizer() | 106 HTMLTokenizer::~HTMLTokenizer() |
| 107 { | 107 { |
| 108 } | 108 } |
| 109 | 109 |
| 110 void HTMLTokenizer::reset() | 110 void HTMLTokenizer::reset() |
| 111 { | 111 { |
| 112 m_state = HTMLTokenizer::DataState; | 112 m_state = HTMLTokenizer::DataState; |
| 113 m_token = 0; | 113 m_token = 0; |
| 114 m_additionalAllowedCharacter = '\0'; | |
| 115 } | |
| 116 | |
| 117 inline bool HTMLTokenizer::processEntity(SegmentedString& source) | |
| 118 { | |
| 119 bool notEnoughCharacters = false; | |
| 120 DecodedHTMLEntity decodedEntity; | |
| 121 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters); | |
| 122 if (notEnoughCharacters) | |
| 123 return false; | |
| 124 if (!success) { | |
| 125 ASSERT(decodedEntity.isEmpty()); | |
| 126 bufferCharacter('&'); | |
| 127 } else { | |
| 128 for (unsigned i = 0; i < decodedEntity.length; ++i) | |
| 129 bufferCharacter(decodedEntity.data[i]); | |
| 130 } | |
| 131 return true; | |
| 132 } | 114 } |
| 133 | 115 |
| 134 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) | 116 bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source) |
| 135 { | 117 { |
| 136 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); | 118 ASSERT(m_token->type() == HTMLToken::Character || m_token->type() == HTMLToken::Uninitialized); |
| 137 source.advanceAndUpdateLineNumber(); | 119 source.advanceAndUpdateLineNumber(); |
| 138 if (m_token->type() == HTMLToken::Character) | 120 if (m_token->type() == HTMLToken::Character) |
| 139 return true; | 121 return true; |
| 140 m_token->beginEndTag(m_bufferedEndTagName); | 122 m_token->beginEndTag(m_bufferedEndTagName); |
| 141 m_bufferedEndTagName.clear(); | 123 m_bufferedEndTagName.clear(); |
| 142 m_appropriateEndTagName.clear(); | 124 m_appropriateEndTagName.clear(); |
| 143 m_temporaryBuffer.clear(); | 125 m_temporaryBuffer.clear(); |
| 144 return false; | 126 return false; |
| 145 } | 127 } |
| 146 | 128 |
| 147 #define FLUSH_AND_ADVANCE_TO(stateName) \ | 129 #define FLUSH_AND_ADVANCE_TO(stateName) \ |
| 148 do { \ | 130 do { \ |
| 149 m_state = HTMLTokenizer::stateName; \ | 131 m_state = HTMLTokenizer::stateName; \ |
| 150 if (flushBufferedEndTag(source)) \ | 132 if (flushBufferedEndTag(source)) \ |
| 151 return true; \ | 133 return true; \ |
| 152 if (source.isEmpty() \ | 134 if (source.isEmpty() \ |
| 153 || !m_inputStreamPreprocessor.peek(source)) \ | 135 || !m_inputStreamPreprocessor.peek(source)) \ |
| 154 return haveBufferedCharacterToken(); \ | 136 return haveBufferedCharacterToken(); \ |
| 155 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ | 137 cc = m_inputStreamPreprocessor.nextInputCharacter(); \ |
| 156 goto stateName; \ | 138 goto stateName; \ |
| 157 } while (false) | 139 } while (false) |
| 158 | 140 |
| 159 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) | 141 bool HTMLTokenizer::flushEmitAndResumeIn(SegmentedString& source, HTMLTokenizer::State state) |
| (...skipping 23 matching lines...) Expand all Loading... |
| 183 } | 165 } |
| 184 } | 166 } |
| 185 | 167 |
| 186 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) | 168 if (source.isEmpty() || !m_inputStreamPreprocessor.peek(source)) |
| 187 return haveBufferedCharacterToken(); | 169 return haveBufferedCharacterToken(); |
| 188 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); | 170 UChar cc = m_inputStreamPreprocessor.nextInputCharacter(); |
| 189 | 171 |
| 190 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 | 172 // Source: http://www.whatwg.org/specs/web-apps/current-work/#tokenisation0 |
| 191 switch (m_state) { | 173 switch (m_state) { |
| 192 HTML_BEGIN_STATE(DataState) { | 174 HTML_BEGIN_STATE(DataState) { |
| 193 if (cc == '&') | 175 if (cc == '&') { |
| 176 m_returnState = DataState; |
| 177 m_entityParser.reset(); |
| 194 HTML_ADVANCE_TO(CharacterReferenceInDataState); | 178 HTML_ADVANCE_TO(CharacterReferenceInDataState); |
| 195 else if (cc == '<') { | 179 } else if (cc == '<') { |
| 196 if (m_token->type() == HTMLToken::Character) { | 180 if (m_token->type() == HTMLToken::Character) { |
| 197 // We have a bunch of character tokens queued up that we | 181 // We have a bunch of character tokens queued up that we |
| 198 // are emitting lazily here. | 182 // are emitting lazily here. |
| 199 return true; | 183 return true; |
| 200 } | 184 } |
| 201 HTML_ADVANCE_TO(TagOpenState); | 185 HTML_ADVANCE_TO(TagOpenState); |
| 202 } else if (cc == kEndOfFileMarker) | 186 } else if (cc == kEndOfFileMarker) |
| 203 return emitEndOfFile(source); | 187 return emitEndOfFile(source); |
| 204 else { | 188 else { |
| 205 bufferCharacter(cc); | 189 bufferCharacter(cc); |
| 206 HTML_ADVANCE_TO(DataState); | 190 HTML_ADVANCE_TO(DataState); |
| 207 } | 191 } |
| 208 } | 192 } |
| 209 END_STATE() | 193 END_STATE() |
| 210 | 194 |
| 211 HTML_BEGIN_STATE(CharacterReferenceInDataState) { | 195 HTML_BEGIN_STATE(CharacterReferenceInDataState) { |
| 212 if (!processEntity(source)) | 196 if (!m_entityParser.parse(source)) |
| 213 return haveBufferedCharacterToken(); | 197 return haveBufferedCharacterToken(); |
| 198 for (const UChar& entityCharacter : m_entityParser.result()) |
| 199 bufferCharacter(entityCharacter); |
| 200 cc = m_inputStreamPreprocessor.nextInputCharacter(); |
| 201 ASSERT(m_returnState == m_returnState); |
| 214 HTML_SWITCH_TO(DataState); | 202 HTML_SWITCH_TO(DataState); |
| 215 } | 203 } |
| 216 END_STATE() | 204 END_STATE() |
| 217 | 205 |
| 206 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { |
| 207 if (!m_entityParser.parse(source)) |
| 208 return haveBufferedCharacterToken(); |
| 209 for (const UChar& entityCharacter : m_entityParser.result()) |
| 210 m_token->appendToAttributeValue(entityCharacter); |
| 211 cc = m_inputStreamPreprocessor.nextInputCharacter(); |
| 212 |
| 213 if (m_returnState == AttributeValueDoubleQuotedState) |
| 214 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); |
| 215 else if (m_returnState == AttributeValueSingleQuotedState) |
| 216 HTML_SWITCH_TO(AttributeValueSingleQuotedState); |
| 217 else if (m_returnState == AttributeValueUnquotedState) |
| 218 HTML_SWITCH_TO(AttributeValueUnquotedState); |
| 219 else |
| 220 ASSERT_NOT_REACHED(); |
| 221 } |
| 222 END_STATE() |
| 223 |
| 218 HTML_BEGIN_STATE(RAWTEXTState) { | 224 HTML_BEGIN_STATE(RAWTEXTState) { |
| 219 if (cc == '<') | 225 if (cc == '<') |
| 220 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); | 226 HTML_ADVANCE_TO(RAWTEXTLessThanSignState); |
| 221 else if (cc == kEndOfFileMarker) | 227 else if (cc == kEndOfFileMarker) |
| 222 return emitEndOfFile(source); | 228 return emitEndOfFile(source); |
| 223 else { | 229 else { |
| 224 bufferCharacter(cc); | 230 bufferCharacter(cc); |
| 225 HTML_ADVANCE_TO(RAWTEXTState); | 231 HTML_ADVANCE_TO(RAWTEXTState); |
| 226 } | 232 } |
| 227 } | 233 } |
| (...skipping 242 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 470 HTML_ADVANCE_TO(AttributeValueUnquotedState); | 476 HTML_ADVANCE_TO(AttributeValueUnquotedState); |
| 471 } | 477 } |
| 472 } | 478 } |
| 473 END_STATE() | 479 END_STATE() |
| 474 | 480 |
| 475 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { | 481 HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) { |
| 476 if (cc == '"') { | 482 if (cc == '"') { |
| 477 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 483 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| 478 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); | 484 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); |
| 479 } else if (cc == '&') { | 485 } else if (cc == '&') { |
| 480 m_additionalAllowedCharacter = '"'; | 486 m_returnState = AttributeValueDoubleQuotedState; |
| 487 m_entityParser.reset(); |
| 481 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); | 488 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
| 482 } else if (cc == kEndOfFileMarker) { | 489 } else if (cc == kEndOfFileMarker) { |
| 483 parseError(); | 490 parseError(); |
| 484 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 491 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| 485 HTML_RECONSUME_IN(DataState); | 492 HTML_RECONSUME_IN(DataState); |
| 486 } else { | 493 } else { |
| 487 m_token->appendToAttributeValue(cc); | 494 m_token->appendToAttributeValue(cc); |
| 488 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); | 495 HTML_ADVANCE_TO(AttributeValueDoubleQuotedState); |
| 489 } | 496 } |
| 490 } | 497 } |
| 491 END_STATE() | 498 END_STATE() |
| 492 | 499 |
| 493 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { | 500 HTML_BEGIN_STATE(AttributeValueSingleQuotedState) { |
| 494 if (cc == '\'') { | 501 if (cc == '\'') { |
| 495 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 502 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| 496 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); | 503 HTML_ADVANCE_TO(AfterAttributeValueQuotedState); |
| 497 } else if (cc == '&') { | 504 } else if (cc == '&') { |
| 498 m_additionalAllowedCharacter = '\''; | 505 m_returnState = AttributeValueSingleQuotedState; |
| 506 m_entityParser.reset(); |
| 499 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); | 507 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
| 500 } else if (cc == kEndOfFileMarker) { | 508 } else if (cc == kEndOfFileMarker) { |
| 501 parseError(); | 509 parseError(); |
| 502 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 510 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| 503 HTML_RECONSUME_IN(DataState); | 511 HTML_RECONSUME_IN(DataState); |
| 504 } else { | 512 } else { |
| 505 m_token->appendToAttributeValue(cc); | 513 m_token->appendToAttributeValue(cc); |
| 506 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); | 514 HTML_ADVANCE_TO(AttributeValueSingleQuotedState); |
| 507 } | 515 } |
| 508 } | 516 } |
| 509 END_STATE() | 517 END_STATE() |
| 510 | 518 |
| 511 HTML_BEGIN_STATE(AttributeValueUnquotedState) { | 519 HTML_BEGIN_STATE(AttributeValueUnquotedState) { |
| 512 if (isTokenizerWhitespace(cc)) { | 520 if (isTokenizerWhitespace(cc)) { |
| 513 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 521 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| 514 HTML_ADVANCE_TO(BeforeAttributeNameState); | 522 HTML_ADVANCE_TO(BeforeAttributeNameState); |
| 515 } else if (cc == '&') { | 523 } else if (cc == '&') { |
| 516 m_additionalAllowedCharacter = '>'; | 524 m_returnState = AttributeValueUnquotedState; |
| 525 m_entityParser.reset(); |
| 517 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); | 526 HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState); |
| 518 } else if (cc == '>') { | 527 } else if (cc == '>') { |
| 519 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 528 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| 520 return emitAndResumeIn(source, HTMLTokenizer::DataState); | 529 return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| 521 } else if (cc == kEndOfFileMarker) { | 530 } else if (cc == kEndOfFileMarker) { |
| 522 parseError(); | 531 parseError(); |
| 523 m_token->endAttributeValue(source.numberOfCharactersConsumed()); | 532 m_token->endAttributeValue(source.numberOfCharactersConsumed()); |
| 524 HTML_RECONSUME_IN(DataState); | 533 HTML_RECONSUME_IN(DataState); |
| 525 } else { | 534 } else { |
| 526 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') | 535 if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`') |
| 527 parseError(); | 536 parseError(); |
| 528 m_token->appendToAttributeValue(cc); | 537 m_token->appendToAttributeValue(cc); |
| 529 HTML_ADVANCE_TO(AttributeValueUnquotedState); | 538 HTML_ADVANCE_TO(AttributeValueUnquotedState); |
| 530 } | 539 } |
| 531 } | 540 } |
| 532 END_STATE() | 541 END_STATE() |
| 533 | 542 |
| 534 HTML_BEGIN_STATE(CharacterReferenceInAttributeValueState) { | |
| 535 bool notEnoughCharacters = false; | |
| 536 DecodedHTMLEntity decodedEntity; | |
| 537 bool success = consumeHTMLEntity(source, decodedEntity, notEnoughCharacters, m_additionalAllowedCharacter); | |
| 538 if (notEnoughCharacters) | |
| 539 return haveBufferedCharacterToken(); | |
| 540 if (!success) { | |
| 541 ASSERT(decodedEntity.isEmpty()); | |
| 542 m_token->appendToAttributeValue('&'); | |
| 543 } else { | |
| 544 for (unsigned i = 0; i < decodedEntity.length; ++i) | |
| 545 m_token->appendToAttributeValue(decodedEntity.data[i]); | |
| 546 } | |
| 547 // We're supposed to switch back to the attribute value state that | |
| 548 // we were in when we were switched into this state. Rather than | |
| 549 // keeping track of this explicitly, we observe that the previous | |
| 550 // state can be determined by m_additionalAllowedCharacter. | |
| 551 if (m_additionalAllowedCharacter == '"') | |
| 552 HTML_SWITCH_TO(AttributeValueDoubleQuotedState); | |
| 553 else if (m_additionalAllowedCharacter == '\'') | |
| 554 HTML_SWITCH_TO(AttributeValueSingleQuotedState); | |
| 555 else if (m_additionalAllowedCharacter == '>') | |
| 556 HTML_SWITCH_TO(AttributeValueUnquotedState); | |
| 557 else | |
| 558 ASSERT_NOT_REACHED(); | |
| 559 } | |
| 560 END_STATE() | |
| 561 | |
| 562 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { | 543 HTML_BEGIN_STATE(AfterAttributeValueQuotedState) { |
| 563 if (isTokenizerWhitespace(cc)) | 544 if (isTokenizerWhitespace(cc)) |
| 564 HTML_ADVANCE_TO(BeforeAttributeNameState); | 545 HTML_ADVANCE_TO(BeforeAttributeNameState); |
| 565 else if (cc == '/') | 546 else if (cc == '/') |
| 566 HTML_ADVANCE_TO(SelfClosingStartTagState); | 547 HTML_ADVANCE_TO(SelfClosingStartTagState); |
| 567 else if (cc == '>') | 548 else if (cc == '>') |
| 568 return emitAndResumeIn(source, HTMLTokenizer::DataState); | 549 return emitAndResumeIn(source, HTMLTokenizer::DataState); |
| 569 else if (cc == kEndOfFileMarker) { | 550 else if (cc == kEndOfFileMarker) { |
| 570 parseError(); | 551 parseError(); |
| 571 HTML_RECONSUME_IN(DataState); | 552 HTML_RECONSUME_IN(DataState); |
| (...skipping 187 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 759 | 740 |
| 760 return true; | 741 return true; |
| 761 } | 742 } |
| 762 | 743 |
| 763 inline void HTMLTokenizer::parseError() | 744 inline void HTMLTokenizer::parseError() |
| 764 { | 745 { |
| 765 notImplemented(); | 746 notImplemented(); |
| 766 } | 747 } |
| 767 | 748 |
| 768 } | 749 } |
| OLD | NEW |