| Index: sky/engine/core/html/parser/HTMLTokenizer.cpp
|
| diff --git a/sky/engine/core/html/parser/HTMLTokenizer.cpp b/sky/engine/core/html/parser/HTMLTokenizer.cpp
|
| index 93e61ba6132ba1f1e5d9e245909343bd8dfe502c..b67d4aa617bf0e113a3a6a50863e991655dace10 100644
|
| --- a/sky/engine/core/html/parser/HTMLTokenizer.cpp
|
| +++ b/sky/engine/core/html/parser/HTMLTokenizer.cpp
|
| @@ -62,33 +62,9 @@ bool AtomicHTMLToken::usesAttributes() const
|
| return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
|
| }
|
|
|
| -static inline UChar toLowerCase(UChar cc)
|
| -{
|
| - ASSERT(isASCIIUpper(cc));
|
| - const int lowerCaseOffset = 0x20;
|
| - return cc + lowerCaseOffset;
|
| -}
|
| -
|
| -static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string)
|
| -{
|
| - if (vector.size() != string.length())
|
| - return false;
|
| -
|
| - if (!string.length())
|
| - return true;
|
| -
|
| - return equal(string.impl(), vector.data(), vector.size());
|
| -}
|
| -
|
| static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
|
| {
|
| - switch (state) {
|
| - case HTMLTokenizer::RAWTEXTEndTagOpenState:
|
| - case HTMLTokenizer::RAWTEXTEndTagNameState:
|
| - return true;
|
| - default:
|
| - return false;
|
| - }
|
| + return state == HTMLTokenizer::RawDataEndTagOpenState || state == HTMLTokenizer::RawDataEndTagNameState;
|
| }
|
|
|
| #define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName)
|
| @@ -118,8 +94,7 @@ bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
|
| source.advanceAndUpdateLineNumber();
|
| if (m_token->type() == HTMLToken::Character)
|
| return true;
|
| - m_token->beginEndTag(m_bufferedEndTagName);
|
| - m_bufferedEndTagName.clear();
|
| + m_token->beginEndTag(m_temporaryBuffer);
|
| m_appropriateEndTagName.clear();
|
| m_temporaryBuffer.clear();
|
| return false;
|
| @@ -151,11 +126,10 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
|
| m_token = &token;
|
|
|
| - if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
|
| + if (!m_temporaryBuffer.isEmpty() && !isEndTagBufferingState(m_state)) {
|
| // FIXME: This should call flushBufferedEndTag().
|
| // We started an end tag during our last iteration.
|
| - m_token->beginEndTag(m_bufferedEndTagName);
|
| - m_bufferedEndTagName.clear();
|
| + m_token->beginEndTag(m_temporaryBuffer);
|
| m_appropriateEndTagName.clear();
|
| m_temporaryBuffer.clear();
|
| if (m_state == HTMLTokenizer::DataState) {
|
| @@ -182,9 +156,9 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| return true;
|
| }
|
| HTML_ADVANCE_TO(TagOpenState);
|
| - } else if (cc == kEndOfFileMarker)
|
| + } else if (cc == kEndOfFileMarker) {
|
| return emitEndOfFile(source);
|
| - else {
|
| + } else {
|
| bufferCharacter(cc);
|
| HTML_ADVANCE_TO(DataState);
|
| }
|
| @@ -220,31 +194,72 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| }
|
| END_STATE()
|
|
|
| - HTML_BEGIN_STATE(RAWTEXTState) {
|
| - if (cc == '<')
|
| - HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
|
| - else if (cc == kEndOfFileMarker)
|
| - return emitEndOfFile(source);
|
| - else {
|
| + HTML_BEGIN_STATE(RawDataState) {
|
| + if (cc == '<') {
|
| + HTML_ADVANCE_TO(RawDataLessThanSignState);
|
| + } else {
|
| bufferCharacter(cc);
|
| - HTML_ADVANCE_TO(RAWTEXTState);
|
| + HTML_ADVANCE_TO(RawDataState);
|
| + }
|
| + }
|
| + END_STATE()
|
| +
|
| + HTML_BEGIN_STATE(RawDataLessThanSignState) {
|
| + if (cc == '/') {
|
| + m_temporaryBuffer.clear();
|
| + HTML_ADVANCE_TO(RawDataEndTagOpenState);
|
| + } else {
|
| + bufferCharacter('<');
|
| + HTML_RECONSUME_IN(RawDataState);
|
| + }
|
| + }
|
| + END_STATE()
|
| +
|
| + HTML_BEGIN_STATE(RawDataEndTagOpenState) {
|
| + if (isASCIILower(cc)) {
|
| + m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| + HTML_ADVANCE_TO(RawDataEndTagNameState);
|
| + } else {
|
| + bufferCharacter('<');
|
| + bufferCharacter('/');
|
| + HTML_RECONSUME_IN(RawDataState);
|
| + }
|
| + }
|
| + END_STATE()
|
| +
|
| + HTML_BEGIN_STATE(RawDataEndTagNameState) {
|
| + if (isASCIILower(cc)) {
|
| + m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| + HTML_ADVANCE_TO(RawDataEndTagNameState);
|
| + } else {
|
| + if (isTokenizerWhitespace(cc)) {
|
| + if (isAppropriateEndTag())
|
| + FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
|
| + } else if (cc == '/') {
|
| + if (isAppropriateEndTag())
|
| + FLUSH_AND_ADVANCE_TO(VoidTagState);
|
| + } else if (cc == '>') {
|
| + if (isAppropriateEndTag())
|
| + return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
|
| + }
|
| + bufferCharacter('<');
|
| + bufferCharacter('/');
|
| + m_token->appendToCharacter(m_temporaryBuffer);
|
| + m_temporaryBuffer.clear();
|
| + HTML_RECONSUME_IN(RawDataState);
|
| }
|
| }
|
| END_STATE()
|
|
|
| HTML_BEGIN_STATE(TagOpenState) {
|
| - if (cc == '!')
|
| + if (cc == '!') {
|
| HTML_ADVANCE_TO(CommentStart1State);
|
| - else if (cc == '/')
|
| + } else if (cc == '/') {
|
| HTML_ADVANCE_TO(CloseTagState);
|
| - else if (isASCIIUpper(cc)) {
|
| - m_token->beginStartTag(toLowerCase(cc));
|
| - HTML_ADVANCE_TO(TagNameState);
|
| - } else if (isASCIILower(cc)) {
|
| - m_token->beginStartTag(cc);
|
| + } else if (isTokenizerTagName(cc)) {
|
| + m_token->beginStartTag(static_cast<LChar>(cc));
|
| HTML_ADVANCE_TO(TagNameState);
|
| } else {
|
| - parseError();
|
| bufferCharacter('<');
|
| HTML_RECONSUME_IN(DataState);
|
| }
|
| @@ -252,13 +267,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| END_STATE()
|
|
|
| HTML_BEGIN_STATE(CloseTagState) {
|
| - if (isASCIIUpper(cc)) {
|
| - m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
|
| - m_appropriateEndTagName.clear();
|
| - HTML_ADVANCE_TO(TagNameState);
|
| - } else if (isASCIILower(cc)) {
|
| + if (isTokenizerTagName(cc)) {
|
| m_token->beginEndTag(static_cast<LChar>(cc));
|
| - m_appropriateEndTagName.clear();
|
| HTML_ADVANCE_TO(TagNameState);
|
| } else if (cc == '>') {
|
| bufferCharacter('<');
|
| @@ -274,18 +284,12 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| END_STATE()
|
|
|
| HTML_BEGIN_STATE(TagNameState) {
|
| - if (isTokenizerWhitespace(cc))
|
| + if (isTokenizerWhitespace(cc)) {
|
| HTML_ADVANCE_TO(BeforeAttributeNameState);
|
| - else if (cc == '/')
|
| - HTML_ADVANCE_TO(SelfClosingStartTagState);
|
| - else if (cc == '>')
|
| + } else if (cc == '/') {
|
| + HTML_ADVANCE_TO(VoidTagState);
|
| + } else if (cc == '>') {
|
| return emitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - else if (isASCIIUpper(cc)) {
|
| - m_token->appendToName(toLowerCase(cc));
|
| - HTML_ADVANCE_TO(TagNameState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| m_token->appendToName(cc);
|
| HTML_ADVANCE_TO(TagNameState);
|
| @@ -293,89 +297,14 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| }
|
| END_STATE()
|
|
|
| - HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
|
| - if (cc == '/') {
|
| - m_temporaryBuffer.clear();
|
| - ASSERT(m_bufferedEndTagName.isEmpty());
|
| - HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
|
| - } else {
|
| - bufferCharacter('<');
|
| - HTML_RECONSUME_IN(RAWTEXTState);
|
| - }
|
| - }
|
| - END_STATE()
|
| -
|
| - HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
|
| - if (isASCIIUpper(cc)) {
|
| - m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| - addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
|
| - HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
|
| - } else if (isASCIILower(cc)) {
|
| - m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| - addToPossibleEndTag(static_cast<LChar>(cc));
|
| - HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
|
| - } else {
|
| - bufferCharacter('<');
|
| - bufferCharacter('/');
|
| - HTML_RECONSUME_IN(RAWTEXTState);
|
| - }
|
| - }
|
| - END_STATE()
|
| -
|
| - HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
|
| - if (isASCIIUpper(cc)) {
|
| - m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| - addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
|
| - HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
|
| - } else if (isASCIILower(cc)) {
|
| - m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| - addToPossibleEndTag(static_cast<LChar>(cc));
|
| - HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
|
| - } else {
|
| - if (isTokenizerWhitespace(cc)) {
|
| - if (isAppropriateEndTag()) {
|
| - m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| - FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
|
| - }
|
| - } else if (cc == '/') {
|
| - if (isAppropriateEndTag()) {
|
| - m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| - FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
|
| - }
|
| - } else if (cc == '>') {
|
| - if (isAppropriateEndTag()) {
|
| - m_temporaryBuffer.append(static_cast<LChar>(cc));
|
| - return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - }
|
| - }
|
| - bufferCharacter('<');
|
| - bufferCharacter('/');
|
| - m_token->appendToCharacter(m_temporaryBuffer);
|
| - m_bufferedEndTagName.clear();
|
| - m_temporaryBuffer.clear();
|
| - HTML_RECONSUME_IN(RAWTEXTState);
|
| - }
|
| - }
|
| - END_STATE()
|
| -
|
| HTML_BEGIN_STATE(BeforeAttributeNameState) {
|
| - if (isTokenizerWhitespace(cc))
|
| + if (isTokenizerWhitespace(cc)) {
|
| HTML_ADVANCE_TO(BeforeAttributeNameState);
|
| - else if (cc == '/')
|
| - HTML_ADVANCE_TO(SelfClosingStartTagState);
|
| - else if (cc == '>')
|
| + } else if (cc == '/') {
|
| + HTML_ADVANCE_TO(VoidTagState);
|
| + } else if (cc == '>') {
|
| return emitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - else if (isASCIIUpper(cc)) {
|
| - m_token->addNewAttribute();
|
| - m_token->beginAttributeName(source.numberOfCharactersConsumed());
|
| - m_token->appendToAttributeName(toLowerCase(cc));
|
| - HTML_ADVANCE_TO(AttributeNameState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| - if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
|
| - parseError();
|
| m_token->addNewAttribute();
|
| m_token->beginAttributeName(source.numberOfCharactersConsumed());
|
| m_token->appendToAttributeName(cc);
|
| @@ -390,23 +319,14 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| HTML_ADVANCE_TO(AfterAttributeNameState);
|
| } else if (cc == '/') {
|
| m_token->endAttributeName(source.numberOfCharactersConsumed());
|
| - HTML_ADVANCE_TO(SelfClosingStartTagState);
|
| + HTML_ADVANCE_TO(VoidTagState);
|
| } else if (cc == '=') {
|
| m_token->endAttributeName(source.numberOfCharactersConsumed());
|
| HTML_ADVANCE_TO(BeforeAttributeValueState);
|
| } else if (cc == '>') {
|
| m_token->endAttributeName(source.numberOfCharactersConsumed());
|
| return emitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - } else if (isASCIIUpper(cc)) {
|
| - m_token->appendToAttributeName(toLowerCase(cc));
|
| - HTML_ADVANCE_TO(AttributeNameState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - m_token->endAttributeName(source.numberOfCharactersConsumed());
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| - if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
|
| - parseError();
|
| m_token->appendToAttributeName(cc);
|
| HTML_ADVANCE_TO(AttributeNameState);
|
| }
|
| @@ -414,25 +334,15 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| END_STATE()
|
|
|
| HTML_BEGIN_STATE(AfterAttributeNameState) {
|
| - if (isTokenizerWhitespace(cc))
|
| + if (isTokenizerWhitespace(cc)) {
|
| HTML_ADVANCE_TO(AfterAttributeNameState);
|
| - else if (cc == '/')
|
| - HTML_ADVANCE_TO(SelfClosingStartTagState);
|
| - else if (cc == '=')
|
| + } else if (cc == '/') {
|
| + HTML_ADVANCE_TO(VoidTagState);
|
| + } else if (cc == '=') {
|
| HTML_ADVANCE_TO(BeforeAttributeValueState);
|
| - else if (cc == '>')
|
| + } else if (cc == '>') {
|
| return emitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - else if (isASCIIUpper(cc)) {
|
| - m_token->addNewAttribute();
|
| - m_token->beginAttributeName(source.numberOfCharactersConsumed());
|
| - m_token->appendToAttributeName(toLowerCase(cc));
|
| - HTML_ADVANCE_TO(AttributeNameState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| - if (cc == '"' || cc == '\'' || cc == '<')
|
| - parseError();
|
| m_token->addNewAttribute();
|
| m_token->beginAttributeName(source.numberOfCharactersConsumed());
|
| m_token->appendToAttributeName(cc);
|
| @@ -454,14 +364,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
|
| HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
|
| } else if (cc == '>') {
|
| - parseError();
|
| return emitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| - if (cc == '<' || cc == '=' || cc == '`')
|
| - parseError();
|
| m_token->beginAttributeValue(source.numberOfCharactersConsumed());
|
| m_token->appendToAttributeValue(cc);
|
| HTML_ADVANCE_TO(AttributeValueUnquotedState);
|
| @@ -472,15 +376,11 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
|
| if (cc == '"') {
|
| m_token->endAttributeValue(source.numberOfCharactersConsumed());
|
| - HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
|
| + HTML_ADVANCE_TO(BeforeAttributeNameState);
|
| } else if (cc == '&') {
|
| m_returnState = AttributeValueDoubleQuotedState;
|
| m_entityParser.reset();
|
| HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - m_token->endAttributeValue(source.numberOfCharactersConsumed());
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| m_token->appendToAttributeValue(cc);
|
| HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
|
| @@ -491,15 +391,11 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
|
| if (cc == '\'') {
|
| m_token->endAttributeValue(source.numberOfCharactersConsumed());
|
| - HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
|
| + HTML_ADVANCE_TO(BeforeAttributeNameState);
|
| } else if (cc == '&') {
|
| m_returnState = AttributeValueSingleQuotedState;
|
| m_entityParser.reset();
|
| HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - m_token->endAttributeValue(source.numberOfCharactersConsumed());
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| m_token->appendToAttributeValue(cc);
|
| HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
|
| @@ -518,45 +414,18 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| } else if (cc == '>') {
|
| m_token->endAttributeValue(source.numberOfCharactersConsumed());
|
| return emitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - m_token->endAttributeValue(source.numberOfCharactersConsumed());
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| - if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
|
| - parseError();
|
| m_token->appendToAttributeValue(cc);
|
| HTML_ADVANCE_TO(AttributeValueUnquotedState);
|
| }
|
| }
|
| END_STATE()
|
|
|
| - HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
|
| - if (isTokenizerWhitespace(cc))
|
| - HTML_ADVANCE_TO(BeforeAttributeNameState);
|
| - else if (cc == '/')
|
| - HTML_ADVANCE_TO(SelfClosingStartTagState);
|
| - else if (cc == '>')
|
| - return emitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - HTML_RECONSUME_IN(DataState);
|
| - } else {
|
| - parseError();
|
| - HTML_RECONSUME_IN(BeforeAttributeNameState);
|
| - }
|
| - }
|
| - END_STATE()
|
| -
|
| - HTML_BEGIN_STATE(SelfClosingStartTagState) {
|
| + HTML_BEGIN_STATE(VoidTagState) {
|
| if (cc == '>') {
|
| m_token->setSelfClosing();
|
| return emitAndResumeIn(source, HTMLTokenizer::DataState);
|
| - } else if (cc == kEndOfFileMarker) {
|
| - parseError();
|
| - HTML_RECONSUME_IN(DataState);
|
| } else {
|
| - parseError();
|
| HTML_RECONSUME_IN(BeforeAttributeNameState);
|
| }
|
| }
|
| @@ -616,26 +485,15 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
|
| return false;
|
| }
|
|
|
| -inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
|
| -{
|
| - return vectorEqualsString(m_temporaryBuffer, expectedString);
|
| -}
|
| -
|
| -inline void HTMLTokenizer::addToPossibleEndTag(LChar cc)
|
| -{
|
| - ASSERT(isEndTagBufferingState(m_state));
|
| - m_bufferedEndTagName.append(cc);
|
| -}
|
| -
|
| inline bool HTMLTokenizer::isAppropriateEndTag()
|
| {
|
| - if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
|
| + if (m_temporaryBuffer.size() != m_appropriateEndTagName.size())
|
| return false;
|
|
|
| - size_t numCharacters = m_bufferedEndTagName.size();
|
| + size_t numCharacters = m_temporaryBuffer.size();
|
|
|
| for (size_t i = 0; i < numCharacters; i++) {
|
| - if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
|
| + if (m_temporaryBuffer[i] != m_appropriateEndTagName[i])
|
| return false;
|
| }
|
|
|
|
|