Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1025)

Unified Diff: sky/engine/core/html/parser/HTMLTokenizer.cpp

Issue 678263002: Update tokenizer to match spec (Closed) Base URL: git@github.com:domokit/mojo.git@master
Patch Set: Created 6 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « sky/engine/core/html/parser/HTMLTokenizer.h ('k') | sky/engine/core/html/parser/MarkupTokenizerInlines.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: sky/engine/core/html/parser/HTMLTokenizer.cpp
diff --git a/sky/engine/core/html/parser/HTMLTokenizer.cpp b/sky/engine/core/html/parser/HTMLTokenizer.cpp
index 93e61ba6132ba1f1e5d9e245909343bd8dfe502c..b67d4aa617bf0e113a3a6a50863e991655dace10 100644
--- a/sky/engine/core/html/parser/HTMLTokenizer.cpp
+++ b/sky/engine/core/html/parser/HTMLTokenizer.cpp
@@ -62,33 +62,9 @@ bool AtomicHTMLToken::usesAttributes() const
return m_type == HTMLToken::StartTag || m_type == HTMLToken::EndTag;
}
-static inline UChar toLowerCase(UChar cc)
-{
- ASSERT(isASCIIUpper(cc));
- const int lowerCaseOffset = 0x20;
- return cc + lowerCaseOffset;
-}
-
-static inline bool vectorEqualsString(const Vector<LChar, 32>& vector, const String& string)
-{
- if (vector.size() != string.length())
- return false;
-
- if (!string.length())
- return true;
-
- return equal(string.impl(), vector.data(), vector.size());
-}
-
static inline bool isEndTagBufferingState(HTMLTokenizer::State state)
{
- switch (state) {
- case HTMLTokenizer::RAWTEXTEndTagOpenState:
- case HTMLTokenizer::RAWTEXTEndTagNameState:
- return true;
- default:
- return false;
- }
+ return state == HTMLTokenizer::RawDataEndTagOpenState || state == HTMLTokenizer::RawDataEndTagNameState;
}
#define HTML_BEGIN_STATE(stateName) BEGIN_STATE(HTMLTokenizer, stateName)
@@ -118,8 +94,7 @@ bool HTMLTokenizer::flushBufferedEndTag(SegmentedString& source)
source.advanceAndUpdateLineNumber();
if (m_token->type() == HTMLToken::Character)
return true;
- m_token->beginEndTag(m_bufferedEndTagName);
- m_bufferedEndTagName.clear();
+ m_token->beginEndTag(m_temporaryBuffer);
m_appropriateEndTagName.clear();
m_temporaryBuffer.clear();
return false;
@@ -151,11 +126,10 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
ASSERT(!m_token || m_token == &token || token.type() == HTMLToken::Uninitialized);
m_token = &token;
- if (!m_bufferedEndTagName.isEmpty() && !isEndTagBufferingState(m_state)) {
+ if (!m_temporaryBuffer.isEmpty() && !isEndTagBufferingState(m_state)) {
// FIXME: This should call flushBufferedEndTag().
// We started an end tag during our last iteration.
- m_token->beginEndTag(m_bufferedEndTagName);
- m_bufferedEndTagName.clear();
+ m_token->beginEndTag(m_temporaryBuffer);
m_appropriateEndTagName.clear();
m_temporaryBuffer.clear();
if (m_state == HTMLTokenizer::DataState) {
@@ -182,9 +156,9 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
return true;
}
HTML_ADVANCE_TO(TagOpenState);
- } else if (cc == kEndOfFileMarker)
+ } else if (cc == kEndOfFileMarker) {
return emitEndOfFile(source);
- else {
+ } else {
bufferCharacter(cc);
HTML_ADVANCE_TO(DataState);
}
@@ -220,31 +194,72 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
}
END_STATE()
- HTML_BEGIN_STATE(RAWTEXTState) {
- if (cc == '<')
- HTML_ADVANCE_TO(RAWTEXTLessThanSignState);
- else if (cc == kEndOfFileMarker)
- return emitEndOfFile(source);
- else {
+ HTML_BEGIN_STATE(RawDataState) {
+ if (cc == '<') {
+ HTML_ADVANCE_TO(RawDataLessThanSignState);
+ } else {
bufferCharacter(cc);
- HTML_ADVANCE_TO(RAWTEXTState);
+ HTML_ADVANCE_TO(RawDataState);
+ }
+ }
+ END_STATE()
+
+ HTML_BEGIN_STATE(RawDataLessThanSignState) {
+ if (cc == '/') {
+ m_temporaryBuffer.clear();
+ HTML_ADVANCE_TO(RawDataEndTagOpenState);
+ } else {
+ bufferCharacter('<');
+ HTML_RECONSUME_IN(RawDataState);
+ }
+ }
+ END_STATE()
+
+ HTML_BEGIN_STATE(RawDataEndTagOpenState) {
+ if (isASCIILower(cc)) {
+ m_temporaryBuffer.append(static_cast<LChar>(cc));
+ HTML_ADVANCE_TO(RawDataEndTagNameState);
+ } else {
+ bufferCharacter('<');
+ bufferCharacter('/');
+ HTML_RECONSUME_IN(RawDataState);
+ }
+ }
+ END_STATE()
+
+ HTML_BEGIN_STATE(RawDataEndTagNameState) {
+ if (isASCIILower(cc)) {
+ m_temporaryBuffer.append(static_cast<LChar>(cc));
+ HTML_ADVANCE_TO(RawDataEndTagNameState);
+ } else {
+ if (isTokenizerWhitespace(cc)) {
+ if (isAppropriateEndTag())
+ FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
+ } else if (cc == '/') {
+ if (isAppropriateEndTag())
+ FLUSH_AND_ADVANCE_TO(VoidTagState);
+ } else if (cc == '>') {
+ if (isAppropriateEndTag())
+ return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
+ }
+ bufferCharacter('<');
+ bufferCharacter('/');
+ m_token->appendToCharacter(m_temporaryBuffer);
+ m_temporaryBuffer.clear();
+ HTML_RECONSUME_IN(RawDataState);
}
}
END_STATE()
HTML_BEGIN_STATE(TagOpenState) {
- if (cc == '!')
+ if (cc == '!') {
HTML_ADVANCE_TO(CommentStart1State);
- else if (cc == '/')
+ } else if (cc == '/') {
HTML_ADVANCE_TO(CloseTagState);
- else if (isASCIIUpper(cc)) {
- m_token->beginStartTag(toLowerCase(cc));
- HTML_ADVANCE_TO(TagNameState);
- } else if (isASCIILower(cc)) {
- m_token->beginStartTag(cc);
+ } else if (isTokenizerTagName(cc)) {
+ m_token->beginStartTag(static_cast<LChar>(cc));
HTML_ADVANCE_TO(TagNameState);
} else {
- parseError();
bufferCharacter('<');
HTML_RECONSUME_IN(DataState);
}
@@ -252,13 +267,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
END_STATE()
HTML_BEGIN_STATE(CloseTagState) {
- if (isASCIIUpper(cc)) {
- m_token->beginEndTag(static_cast<LChar>(toLowerCase(cc)));
- m_appropriateEndTagName.clear();
- HTML_ADVANCE_TO(TagNameState);
- } else if (isASCIILower(cc)) {
+ if (isTokenizerTagName(cc)) {
m_token->beginEndTag(static_cast<LChar>(cc));
- m_appropriateEndTagName.clear();
HTML_ADVANCE_TO(TagNameState);
} else if (cc == '>') {
bufferCharacter('<');
@@ -274,18 +284,12 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
END_STATE()
HTML_BEGIN_STATE(TagNameState) {
- if (isTokenizerWhitespace(cc))
+ if (isTokenizerWhitespace(cc)) {
HTML_ADVANCE_TO(BeforeAttributeNameState);
- else if (cc == '/')
- HTML_ADVANCE_TO(SelfClosingStartTagState);
- else if (cc == '>')
+ } else if (cc == '/') {
+ HTML_ADVANCE_TO(VoidTagState);
+ } else if (cc == '>') {
return emitAndResumeIn(source, HTMLTokenizer::DataState);
- else if (isASCIIUpper(cc)) {
- m_token->appendToName(toLowerCase(cc));
- HTML_ADVANCE_TO(TagNameState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- HTML_RECONSUME_IN(DataState);
} else {
m_token->appendToName(cc);
HTML_ADVANCE_TO(TagNameState);
@@ -293,89 +297,14 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
}
END_STATE()
- HTML_BEGIN_STATE(RAWTEXTLessThanSignState) {
- if (cc == '/') {
- m_temporaryBuffer.clear();
- ASSERT(m_bufferedEndTagName.isEmpty());
- HTML_ADVANCE_TO(RAWTEXTEndTagOpenState);
- } else {
- bufferCharacter('<');
- HTML_RECONSUME_IN(RAWTEXTState);
- }
- }
- END_STATE()
-
- HTML_BEGIN_STATE(RAWTEXTEndTagOpenState) {
- if (isASCIIUpper(cc)) {
- m_temporaryBuffer.append(static_cast<LChar>(cc));
- addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
- HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
- } else if (isASCIILower(cc)) {
- m_temporaryBuffer.append(static_cast<LChar>(cc));
- addToPossibleEndTag(static_cast<LChar>(cc));
- HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
- } else {
- bufferCharacter('<');
- bufferCharacter('/');
- HTML_RECONSUME_IN(RAWTEXTState);
- }
- }
- END_STATE()
-
- HTML_BEGIN_STATE(RAWTEXTEndTagNameState) {
- if (isASCIIUpper(cc)) {
- m_temporaryBuffer.append(static_cast<LChar>(cc));
- addToPossibleEndTag(static_cast<LChar>(toLowerCase(cc)));
- HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
- } else if (isASCIILower(cc)) {
- m_temporaryBuffer.append(static_cast<LChar>(cc));
- addToPossibleEndTag(static_cast<LChar>(cc));
- HTML_ADVANCE_TO(RAWTEXTEndTagNameState);
- } else {
- if (isTokenizerWhitespace(cc)) {
- if (isAppropriateEndTag()) {
- m_temporaryBuffer.append(static_cast<LChar>(cc));
- FLUSH_AND_ADVANCE_TO(BeforeAttributeNameState);
- }
- } else if (cc == '/') {
- if (isAppropriateEndTag()) {
- m_temporaryBuffer.append(static_cast<LChar>(cc));
- FLUSH_AND_ADVANCE_TO(SelfClosingStartTagState);
- }
- } else if (cc == '>') {
- if (isAppropriateEndTag()) {
- m_temporaryBuffer.append(static_cast<LChar>(cc));
- return flushEmitAndResumeIn(source, HTMLTokenizer::DataState);
- }
- }
- bufferCharacter('<');
- bufferCharacter('/');
- m_token->appendToCharacter(m_temporaryBuffer);
- m_bufferedEndTagName.clear();
- m_temporaryBuffer.clear();
- HTML_RECONSUME_IN(RAWTEXTState);
- }
- }
- END_STATE()
-
HTML_BEGIN_STATE(BeforeAttributeNameState) {
- if (isTokenizerWhitespace(cc))
+ if (isTokenizerWhitespace(cc)) {
HTML_ADVANCE_TO(BeforeAttributeNameState);
- else if (cc == '/')
- HTML_ADVANCE_TO(SelfClosingStartTagState);
- else if (cc == '>')
+ } else if (cc == '/') {
+ HTML_ADVANCE_TO(VoidTagState);
+ } else if (cc == '>') {
return emitAndResumeIn(source, HTMLTokenizer::DataState);
- else if (isASCIIUpper(cc)) {
- m_token->addNewAttribute();
- m_token->beginAttributeName(source.numberOfCharactersConsumed());
- m_token->appendToAttributeName(toLowerCase(cc));
- HTML_ADVANCE_TO(AttributeNameState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- HTML_RECONSUME_IN(DataState);
} else {
- if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
- parseError();
m_token->addNewAttribute();
m_token->beginAttributeName(source.numberOfCharactersConsumed());
m_token->appendToAttributeName(cc);
@@ -390,23 +319,14 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
HTML_ADVANCE_TO(AfterAttributeNameState);
} else if (cc == '/') {
m_token->endAttributeName(source.numberOfCharactersConsumed());
- HTML_ADVANCE_TO(SelfClosingStartTagState);
+ HTML_ADVANCE_TO(VoidTagState);
} else if (cc == '=') {
m_token->endAttributeName(source.numberOfCharactersConsumed());
HTML_ADVANCE_TO(BeforeAttributeValueState);
} else if (cc == '>') {
m_token->endAttributeName(source.numberOfCharactersConsumed());
return emitAndResumeIn(source, HTMLTokenizer::DataState);
- } else if (isASCIIUpper(cc)) {
- m_token->appendToAttributeName(toLowerCase(cc));
- HTML_ADVANCE_TO(AttributeNameState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- m_token->endAttributeName(source.numberOfCharactersConsumed());
- HTML_RECONSUME_IN(DataState);
} else {
- if (cc == '"' || cc == '\'' || cc == '<' || cc == '=')
- parseError();
m_token->appendToAttributeName(cc);
HTML_ADVANCE_TO(AttributeNameState);
}
@@ -414,25 +334,15 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
END_STATE()
HTML_BEGIN_STATE(AfterAttributeNameState) {
- if (isTokenizerWhitespace(cc))
+ if (isTokenizerWhitespace(cc)) {
HTML_ADVANCE_TO(AfterAttributeNameState);
- else if (cc == '/')
- HTML_ADVANCE_TO(SelfClosingStartTagState);
- else if (cc == '=')
+ } else if (cc == '/') {
+ HTML_ADVANCE_TO(VoidTagState);
+ } else if (cc == '=') {
HTML_ADVANCE_TO(BeforeAttributeValueState);
- else if (cc == '>')
+ } else if (cc == '>') {
return emitAndResumeIn(source, HTMLTokenizer::DataState);
- else if (isASCIIUpper(cc)) {
- m_token->addNewAttribute();
- m_token->beginAttributeName(source.numberOfCharactersConsumed());
- m_token->appendToAttributeName(toLowerCase(cc));
- HTML_ADVANCE_TO(AttributeNameState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- HTML_RECONSUME_IN(DataState);
} else {
- if (cc == '"' || cc == '\'' || cc == '<')
- parseError();
m_token->addNewAttribute();
m_token->beginAttributeName(source.numberOfCharactersConsumed());
m_token->appendToAttributeName(cc);
@@ -454,14 +364,8 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
m_token->beginAttributeValue(source.numberOfCharactersConsumed() + 1);
HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
} else if (cc == '>') {
- parseError();
return emitAndResumeIn(source, HTMLTokenizer::DataState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- HTML_RECONSUME_IN(DataState);
} else {
- if (cc == '<' || cc == '=' || cc == '`')
- parseError();
m_token->beginAttributeValue(source.numberOfCharactersConsumed());
m_token->appendToAttributeValue(cc);
HTML_ADVANCE_TO(AttributeValueUnquotedState);
@@ -472,15 +376,11 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
HTML_BEGIN_STATE(AttributeValueDoubleQuotedState) {
if (cc == '"') {
m_token->endAttributeValue(source.numberOfCharactersConsumed());
- HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
+ HTML_ADVANCE_TO(BeforeAttributeNameState);
} else if (cc == '&') {
m_returnState = AttributeValueDoubleQuotedState;
m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- m_token->endAttributeValue(source.numberOfCharactersConsumed());
- HTML_RECONSUME_IN(DataState);
} else {
m_token->appendToAttributeValue(cc);
HTML_ADVANCE_TO(AttributeValueDoubleQuotedState);
@@ -491,15 +391,11 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
HTML_BEGIN_STATE(AttributeValueSingleQuotedState) {
if (cc == '\'') {
m_token->endAttributeValue(source.numberOfCharactersConsumed());
- HTML_ADVANCE_TO(AfterAttributeValueQuotedState);
+ HTML_ADVANCE_TO(BeforeAttributeNameState);
} else if (cc == '&') {
m_returnState = AttributeValueSingleQuotedState;
m_entityParser.reset();
HTML_ADVANCE_TO(CharacterReferenceInAttributeValueState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- m_token->endAttributeValue(source.numberOfCharactersConsumed());
- HTML_RECONSUME_IN(DataState);
} else {
m_token->appendToAttributeValue(cc);
HTML_ADVANCE_TO(AttributeValueSingleQuotedState);
@@ -518,45 +414,18 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
} else if (cc == '>') {
m_token->endAttributeValue(source.numberOfCharactersConsumed());
return emitAndResumeIn(source, HTMLTokenizer::DataState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- m_token->endAttributeValue(source.numberOfCharactersConsumed());
- HTML_RECONSUME_IN(DataState);
} else {
- if (cc == '"' || cc == '\'' || cc == '<' || cc == '=' || cc == '`')
- parseError();
m_token->appendToAttributeValue(cc);
HTML_ADVANCE_TO(AttributeValueUnquotedState);
}
}
END_STATE()
- HTML_BEGIN_STATE(AfterAttributeValueQuotedState) {
- if (isTokenizerWhitespace(cc))
- HTML_ADVANCE_TO(BeforeAttributeNameState);
- else if (cc == '/')
- HTML_ADVANCE_TO(SelfClosingStartTagState);
- else if (cc == '>')
- return emitAndResumeIn(source, HTMLTokenizer::DataState);
- else if (cc == kEndOfFileMarker) {
- parseError();
- HTML_RECONSUME_IN(DataState);
- } else {
- parseError();
- HTML_RECONSUME_IN(BeforeAttributeNameState);
- }
- }
- END_STATE()
-
- HTML_BEGIN_STATE(SelfClosingStartTagState) {
+ HTML_BEGIN_STATE(VoidTagState) {
if (cc == '>') {
m_token->setSelfClosing();
return emitAndResumeIn(source, HTMLTokenizer::DataState);
- } else if (cc == kEndOfFileMarker) {
- parseError();
- HTML_RECONSUME_IN(DataState);
} else {
- parseError();
HTML_RECONSUME_IN(BeforeAttributeNameState);
}
}
@@ -616,26 +485,15 @@ bool HTMLTokenizer::nextToken(SegmentedString& source, HTMLToken& token)
return false;
}
-inline bool HTMLTokenizer::temporaryBufferIs(const String& expectedString)
-{
- return vectorEqualsString(m_temporaryBuffer, expectedString);
-}
-
-inline void HTMLTokenizer::addToPossibleEndTag(LChar cc)
-{
- ASSERT(isEndTagBufferingState(m_state));
- m_bufferedEndTagName.append(cc);
-}
-
inline bool HTMLTokenizer::isAppropriateEndTag()
{
- if (m_bufferedEndTagName.size() != m_appropriateEndTagName.size())
+ if (m_temporaryBuffer.size() != m_appropriateEndTagName.size())
return false;
- size_t numCharacters = m_bufferedEndTagName.size();
+ size_t numCharacters = m_temporaryBuffer.size();
for (size_t i = 0; i < numCharacters; i++) {
- if (m_bufferedEndTagName[i] != m_appropriateEndTagName[i])
+ if (m_temporaryBuffer[i] != m_appropriateEndTagName[i])
return false;
}
« no previous file with comments | « sky/engine/core/html/parser/HTMLTokenizer.h ('k') | sky/engine/core/html/parser/MarkupTokenizerInlines.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698