third_party/WebKit/WebCore/html/HTMLTokenizer.cpp - Issue 21152: WebKit merge 40668:40722 part 1.

Unified Diff: third_party/WebKit/WebCore/html/HTMLTokenizer.cpp

Issue 21152: WebKit merge 40668:40722 part 1. (Closed) Base URL: svn://chrome-svn/chrome/trunk/src/

Patch Set: Created 11 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/WebKit/WebCore/html/HTMLTokenizer.cpp

===================================================================

--- third_party/WebKit/WebCore/html/HTMLTokenizer.cpp (revision 9310)

+++ third_party/WebKit/WebCore/html/HTMLTokenizer.cpp (working copy)

@@ -1,2045 +1,2045 @@

-/*

- This library is free software; you can redistribute it and/or

- modify it under the terms of the GNU Library General Public

- License as published by the Free Software Foundation; either

- version 2 of the License, or (at your option) any later version.

- This library is distributed in the hope that it will be useful,

- but WITHOUT ANY WARRANTY; without even the implied warranty of

- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

- Library General Public License for more details.

- You should have received a copy of the GNU Library General Public License

- along with this library; see the file COPYING.LIB. If not, write to

- the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,

- Boston, MA 02110-1301, USA.

-*/

-#include "config.h"

-#include "HTMLTokenizer.h"

-#include "CSSHelper.h"

-#include "Cache.h"

-#include "CachedScript.h"

-#include "DocLoader.h"

-#include "DocumentFragment.h"

-#include "EventNames.h"

-#include "Frame.h"

-#include "FrameLoader.h"

-#include "FrameView.h"

-#include "HTMLElement.h"

-#include "HTMLNames.h"

-#include "HTMLParser.h"

-#include "HTMLScriptElement.h"

-#include "HTMLViewSourceDocument.h"

-#include "Page.h"

-#include "PreloadScanner.h"

-#include "ScriptController.h"

-#include "ScriptSourceCode.h"

-#include "ScriptValue.h"

-#include <wtf/ASCIICType.h>

-#include <wtf/CurrentTime.h>

-#include "HTMLEntityNames.c"

-#define PRELOAD_SCANNER_ENABLED 1

-// #define INSTRUMENT_LAYOUT_SCHEDULING 1

-using namespace WTF;

-using namespace std;

-namespace WebCore {

-using namespace HTMLNames;

-#if MOBILE

-// The mobile device needs to be responsive, as such the tokenizer chunk size is reduced.

-// This value is used to define how many characters the tokenizer will process before

-// yeilding control.

-static const int defaultTokenizerChunkSize = 256;

-#else

-static const int defaultTokenizerChunkSize = 4096;

-#endif

-#if MOBILE

-// As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise

-// it will take way to long to load a page.

-static const double defaultTokenizerTimeDelay = 0.300;

-#else

-// FIXME: We would like this constant to be 200ms.

-// Yielding more aggressively results in increased responsiveness and better incremental rendering.

-// It slows down overall page-load on slower machines, though, so for now we set a value of 500.

-static const double defaultTokenizerTimeDelay = 0.500;

-#endif

-static const char commentStart [] = "<!--";

-static const char doctypeStart [] = "<!doctype";

-static const char publicStart [] = "public";

-static const char systemStart [] = "system";

-static const char scriptEnd [] = "</script";

-static const char xmpEnd [] = "</xmp";

-static const char styleEnd [] = "</style";

-static const char textareaEnd [] = "</textarea";

-static const char titleEnd [] = "</title";

-static const char iframeEnd [] = "</iframe";

-// Full support for MS Windows extensions to Latin-1.

-// Technically these extensions should only be activated for pages

-// marked "windows-1252" or "cp1252", but

-// in the standard Microsoft way, these extensions infect hundreds of thousands

-// of web pages. Note that people with non-latin-1 Microsoft extensions

-// are SOL.

-//

-// See: http://www.microsoft.com/globaldev/reference/WinCP.asp

-// http://www.bbsinc.com/iso8859.html

-// http://www.obviously.com/

-//

-// There may be better equivalents

-// We only need this for entities. For non-entity text, we handle this in the text encoding.

-static const UChar windowsLatin1ExtensionArray[32] = {

- 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87

- 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F

- 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97

- 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F

-};

-static inline UChar fixUpChar(UChar c)

- if ((c & ~0x1F) != 0x0080)

- return c;

- return windowsLatin1ExtensionArray[c - 0x80];

-static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)

- for (unsigned i = 0; i != length; ++i) {

- unsigned char c1 = s1[i];

- unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));

- UChar c2 = s2[i];

- if (c1 != c2 && uc1 != c2)

- return false;

- }

- return true;

-inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode)

- if (!attrName.isEmpty()) {

- ASSERT(!attrName.contains('/'));

- RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue);

- if (!attrs) {

- attrs = NamedMappedAttrMap::create();

- attrs->reserveInitialCapacity(10);

- }

- attrs->insertAttribute(a.release(), viewSourceMode);

- }

- attrName = emptyAtom;

-// ----------------------------------------------------------------------------

-HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)

- : Tokenizer()

- , m_buffer(0)

- , m_scriptCode(0)

- , m_scriptCodeSize(0)

- , m_scriptCodeCapacity(0)

- , m_scriptCodeResync(0)

- , m_executingScript(0)

- , m_requestingScript(false)

- , m_hasScriptsWaitingForStylesheets(false)

- , m_timer(this, &HTMLTokenizer::timerFired)

- , m_doc(doc)

- , m_parser(new HTMLParser(doc, reportErrors))

- , m_inWrite(false)

- , m_fragment(false)

- begin();

-HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)

- : Tokenizer(true)

- , m_buffer(0)

- , m_scriptCode(0)

- , m_scriptCodeSize(0)

- , m_scriptCodeCapacity(0)

- , m_scriptCodeResync(0)

- , m_executingScript(0)

- , m_requestingScript(false)

- , m_hasScriptsWaitingForStylesheets(false)

- , m_timer(this, &HTMLTokenizer::timerFired)

- , m_doc(doc)

- , m_parser(0)

- , m_inWrite(false)

- , m_fragment(false)

- begin();

-HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag)

- : m_buffer(0)

- , m_scriptCode(0)

- , m_scriptCodeSize(0)

- , m_scriptCodeCapacity(0)

- , m_scriptCodeResync(0)

- , m_executingScript(0)

- , m_requestingScript(false)

- , m_hasScriptsWaitingForStylesheets(false)

- , m_timer(this, &HTMLTokenizer::timerFired)

- , m_doc(frag->document())

- , m_parser(new HTMLParser(frag))

- , m_inWrite(false)

- , m_fragment(true)

- begin();

-void HTMLTokenizer::reset()

- ASSERT(m_executingScript == 0);

- while (!m_pendingScripts.isEmpty()) {

- CachedScript* cs = m_pendingScripts.first().get();

- m_pendingScripts.removeFirst();

- ASSERT(cache()->disabled() || cs->accessCount() > 0);

- cs->removeClient(this);

- }

- fastFree(m_buffer);

- m_buffer = m_dest = 0;

- m_bufferSize = 0;

- fastFree(m_scriptCode);

- m_scriptCode = 0;

- m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;

- m_timer.stop();

- m_state.setAllowYield(false);

- m_state.setForceSynchronous(false);

- m_currentToken.reset();

- m_doctypeToken.reset();

- m_doctypeSearchCount = 0;

- m_doctypeSecondarySearchCount = 0;

- m_hasScriptsWaitingForStylesheets = false;

-void HTMLTokenizer::begin()

- m_executingScript = 0;

- m_requestingScript = false;

- m_hasScriptsWaitingForStylesheets = false;

- m_state.setLoadingExtScript(false);

- reset();

- m_bufferSize = 254;

- m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));

- m_dest = m_buffer;

- tquote = NoQuote;

- searchCount = 0;

- m_state.setEntityState(NoEntity);

- m_scriptTagSrcAttrValue = String();

- m_pendingSrc.clear();

- m_currentPrependingSrc = 0;

- m_noMoreData = false;

- m_brokenComments = false;

- m_brokenServer = false;

- m_lineNumber = 0;

- m_currentScriptTagStartLineNumber = 0;

- m_currentTagStartLineNumber = 0;

- m_state.setForceSynchronous(false);

- Page* page = m_doc->page();

- if (page && page->hasCustomHTMLTokenizerTimeDelay())

- m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay();

- else

- m_tokenizerTimeDelay = defaultTokenizerTimeDelay;

- if (page && page->hasCustomHTMLTokenizerChunkSize())

- m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize();

- else

- m_tokenizerChunkSize = defaultTokenizerChunkSize;

-void HTMLTokenizer::setForceSynchronous(bool force)

- m_state.setForceSynchronous(force);

-HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)

- // This function adds the listing 'list' as

- // preformatted text-tokens to the token-collection

- while (!list.isEmpty()) {

- if (state.skipLF()) {

- state.setSkipLF(false);

- if (*list == '\n') {

- list.advance();

- continue;

- }

- checkBuffer();

- if (*list == '\n' || *list == '\r') {

- if (state.discardLF())

- // Ignore this LF

- state.setDiscardLF(false); // We have discarded 1 LF

- else

- *m_dest++ = '\n';

- /* Check for MS-DOS CRLF sequence */

- if (*list == '\r')

- state.setSkipLF(true);

- list.advance();

- } else {

- state.setDiscardLF(false);

- *m_dest++ = *list;

- list.advance();

- }

- return state;

-HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString& src, State state)

- ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState());

- ASSERT(!state.hasTagState());

- ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1 );

- if (state.inScript() && !m_currentScriptTagStartLineNumber)

- m_currentScriptTagStartLineNumber = m_lineNumber;

- if (state.inComment())

- state = parseComment(src, state);

- int lastDecodedEntityPosition = -1;

- while (!src.isEmpty()) {

- checkScriptBuffer();

- UChar ch = *src;

- if (!m_scriptCodeResync && !m_brokenComments &&

- !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() &&

- m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' && m_scriptCode[m_scriptCodeSize - 1] == '-' &&

- (lastDecodedEntityPosition < m_scriptCodeSize - 3)) {

- state.setInComment(true);

- state = parseComment(src, state);

- continue;

- }

- if (m_scriptCodeResync && !tquote && ch == '>') {

- src.advancePastNonNewline();

- m_scriptCodeSize = m_scriptCodeResync - 1;

- m_scriptCodeResync = 0;

- m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0;

- if (state.inScript())

- state = scriptHandler(state);

- else {

- state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);

- processToken();

- if (state.inStyle()) {

- m_currentToken.tagName = styleTag.localName();

- m_currentToken.beginTag = false;

- } else if (state.inTextArea()) {

- m_currentToken.tagName = textareaTag.localName();

- m_currentToken.beginTag = false;

- } else if (state.inTitle()) {

- m_currentToken.tagName = titleTag.localName();

- m_currentToken.beginTag = false;

- } else if (state.inXmp()) {

- m_currentToken.tagName = xmpTag.localName();

- m_currentToken.beginTag = false;

- } else if (state.inIFrame()) {

- m_currentToken.tagName = iframeTag.localName();

- m_currentToken.beginTag = false;

- }

- processToken();

- state.setInStyle(false);

- state.setInScript(false);

- state.setInTextArea(false);

- state.setInTitle(false);

- state.setInXmp(false);

- state.setInIFrame(false);

- tquote = NoQuote;

- m_scriptCodeSize = m_scriptCodeResync = 0;

- }

- return state;

- }

- // possible end of tagname, lets check.

- if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&

- m_scriptCodeSize >= m_searchStopperLength &&

- tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&

- (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {

- m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1;

- tquote = NoQuote;

- continue;

- }

- if (m_scriptCodeResync && !state.escaped()) {

- if (ch == '\"')

- tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);

- else if (ch == '\'')

- tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;

- else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))

- tquote = NoQuote;

- }

- state.setEscaped(!state.escaped() && ch == '\\');

- if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') {

- UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;

- src.advancePastNonNewline();

- state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);

- if (scriptCodeDest == m_scriptCode + m_scriptCodeSize)

- lastDecodedEntityPosition = m_scriptCodeSize;

- else

- m_scriptCodeSize = scriptCodeDest - m_scriptCode;

- } else {

- m_scriptCode[m_scriptCodeSize++] = ch;

- src.advance(m_lineNumber);

- }

- return state;

-HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)

- // We are inside a <script>

- bool doScriptExec = false;

- int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenzier line numbers are 0 based

- // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element

- m_currentScriptTagStartLineNumber = 0;

- // (Bugzilla 3837) Scripts following a frameset element should not execute or,

- // in the case of extern scripts, even load.

- bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));

- CachedScript* cs = 0;

- // don't load external scripts for standalone documents (for now)

- if (!inViewSourceMode()) {

- if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {

- // forget what we just got; load from src url instead

- if (!m_parser->skipMode() && !followingFrameset) {

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("Requesting script at time %d\n", m_doc->elapsedTime());

-#endif

- // The parser might have been stopped by for example a window.close call in an earlier script.

- // If so, we don't want to load scripts.

- if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))

- m_pendingScripts.append(cs);

- else

- m_scriptNode = 0;

- } else

- m_scriptNode = 0;

- m_scriptTagSrcAttrValue = String();

- } else {

- // Parse m_scriptCode containing <script> info

-#if USE(LOW_BANDWIDTH_DISPLAY)

- if (m_doc->inLowBandwidthDisplay()) {

- // ideal solution is only skipping internal JavaScript if there is external JavaScript.

- // but internal JavaScript can use document.write() to create an external JavaScript,

- // so we have to skip internal JavaScript all the time.

- m_doc->frame()->loader()->needToSwitchOutLowBandwidthDisplay();

- doScriptExec = false;

- } else

-#endif

- doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();

- m_scriptNode = 0;

- }

- state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);

- RefPtr<Node> node = processToken();

- String scriptString = node ? node->textContent() : "";

- m_currentToken.tagName = scriptTag.localName();

- m_currentToken.beginTag = false;

- processToken();

- state.setInScript(false);

- m_scriptCodeSize = m_scriptCodeResync = 0;

- // FIXME: The script should be syntax highlighted.

- if (inViewSourceMode())

- return state;

- SegmentedString* savedPrependingSrc = m_currentPrependingSrc;

- SegmentedString prependingSrc;

- m_currentPrependingSrc = &prependingSrc;

- if (!m_parser->skipMode() && !followingFrameset) {

- if (cs) {

- if (savedPrependingSrc)

- savedPrependingSrc->append(m_src);

- else

- m_pendingSrc.prepend(m_src);

- setSrc(SegmentedString());

- // the ref() call below may call notifyFinished if the script is already in cache,

- // and that mucks with the state directly, so we must write it back to the object.

- m_state = state;

- bool savedRequestingScript = m_requestingScript;

- m_requestingScript = true;

- cs->addClient(this);

- m_requestingScript = savedRequestingScript;

- state = m_state;

- // will be 0 if script was already loaded and ref() executed it

- if (!m_pendingScripts.isEmpty())

- state.setLoadingExtScript(true);

- } else if (!m_fragment && doScriptExec) {

- if (!m_executingScript)

- m_pendingSrc.prepend(m_src);

- else

- prependingSrc = m_src;

- setSrc(SegmentedString());

- state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);

- }

- if (!m_executingScript && !state.loadingExtScript()) {

- m_src.append(m_pendingSrc);

- m_pendingSrc.clear();

- } else if (!prependingSrc.isEmpty()) {

- // restore first so that the write appends in the right place

- // (does not hurt to do it again below)

- m_currentPrependingSrc = savedPrependingSrc;

- // we need to do this slightly modified bit of one of the write() cases

- // because we want to prepend to m_pendingSrc rather than appending

- // if there's no previous prependingSrc

- if (!m_pendingScripts.isEmpty()) {

- if (m_currentPrependingSrc)

- m_currentPrependingSrc->append(prependingSrc);

- else

- m_pendingSrc.prepend(prependingSrc);

- } else {

- m_state = state;

- write(prependingSrc, false);

- state = m_state;

- }

-#if PRELOAD_SCANNER_ENABLED

- if (!m_pendingScripts.isEmpty() && !m_executingScript) {

- if (!m_preloadScanner)

- m_preloadScanner.set(new PreloadScanner(m_doc));

- if (!m_preloadScanner->inProgress()) {

- m_preloadScanner->begin();

- m_preloadScanner->write(m_pendingSrc);

- }

-#endif

- m_currentPrependingSrc = savedPrependingSrc;

- return state;

-HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)

- if (m_fragment || !m_doc->frame())

- return state;

- m_executingScript++;

- SegmentedString* savedPrependingSrc = m_currentPrependingSrc;

- SegmentedString prependingSrc;

- m_currentPrependingSrc = &prependingSrc;

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("beginning script execution at %d\n", m_doc->elapsedTime());

-#endif

- m_state = state;

- m_doc->frame()->loader()->executeScript(sourceCode);

- state = m_state;

- state.setAllowYield(true);

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("ending script execution at %d\n", m_doc->elapsedTime());

-#endif

- m_executingScript--;

- if (!m_executingScript && !state.loadingExtScript()) {

- m_pendingSrc.prepend(prependingSrc);

- m_src.append(m_pendingSrc);

- m_pendingSrc.clear();

- } else if (!prependingSrc.isEmpty()) {

- // restore first so that the write appends in the right place

- // (does not hurt to do it again below)

- m_currentPrependingSrc = savedPrependingSrc;

- // we need to do this slightly modified bit of one of the write() cases

- // because we want to prepend to m_pendingSrc rather than appending

- // if there's no previous prependingSrc

- if (!m_pendingScripts.isEmpty()) {

- if (m_currentPrependingSrc)

- m_currentPrependingSrc->append(prependingSrc);

- else

- m_pendingSrc.prepend(prependingSrc);

-#if PRELOAD_SCANNER_ENABLED

- // We are stuck waiting for another script. Lets check the source that

- // was just document.write()n for anything to load.

- PreloadScanner documentWritePreloadScanner(m_doc);

- documentWritePreloadScanner.begin();

- documentWritePreloadScanner.write(prependingSrc);

- documentWritePreloadScanner.end();

-#endif

- } else {

- m_state = state;

- write(prependingSrc, false);

- state = m_state;

- }

- m_currentPrependingSrc = savedPrependingSrc;

- return state;

-HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state)

- // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.

- checkScriptBuffer(src.length());

- while (!src.isEmpty()) {

- UChar ch = *src;

- m_scriptCode[m_scriptCodeSize++] = ch;

- if (ch == '>') {

- bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle());

- int endCharsCount = 1; // start off with one for the '>' character

- if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') {

- endCharsCount = 3;

- } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' &&

- m_scriptCode[m_scriptCodeSize-2] == '!') {

- // Other browsers will accept --!> as a close comment, even though it's

- // not technically valid.

- endCharsCount = 4;

- }

- if (handleBrokenComments || endCharsCount > 1) {

- src.advancePastNonNewline();

- if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {

- checkScriptBuffer();

- m_scriptCode[m_scriptCodeSize] = 0;

- m_scriptCode[m_scriptCodeSize + 1] = 0;

- m_currentToken.tagName = commentAtom;

- m_currentToken.beginTag = true;

- state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state);

- processToken();

- m_currentToken.tagName = commentAtom;

- m_currentToken.beginTag = false;

- processToken();

- m_scriptCodeSize = 0;

- }

- state.setInComment(false);

- return state; // Finished parsing comment

- }

- src.advance(m_lineNumber);

- }

- return state;

-HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)

- checkScriptBuffer(src.length());

- while (!src.isEmpty()) {

- UChar ch = *src;

- m_scriptCode[m_scriptCodeSize++] = ch;

- if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') {

- src.advancePastNonNewline();

- state.setInServer(false);

- m_scriptCodeSize = 0;

- return state; // Finished parsing server include

- }

- src.advance(m_lineNumber);

- }

- return state;

-HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state)

- UChar oldchar = 0;

- while (!src.isEmpty()) {

- UChar chbegin = *src;

- if (chbegin == '\'')

- tquote = tquote == SingleQuote ? NoQuote : SingleQuote;

- else if (chbegin == '\"')

- tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;

- // Look for '?>'

- // Some crappy sites omit the "?" before it, so

- // we look for an unquoted '>' instead. (IE compatible)

- else if (chbegin == '>' && (!tquote || oldchar == '?')) {

- // We got a '?>' sequence

- state.setInProcessingInstruction(false);

- src.advancePastNonNewline();

- state.setDiscardLF(true);

- return state; // Finished parsing comment!

- }

- src.advance(m_lineNumber);

- oldchar = chbegin;

- }

- return state;

-HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state)

- while (!src.isEmpty()) {

- UChar cc = *src;

- if (state.skipLF()) {

- state.setSkipLF(false);

- if (cc == '\n') {

- src.advancePastNewline(m_lineNumber);

- continue;

- }

- // do we need to enlarge the buffer?

- checkBuffer();

- if (cc == '\r') {

- state.setSkipLF(true);

- *m_dest++ = '\n';

- } else

- *m_dest++ = cc;

- src.advance(m_lineNumber);

- }

- return state;

-HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)

- if (start) {

- cBufferPos = 0;

- state.setEntityState(SearchEntity);

- EntityUnicodeValue = 0;

- }

- while(!src.isEmpty()) {

- UChar cc = *src;

- switch(state.entityState()) {

- case NoEntity:

- ASSERT(state.entityState() != NoEntity);

- return state;

- case SearchEntity:

- if (cc == '#') {

- m_cBuffer[cBufferPos++] = cc;

- src.advancePastNonNewline();

- state.setEntityState(NumericSearch);

- } else

- state.setEntityState(EntityName);

- break;

- case NumericSearch:

- if (cc == 'x' || cc == 'X') {

- m_cBuffer[cBufferPos++] = cc;

- src.advancePastNonNewline();

- state.setEntityState(Hexadecimal);

- } else if (cc >= '0' && cc <= '9')

- state.setEntityState(Decimal);

- else

- state.setEntityState(SearchSemicolon);

- break;

- case Hexadecimal: {

- int ll = min(src.length(), 10 - cBufferPos);

- while (ll--) {

- cc = *src;

- if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {

- state.setEntityState(SearchSemicolon);

- break;

- }

- int digit;

- if (cc < 'A')

- digit = cc - '0';

- else

- digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch

- EntityUnicodeValue = EntityUnicodeValue * 16 + digit;

- m_cBuffer[cBufferPos++] = cc;

- src.advancePastNonNewline();

- }

- if (cBufferPos == 10)

- state.setEntityState(SearchSemicolon);

- break;

- }

- case Decimal:

- {

- int ll = min(src.length(), 9-cBufferPos);

- while(ll--) {

- cc = *src;

- if (!(cc >= '0' && cc <= '9')) {

- state.setEntityState(SearchSemicolon);

- break;

- }

- EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');

- m_cBuffer[cBufferPos++] = cc;

- src.advancePastNonNewline();

- }

- if (cBufferPos == 9)

- state.setEntityState(SearchSemicolon);

- break;

- }

- case EntityName:

- {

- int ll = min(src.length(), 9-cBufferPos);

- while(ll--) {

- cc = *src;

- if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {

- state.setEntityState(SearchSemicolon);

- break;

- }

- m_cBuffer[cBufferPos++] = cc;

- src.advancePastNonNewline();

- }

- if (cBufferPos == 9)

- state.setEntityState(SearchSemicolon);

- if (state.entityState() == SearchSemicolon) {

- if(cBufferPos > 1) {

- // Since the maximum length of entity name is 9,

- // so a single char array which is allocated on

- // the stack, its length is 10, should be OK.

- // Also if we have an illegal character, we treat it

- // as illegal entity name.

- unsigned testedEntityNameLen = 0;

- char tmpEntityNameBuffer[10];

- ASSERT(cBufferPos < 10);

- for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {

- if (m_cBuffer[testedEntityNameLen] > 0x7e)

- break;

- tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];

- }

- const Entity *e;

- if (testedEntityNameLen == cBufferPos)

- e = findEntity(tmpEntityNameBuffer, cBufferPos);

- else

- e = 0;

- if(e)

- EntityUnicodeValue = e->code;

- // be IE compatible

- if(parsingTag && EntityUnicodeValue > 255 && *src != ';')

- EntityUnicodeValue = 0;

- }

- else

- break;

- }

- case SearchSemicolon:

- // Don't allow values that are more than 21 bits.

- if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {

- if (!inViewSourceMode()) {

- if (*src == ';')

- src.advancePastNonNewline();

- if (EntityUnicodeValue <= 0xFFFF) {

- checkBuffer();

- src.push(fixUpChar(EntityUnicodeValue));

- } else {

- // Convert to UTF-16, using surrogate code points.

- checkBuffer(2);

- src.push(U16_LEAD(EntityUnicodeValue));

- src.push(U16_TRAIL(EntityUnicodeValue));

- }

- } else {

- // FIXME: We should eventually colorize entities by sending them as a special token.

- checkBuffer(11);

- *dest++ = '&';

- for (unsigned i = 0; i < cBufferPos; i++)

- dest[i] = m_cBuffer[i];

- dest += cBufferPos;

- if (*src == ';') {

- *dest++ = ';';

- src.advancePastNonNewline();

- }

- } else {

- checkBuffer(10);

- // ignore the sequence, add it to the buffer as plaintext

- *dest++ = '&';

- for (unsigned i = 0; i < cBufferPos; i++)

- dest[i] = m_cBuffer[i];

- dest += cBufferPos;

- }

- state.setEntityState(NoEntity);

- return state;

- }

- return state;

-HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)

- ASSERT(state.inDoctype());

- while (!src.isEmpty() && state.inDoctype()) {

- UChar c = *src;

- bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';

- switch (m_doctypeToken.state()) {

- case DoctypeBegin: {

- m_doctypeToken.setState(DoctypeBeforeName);

- if (isWhitespace) {

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- }

- break;

- }

- case DoctypeBeforeName: {

- if (c == '>') {

- // Malformed. Just exit.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- if (inViewSourceMode())

- processDoctypeToken();

- } else if (isWhitespace) {

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else

- m_doctypeToken.setState(DoctypeName);

- break;

- }

- case DoctypeName: {

- if (c == '>') {

- // Valid doctype. Emit it.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- processDoctypeToken();

- } else if (isWhitespace) {

- m_doctypeSearchCount = 0; // Used now to scan for PUBLIC

- m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM

- m_doctypeToken.setState(DoctypeAfterName);

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else {

- src.advancePastNonNewline();

- m_doctypeToken.m_name.append(c);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- }

- break;

- }

- case DoctypeAfterName: {

- if (c == '>') {

- // Valid doctype. Emit it.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- processDoctypeToken();

- } else if (!isWhitespace) {

- src.advancePastNonNewline();

- if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) {

- m_doctypeSearchCount++;

- if (m_doctypeSearchCount == 6)

- // Found 'PUBLIC' sequence

- m_doctypeToken.setState(DoctypeBeforePublicID);

- } else if (m_doctypeSearchCount > 0) {

- m_doctypeSearchCount = 0;

- m_doctypeToken.setState(DoctypeBogus);

- } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) {

- m_doctypeSecondarySearchCount++;

- if (m_doctypeSecondarySearchCount == 6)

- // Found 'SYSTEM' sequence

- m_doctypeToken.setState(DoctypeBeforeSystemID);

- } else {

- m_doctypeSecondarySearchCount = 0;

- m_doctypeToken.setState(DoctypeBogus);

- }

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else {

- src.advance(m_lineNumber); // Whitespace keeps us in the after name state.

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- }

- break;

- }

- case DoctypeBeforePublicID: {

- if (c == '\"' || c == '\'') {

- tquote = c == '\"' ? DoubleQuote : SingleQuote;

- m_doctypeToken.setState(DoctypePublicID);

- src.advancePastNonNewline();

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else if (c == '>') {

- // Considered bogus. Don't process the doctype.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- if (inViewSourceMode())

- processDoctypeToken();

- } else if (isWhitespace) {

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else

- m_doctypeToken.setState(DoctypeBogus);

- break;

- }

- case DoctypePublicID: {

- if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {

- src.advancePastNonNewline();

- m_doctypeToken.setState(DoctypeAfterPublicID);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else if (c == '>') {

- // Considered bogus. Don't process the doctype.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- if (inViewSourceMode())

- processDoctypeToken();

- } else {

- m_doctypeToken.m_publicID.append(c);

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- }

- break;

- }

- case DoctypeAfterPublicID:

- if (c == '\"' || c == '\'') {

- tquote = c == '\"' ? DoubleQuote : SingleQuote;

- m_doctypeToken.setState(DoctypeSystemID);

- src.advancePastNonNewline();

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else if (c == '>') {

- // Valid doctype. Emit it now.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- processDoctypeToken();

- } else if (isWhitespace) {

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else

- m_doctypeToken.setState(DoctypeBogus);

- break;

- case DoctypeBeforeSystemID:

- if (c == '\"' || c == '\'') {

- tquote = c == '\"' ? DoubleQuote : SingleQuote;

- m_doctypeToken.setState(DoctypeSystemID);

- src.advancePastNonNewline();

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else if (c == '>') {

- // Considered bogus. Don't process the doctype.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- } else if (isWhitespace) {

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else

- m_doctypeToken.setState(DoctypeBogus);

- break;

- case DoctypeSystemID:

- if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {

- src.advancePastNonNewline();

- m_doctypeToken.setState(DoctypeAfterSystemID);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else if (c == '>') {

- // Considered bogus. Don't process the doctype.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- if (inViewSourceMode())

- processDoctypeToken();

- } else {

- m_doctypeToken.m_systemID.append(c);

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- }

- break;

- case DoctypeAfterSystemID:

- if (c == '>') {

- // Valid doctype. Emit it now.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- processDoctypeToken();

- } else if (isWhitespace) {

- src.advance(m_lineNumber);

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- } else

- m_doctypeToken.setState(DoctypeBogus);

- break;

- case DoctypeBogus:

- if (c == '>') {

- // Done with the bogus doctype.

- src.advancePastNonNewline();

- state.setInDoctype(false);

- if (inViewSourceMode())

- processDoctypeToken();

- } else {

- src.advance(m_lineNumber); // Just keep scanning for '>'

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(c);

- }

- break;

- default:

- break;

- }

- return state;

-HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state)

- ASSERT(!state.hasEntityState());

- unsigned cBufferPos = m_cBufferPos;

- bool lastIsSlash = false;

- while (!src.isEmpty()) {

- checkBuffer();

- switch(state.tagState()) {

- case NoTag:

- {

- m_cBufferPos = cBufferPos;

- return state;

- }

- case TagName:

- {

- if (searchCount > 0) {

- if (*src == commentStart[searchCount]) {

- searchCount++;

- if (searchCount == 2)

- m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.

- else

- m_doctypeSearchCount = 0;

- if (searchCount == 4) {

- // Found '<!--' sequence

- src.advancePastNonNewline();

- m_dest = m_buffer; // ignore the previous part of this tag

- state.setInComment(true);

- state.setTagState(NoTag);

- // Fix bug 34302 at kde.bugs.org. Go ahead and treat

- // <!--> as a valid comment, since both mozilla and IE on windows

- // can handle this case. Only do this in quirks mode. -dwh

- if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {

- state.setInComment(false);

- src.advancePastNonNewline();

- if (!src.isEmpty())

- m_cBuffer[cBufferPos++] = *src;

- } else

- state = parseComment(src, state);

- m_cBufferPos = cBufferPos;

- return state; // Finished parsing tag!

- }

- m_cBuffer[cBufferPos++] = *src;

- src.advancePastNonNewline();

- break;

- } else

- searchCount = 0; // Stop looking for '<!--' sequence

- }

- if (m_doctypeSearchCount > 0) {

- if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {

- m_doctypeSearchCount++;

- m_cBuffer[cBufferPos++] = *src;

- src.advancePastNonNewline();

- if (m_doctypeSearchCount == 9) {

- // Found '<!DOCTYPE' sequence

- state.setInDoctype(true);

- state.setTagState(NoTag);

- m_doctypeToken.reset();

- if (inViewSourceMode())

- m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);

- state = parseDoctype(src, state);

- m_cBufferPos = cBufferPos;

- return state;

- }

- break;

- } else

- m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence

- }

- bool finish = false;

- unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);

- while (ll--) {

- UChar curchar = *src;

- if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {

- finish = true;

- break;

- }

- // tolower() shows up on profiles. This is faster!

- if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())

- m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');

- else

- m_cBuffer[cBufferPos++] = curchar;

- src.advancePastNonNewline();

- }

- // Disadvantage: we add the possible rest of the tag

- // as attribute names. ### judge if this causes problems

- if (finish || CBUFLEN == cBufferPos) {

- bool beginTag;

- UChar* ptr = m_cBuffer;

- unsigned int len = cBufferPos;

- m_cBuffer[cBufferPos] = '\0';

- if ((cBufferPos > 0) && (*ptr == '/')) {

- // End Tag

- beginTag = false;

- ptr++;

- len--;

- }

- else

- // Start Tag

- beginTag = true;

- // Ignore the / in fake xml tags like <br/>. We trim off the "/" so that we'll get "br" as the tag name and not "br/".

- if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())

- ptr[--len] = '\0';

- // Now that we've shaved off any invalid / that might have followed the name), make the tag.

- // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)

- if (ptr[0] != '!' || inViewSourceMode()) {

- m_currentToken.tagName = AtomicString(ptr);

- m_currentToken.beginTag = beginTag;

- }

- m_dest = m_buffer;

- state.setTagState(SearchAttribute);

- cBufferPos = 0;

- }

- break;

- }

- case SearchAttribute:

- while(!src.isEmpty()) {

- UChar curchar = *src;

- // In this mode just ignore any quotes we encounter and treat them like spaces.

- if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {

- if (curchar == '<' || curchar == '>')

- state.setTagState(SearchEnd);

- else

- state.setTagState(AttributeName);

- cBufferPos = 0;

- break;

- }

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar(curchar);

- src.advance(m_lineNumber);

- }

- break;

- case AttributeName:

- {

- int ll = min(src.length(), CBUFLEN - cBufferPos);

- while (ll--) {

- UChar curchar = *src;

- // If we encounter a "/" when scanning an attribute name, treat it as a delimiter. This allows the

- // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).

- if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {

- m_cBuffer[cBufferPos] = '\0';

- m_attrName = AtomicString(m_cBuffer);

- m_dest = m_buffer;

- *m_dest++ = 0;

- state.setTagState(SearchEqual);

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar('a');

- break;

- }

- // tolower() shows up on profiles. This is faster!

- if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())

- m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');

- else

- m_cBuffer[cBufferPos++] = curchar;

- src.advance(m_lineNumber);

- }

- if (cBufferPos == CBUFLEN) {

- m_cBuffer[cBufferPos] = '\0';

- m_attrName = AtomicString(m_cBuffer);

- m_dest = m_buffer;

- *m_dest++ = 0;

- state.setTagState(SearchEqual);

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar('a');

- }

- break;

- }

- case SearchEqual:

- while (!src.isEmpty()) {

- UChar curchar = *src;

- if (lastIsSlash && curchar == '>') {

- // This is a quirk (with a long sad history). We have to do this

- // since widgets do <script src="foo.js"/> and expect the tag to close.

- if (m_currentToken.tagName == scriptTag)

- m_currentToken.selfClosingTag = true;

- m_currentToken.brokenXMLStyle = true;

- }

- // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.

- if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {

- if (curchar == '=') {

- state.setTagState(SearchValue);

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar(curchar);

- src.advancePastNonNewline();

- } else {

- m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());

- m_dest = m_buffer;

- state.setTagState(SearchAttribute);

- lastIsSlash = false;

- }

- break;

- }

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar(curchar);

- lastIsSlash = curchar == '/';

- src.advance(m_lineNumber);

- }

- break;

- case SearchValue:

- while (!src.isEmpty()) {

- UChar curchar = *src;

- if (!isASCIISpace(curchar)) {

- if (curchar == '\'' || curchar == '\"') {

- tquote = curchar == '\"' ? DoubleQuote : SingleQuote;

- state.setTagState(QuotedValue);

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar(curchar);

- src.advancePastNonNewline();

- } else

- state.setTagState(Value);

- break;

- }

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar(curchar);

- src.advance(m_lineNumber);

- }

- break;

- case QuotedValue:

- while (!src.isEmpty()) {

- checkBuffer();

- UChar curchar = *src;

- if (curchar <= '>' && !src.escaped()) {

- if (curchar == '>' && m_attrName.isEmpty()) {

- // Handle a case like <img '>. Just go ahead and be willing

- // to close the whole tag. Don't consume the character and

- // just go back into SearchEnd while ignoring the whole

- // value.

- // FIXME: Note that this is actually not a very good solution.

- // It doesn't handle the general case of

- // unmatched quotes among attributes that have names. -dwh

- while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))

- m_dest--; // remove trailing newlines

- AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);

- if (!attributeValue.contains('/'))

- m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)

- m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar('x');

- state.setTagState(SearchAttribute);

- m_dest = m_buffer;

- tquote = NoQuote;

- break;

- }

- if (curchar == '&') {

- src.advancePastNonNewline();

- state = parseEntity(src, m_dest, state, cBufferPos, true, true);

- break;

- }

- if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {

- // some <input type=hidden> rely on trailing spaces. argh

- while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))

- m_dest--; // remove trailing newlines

- AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);

- if (m_attrName.isEmpty() && !attributeValue.contains('/')) {

- m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar('x');

- } else if (inViewSourceMode())

- m_currentToken.addViewSourceChar('v');

- m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());

- m_dest = m_buffer;

- state.setTagState(SearchAttribute);

- tquote = NoQuote;

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar(curchar);

- src.advancePastNonNewline();

- break;

- }

- *m_dest++ = curchar;

- src.advance(m_lineNumber);

- }

- break;

- case Value:

- while(!src.isEmpty()) {

- checkBuffer();

- UChar curchar = *src;

- if (curchar <= '>' && !src.escaped()) {

- // parse Entities

- if (curchar == '&') {

- src.advancePastNonNewline();

- state = parseEntity(src, m_dest, state, cBufferPos, true, true);

- break;

- }

- // no quotes. Every space means end of value

- // '/' does not delimit in IE!

- if (isASCIISpace(curchar) || curchar == '>') {

- AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);

- m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar('v');

- m_dest = m_buffer;

- state.setTagState(SearchAttribute);

- break;

- }

- *m_dest++ = curchar;

- src.advance(m_lineNumber);

- }

- break;

- case SearchEnd:

- {

- while (!src.isEmpty()) {

- UChar ch = *src;

- if (ch == '>' || ch == '<')

- break;

- if (ch == '/')

- m_currentToken.selfClosingTag = true;

- if (inViewSourceMode())

- m_currentToken.addViewSourceChar(ch);

- src.advance(m_lineNumber);

- }

- if (src.isEmpty())

- break;

- searchCount = 0; // Stop looking for '<!--' sequence

- state.setTagState(NoTag);

- tquote = NoQuote;

- if (*src != '<')

- src.advance(m_lineNumber);

- if (m_currentToken.tagName == nullAtom) { //stop if tag is unknown

- m_cBufferPos = cBufferPos;

- return state;

- }

- AtomicString tagName = m_currentToken.tagName;

- // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard

- // compatibility.

- bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;

- bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;

- if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {

- Attribute* a = 0;

- m_scriptTagSrcAttrValue = String();

- m_scriptTagCharsetAttrValue = String();

- if (m_currentToken.attrs && !m_fragment) {

- if (m_doc->frame() && m_doc->frame()->script()->isEnabled()) {

- if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))

- m_scriptTagSrcAttrValue = m_doc->completeURL(parseURL(a->value())).string();

- }

- RefPtr<Node> n = processToken();

- m_cBufferPos = cBufferPos;

- if (n || inViewSourceMode()) {

- if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {

- if (beginTag)

- state.setDiscardLF(true); // Discard the first LF after we open a pre.

- } else if (tagName == scriptTag) {

- ASSERT(!m_scriptNode);

- m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);

- if (m_scriptNode)

- m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();

- if (beginTag) {

- m_searchStopper = scriptEnd;

- m_searchStopperLength = 8;

- state.setInScript(true);

- state = parseSpecial(src, state);

- } else if (isSelfClosingScript) { // Handle <script src="foo"/>

- state.setInScript(true);

- state = scriptHandler(state);

- }

- } else if (tagName == styleTag) {

- if (beginTag) {

- m_searchStopper = styleEnd;

- m_searchStopperLength = 7;

- state.setInStyle(true);

- state = parseSpecial(src, state);

- }

- } else if (tagName == textareaTag) {

- if (beginTag) {

- m_searchStopper = textareaEnd;

- m_searchStopperLength = 10;

- state.setInTextArea(true);

- state = parseSpecial(src, state);

- }

- } else if (tagName == titleTag) {

- if (beginTag) {

- m_searchStopper = titleEnd;

- m_searchStopperLength = 7;

- State savedState = state;

- SegmentedString savedSrc = src;

- long savedLineno = m_lineNumber;

- state.setInTitle(true);

- state = parseSpecial(src, state);

- if (state.inTitle() && src.isEmpty()) {

- // We just ate the rest of the document as the title #text node!

- // Reset the state then retokenize without special title handling.

- // Let the parser clean up the missing </title> tag.

- // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're

- // at the end of the document unless m_noMoreData is also true. We need

- // to detect this case elsewhere, and save the state somewhere other

- // than a local variable.

- state = savedState;

- src = savedSrc;

- m_lineNumber = savedLineno;

- m_scriptCodeSize = 0;

- }

- } else if (tagName == xmpTag) {

- if (beginTag) {

- m_searchStopper = xmpEnd;

- m_searchStopperLength = 5;

- state.setInXmp(true);

- state = parseSpecial(src, state);

- }

- } else if (tagName == iframeTag) {

- if (beginTag) {

- m_searchStopper = iframeEnd;

- m_searchStopperLength = 8;

- state.setInIFrame(true);

- state = parseSpecial(src, state);

- }

- if (tagName == plaintextTag)

- state.setInPlainText(beginTag);

- return state; // Finished parsing tag!

- }

- } // end switch

- }

- m_cBufferPos = cBufferPos;

- return state;

-inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)

- // We don't want to be checking elapsed time with every character, so we only check after we've

- // processed a certain number of characters.

- bool allowedYield = state.allowYield();

- state.setAllowYield(false);

- if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {

- processedCount = 0;

- if (currentTime() - startTime > m_tokenizerTimeDelay) {

- /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to

- load, but this hurts overall performance on slower machines. For now turn this

- off.

- || (!m_doc->haveStylesheetsLoaded() &&

- (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/

- // Schedule the timer to keep processing as soon as possible.

- m_timer.startOneShot(0);

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (currentTime() - startTime > m_tokenizerTimeDelay)

- printf("Deferring processing of data because 500ms elapsed away from event loop.\n");

-#endif

- return false;

- }

- processedCount++;

- return true;

-bool HTMLTokenizer::write(const SegmentedString& str, bool appendData)

- if (!m_buffer)

- return false;

- if (m_parserStopped)

- return false;

- SegmentedString source(str);

- if (m_executingScript)

- source.setExcludeLineNumbers();

- if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {

- // don't parse; we will do this later

- if (m_currentPrependingSrc)

- m_currentPrependingSrc->append(source);

- else {

- m_pendingSrc.append(source);

-#if PRELOAD_SCANNER_ENABLED

- if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)

- m_preloadScanner->write(source);

-#endif

- }

- return false;

- }

-#if PRELOAD_SCANNER_ENABLED

- if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)

- m_preloadScanner->end();

-#endif

- if (!m_src.isEmpty())

- m_src.append(source);

- else

- setSrc(source);

- // Once a timer is set, it has control of when the tokenizer continues.

- if (m_timer.isActive())

- return false;

- bool wasInWrite = m_inWrite;

- m_inWrite = true;

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("Beginning write at time %d\n", m_doc->elapsedTime());

-#endif

- int processedCount = 0;

- double startTime = currentTime();

- Frame* frame = m_doc->frame();

- State state = m_state;

- while (!m_src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {

- if (!continueProcessing(processedCount, startTime, state))

- break;

- // do we need to enlarge the buffer?

- checkBuffer();

- UChar cc = *m_src;

- bool wasSkipLF = state.skipLF();

- if (wasSkipLF)

- state.setSkipLF(false);

- if (wasSkipLF && (cc == '\n'))

- m_src.advance();

- else if (state.needsSpecialWriteHandling()) {

- // it's important to keep needsSpecialWriteHandling with the flags this block tests

- if (state.hasEntityState())

- state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());

- else if (state.inPlainText())

- state = parseText(m_src, state);

- else if (state.inAnySpecial())

- state = parseSpecial(m_src, state);

- else if (state.inComment())

- state = parseComment(m_src, state);

- else if (state.inDoctype())

- state = parseDoctype(m_src, state);

- else if (state.inServer())

- state = parseServer(m_src, state);

- else if (state.inProcessingInstruction())

- state = parseProcessingInstruction(m_src, state);

- else if (state.hasTagState())

- state = parseTag(m_src, state);

- else if (state.startTag()) {

- state.setStartTag(false);

- switch(cc) {

- case '/':

- break;

- case '!': {

- // <!-- comment --> or <!DOCTYPE ...>

- searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype

- m_doctypeSearchCount = 1;

- break;

- }

- case '?': {

- // xml processing instruction

- state.setInProcessingInstruction(true);

- tquote = NoQuote;

- state = parseProcessingInstruction(m_src, state);

- continue;

- break;

- }

- case '%':

- if (!m_brokenServer) {

- // <% server stuff, handle as comment %>

- state.setInServer(true);

- tquote = NoQuote;

- state = parseServer(m_src, state);

- continue;

- }

- // else fall through

- default: {

- if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {

- // Start of a Start-Tag

- } else {

- // Invalid tag

- // Add as is

- *m_dest = '<';

- m_dest++;

- continue;

- }

- }; // end case

- processToken();

- m_cBufferPos = 0;

- state.setTagState(TagName);

- state = parseTag(m_src, state);

- }

- } else if (cc == '&' && !m_src.escaped()) {

- m_src.advancePastNonNewline();

- state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());

- } else if (cc == '<' && !m_src.escaped()) {

- m_currentTagStartLineNumber = m_lineNumber;

- m_src.advancePastNonNewline();

- state.setStartTag(true);

- state.setDiscardLF(false);

- } else if (cc == '\n' || cc == '\r') {

- if (state.discardLF())

- // Ignore this LF

- state.setDiscardLF(false); // We have discarded 1 LF

- else {

- // Process this LF

- *m_dest++ = '\n';

- if (cc == '\r' && !m_src.excludeLineNumbers())

- m_lineNumber++;

- }

- /* Check for MS-DOS CRLF sequence */

- if (cc == '\r')

- state.setSkipLF(true);

- m_src.advance(m_lineNumber);

- } else {

- state.setDiscardLF(false);

- *m_dest++ = cc;

- m_src.advancePastNonNewline();

- }

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("Ending write at time %d\n", m_doc->elapsedTime());

-#endif

- m_inWrite = wasInWrite;

- m_state = state;

- if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {

- end(); // this actually causes us to be deleted

- return true;

- }

- return false;

-void HTMLTokenizer::stopParsing()

- Tokenizer::stopParsing();

- m_timer.stop();

- // The part needs to know that the tokenizer has finished with its data,

- // regardless of whether it happened naturally or due to manual intervention.

- if (!m_fragment && m_doc->frame())

- m_doc->frame()->loader()->tokenizerProcessedData();

-bool HTMLTokenizer::processingData() const

- return m_timer.isActive() || m_inWrite;

-void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("Beginning timer write at time %d\n", m_doc->elapsedTime());

-#endif

- if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {

- // Restart the timer and let layout win. This is basically a way of ensuring that the layout

- // timer has higher priority than our timer.

- m_timer.startOneShot(0);

- return;

- }

- // Invoke write() as though more data came in. This might cause us to get deleted.

- write(SegmentedString(), true);

-void HTMLTokenizer::end()

- ASSERT(!m_timer.isActive());

- m_timer.stop(); // Only helps if assertion above fires, but do it anyway.

- if (m_buffer) {

- // parseTag is using the buffer for different matters

- if (!m_state.hasTagState())

- processToken();

- fastFree(m_scriptCode);

- m_scriptCode = 0;

- m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;

- fastFree(m_buffer);

- m_buffer = 0;

- }

- if (!inViewSourceMode())

- m_parser->finished();

- else

- m_doc->finishedParsing();

-void HTMLTokenizer::finish()

- // do this as long as we don't find matching comment ends

- while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) {

- // we've found an unmatched comment start

- if (m_state.inComment())

- m_brokenComments = true;

- else

- m_brokenServer = true;

- checkScriptBuffer();

- m_scriptCode[m_scriptCodeSize] = 0;

- m_scriptCode[m_scriptCodeSize + 1] = 0;

- int pos;

- String food;

- if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea())

- food = String(m_scriptCode, m_scriptCodeSize);

- else if (m_state.inServer()) {

- food = "<";

- food.append(m_scriptCode, m_scriptCodeSize);

- } else {

- pos = find(m_scriptCode, m_scriptCodeSize, '>');

- food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1);

- }

- fastFree(m_scriptCode);

- m_scriptCode = 0;

- m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;

- m_state.setInComment(false);

- m_state.setInServer(false);

- if (!food.isEmpty())

- write(food, true);

- }

- // this indicates we will not receive any more data... but if we are waiting on

- // an external script to load, we can't finish parsing until that is done

- m_noMoreData = true;

- if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())

- end(); // this actually causes us to be deleted

-PassRefPtr<Node> HTMLTokenizer::processToken()

- ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0;

- if (scriptController && scriptController->isEnabled())

- // FIXME: Why isn't this m_currentScriptTagStartLineNumber? I suspect this is wrong.

- scriptController->setEventHandlerLineno(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based.

- if (m_dest > m_buffer) {

- m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer);

- if (m_currentToken.tagName != commentAtom)

- m_currentToken.tagName = textAtom;

- } else if (m_currentToken.tagName == nullAtom) {

- m_currentToken.reset();

- if (scriptController)

- scriptController->setEventHandlerLineno(m_lineNumber + 1); // Script line numbers are 1 based.

- return 0;

- }

- m_dest = m_buffer;

- RefPtr<Node> n;

- if (!m_parserStopped) {

- if (NamedMappedAttrMap* map = m_currentToken.attrs.get())

- map->shrinkToLength();

- if (inViewSourceMode())

- static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken);

- else

- // pass the token over to the parser, the parser DOES NOT delete the token

- n = m_parser->parseToken(&m_currentToken);

- }

- m_currentToken.reset();

- if (scriptController)

- scriptController->setEventHandlerLineno(0);

- return n.release();

-void HTMLTokenizer::processDoctypeToken()

- if (inViewSourceMode())

- static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);

- else

- m_parser->parseDoctypeToken(&m_doctypeToken);

-HTMLTokenizer::~HTMLTokenizer()

- ASSERT(!m_inWrite);

- reset();

-void HTMLTokenizer::enlargeBuffer(int len)

- int newSize = max(m_bufferSize * 2, m_bufferSize + len);

- int oldOffset = m_dest - m_buffer;

- m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar)));

- m_dest = m_buffer + oldOffset;

- m_bufferSize = newSize;

-void HTMLTokenizer::enlargeScriptBuffer(int len)

- int newSize = max(m_scriptCodeCapacity * 2, m_scriptCodeCapacity + len);

- m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar)));

- m_scriptCodeCapacity = newSize;

-void HTMLTokenizer::executeScriptsWaitingForStylesheets()

- ASSERT(m_doc->haveStylesheetsLoaded());

- if (m_hasScriptsWaitingForStylesheets)

- notifyFinished(0);

-void HTMLTokenizer::notifyFinished(CachedResource*)

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("script loaded at %d\n", m_doc->elapsedTime());

-#endif

- ASSERT(!m_pendingScripts.isEmpty());

- // Make external scripts wait for external stylesheets.

- // FIXME: This needs to be done for inline scripts too.

- m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded();

- if (m_hasScriptsWaitingForStylesheets)

- return;

- bool finished = false;

- while (!finished && m_pendingScripts.first()->isLoaded()) {

- CachedScript* cs = m_pendingScripts.first().get();

- m_pendingScripts.removeFirst();

- ASSERT(cache()->disabled() || cs->accessCount() > 0);

- setSrc(SegmentedString());

- // make sure we forget about the script before we execute the new one

- // infinite recursion might happen otherwise

- ScriptSourceCode sourceCode(cs);

- bool errorOccurred = cs->errorOccurred();

- cs->removeClient(this);

- RefPtr<Node> n = m_scriptNode.release();

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("external script beginning execution at %d\n", m_doc->elapsedTime());

-#endif

- if (errorOccurred)

- EventTargetNodeCast(n.get())->dispatchEventForType(eventNames().errorEvent, true, false);

- else {

- if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())

- m_state = scriptExecution(sourceCode, m_state);

- EventTargetNodeCast(n.get())->dispatchEventForType(eventNames().loadEvent, false, false);

- }

- // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution()

- // call above, so test afterwards.

- finished = m_pendingScripts.isEmpty();

- if (finished) {

- ASSERT(!m_hasScriptsWaitingForStylesheets);

- m_state.setLoadingExtScript(false);

-#ifdef INSTRUMENT_LAYOUT_SCHEDULING

- if (!m_doc->ownerElement())

- printf("external script finished execution at %d\n", m_doc->elapsedTime());

-#endif

- } else if (m_hasScriptsWaitingForStylesheets) {

- // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution.

- // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive.

- finished = true;

- }

- // 'm_requestingScript' is true when we are called synchronously from

- // scriptHandler(). In that case scriptHandler() will take care

- // of m_pendingSrc.

- if (!m_requestingScript) {

- SegmentedString rest = m_pendingSrc;

- m_pendingSrc.clear();

- write(rest, false);

- // we might be deleted at this point, do not access any members.

- }

-bool HTMLTokenizer::isWaitingForScripts() const

- return m_state.loadingExtScript();

-void HTMLTokenizer::setSrc(const SegmentedString& source)

- m_src = source;

-void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment)

- HTMLTokenizer tok(fragment);

- tok.setForceSynchronous(true);

- tok.write(source, true);

- tok.finish();

- ASSERT(!tok.processingData()); // make sure we're done (see 3963151)

-UChar decodeNamedEntity(const char* name)

- const Entity* e = findEntity(name, strlen(name));

- return e ? e->code : 0;

+/*

+ This library is free software; you can redistribute it and/or

+ modify it under the terms of the GNU Library General Public

+ License as published by the Free Software Foundation; either

+ version 2 of the License, or (at your option) any later version.

+ This library is distributed in the hope that it will be useful,

+ but WITHOUT ANY WARRANTY; without even the implied warranty of

+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

+ Library General Public License for more details.

+ You should have received a copy of the GNU Library General Public License

+ along with this library; see the file COPYING.LIB. If not, write to

+ the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,

+ Boston, MA 02110-1301, USA.

+*/

+#include "config.h"

+#include "HTMLTokenizer.h"

+#include "CSSHelper.h"

+#include "Cache.h"

+#include "CachedScript.h"

+#include "DocLoader.h"

+#include "DocumentFragment.h"

+#include "EventNames.h"

+#include "Frame.h"

+#include "FrameLoader.h"

+#include "FrameView.h"

+#include "HTMLElement.h"

+#include "HTMLNames.h"

+#include "HTMLParser.h"

+#include "HTMLScriptElement.h"

+#include "HTMLViewSourceDocument.h"

+#include "Page.h"

+#include "PreloadScanner.h"

+#include "ScriptController.h"

+#include "ScriptSourceCode.h"

+#include "ScriptValue.h"

+#include <wtf/ASCIICType.h>

+#include <wtf/CurrentTime.h>

+#include "HTMLEntityNames.c"

+#define PRELOAD_SCANNER_ENABLED 1

+// #define INSTRUMENT_LAYOUT_SCHEDULING 1

+using namespace WTF;

+using namespace std;

+namespace WebCore {

+using namespace HTMLNames;

+#if MOBILE

+// The mobile device needs to be responsive, as such the tokenizer chunk size is reduced.

+// This value is used to define how many characters the tokenizer will process before

+// yielding control.

+static const int defaultTokenizerChunkSize = 256;

+#else

+static const int defaultTokenizerChunkSize = 4096;

+#endif

+#if MOBILE

+// As the chunks are smaller (above), the tokenizer should not yield for as long a period, otherwise

+// it will take way too long to load a page.

+static const double defaultTokenizerTimeDelay = 0.300;

+#else

+// FIXME: We would like this constant to be 200ms.

+// Yielding more aggressively results in increased responsiveness and better incremental rendering.

+// It slows down overall page-load on slower machines, though, so for now we set a value of 500.

+static const double defaultTokenizerTimeDelay = 0.500;

+#endif

+static const char commentStart [] = "<!--";

+static const char doctypeStart [] = "<!doctype";

+static const char publicStart [] = "public";

+static const char systemStart [] = "system";

+static const char scriptEnd [] = "</script";

+static const char xmpEnd [] = "</xmp";

+static const char styleEnd [] = "</style";

+static const char textareaEnd [] = "</textarea";

+static const char titleEnd [] = "</title";

+static const char iframeEnd [] = "</iframe";

+// Full support for MS Windows extensions to Latin-1.

+// Technically these extensions should only be activated for pages

+// marked "windows-1252" or "cp1252", but

+// in the standard Microsoft way, these extensions infect hundreds of thousands

+// of web pages. Note that people with non-latin-1 Microsoft extensions

+// are SOL.

+//

+// See: http://www.microsoft.com/globaldev/reference/WinCP.asp

+// http://www.bbsinc.com/iso8859.html

+// http://www.obviously.com/

+//

+// There may be better equivalents

+// We only need this for entities. For non-entity text, we handle this in the text encoding.

+static const UChar windowsLatin1ExtensionArray[32] = {

+ 0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, // 80-87

+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F, // 88-8F

+ 0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, // 90-97

+ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178 // 98-9F

+};

+static inline UChar fixUpChar(UChar c)

+ if ((c & ~0x1F) != 0x0080)

+ return c;

+ return windowsLatin1ExtensionArray[c - 0x80];

+static inline bool tagMatch(const char* s1, const UChar* s2, unsigned length)

+ for (unsigned i = 0; i != length; ++i) {

+ unsigned char c1 = s1[i];

+ unsigned char uc1 = toASCIIUpper(static_cast<char>(c1));

+ UChar c2 = s2[i];

+ if (c1 != c2 && uc1 != c2)

+ return false;

+ }

+ return true;

+inline void Token::addAttribute(AtomicString& attrName, const AtomicString& attributeValue, bool viewSourceMode)

+ if (!attrName.isEmpty()) {

+ ASSERT(!attrName.contains('/'));

+ RefPtr<MappedAttribute> a = MappedAttribute::create(attrName, attributeValue);

+ if (!attrs) {

+ attrs = NamedMappedAttrMap::create();

+ attrs->reserveInitialCapacity(10);

+ }

+ attrs->insertAttribute(a.release(), viewSourceMode);

+ }

+ attrName = emptyAtom;

+// ----------------------------------------------------------------------------

+HTMLTokenizer::HTMLTokenizer(HTMLDocument* doc, bool reportErrors)

+ : Tokenizer()

+ , m_buffer(0)

+ , m_scriptCode(0)

+ , m_scriptCodeSize(0)

+ , m_scriptCodeCapacity(0)

+ , m_scriptCodeResync(0)

+ , m_executingScript(0)

+ , m_requestingScript(false)

+ , m_hasScriptsWaitingForStylesheets(false)

+ , m_timer(this, &HTMLTokenizer::timerFired)

+ , m_doc(doc)

+ , m_parser(new HTMLParser(doc, reportErrors))

+ , m_inWrite(false)

+ , m_fragment(false)

+ begin();

+HTMLTokenizer::HTMLTokenizer(HTMLViewSourceDocument* doc)

+ : Tokenizer(true)

+ , m_buffer(0)

+ , m_scriptCode(0)

+ , m_scriptCodeSize(0)

+ , m_scriptCodeCapacity(0)

+ , m_scriptCodeResync(0)

+ , m_executingScript(0)

+ , m_requestingScript(false)

+ , m_hasScriptsWaitingForStylesheets(false)

+ , m_timer(this, &HTMLTokenizer::timerFired)

+ , m_doc(doc)

+ , m_parser(0)

+ , m_inWrite(false)

+ , m_fragment(false)

+ begin();

+HTMLTokenizer::HTMLTokenizer(DocumentFragment* frag)

+ : m_buffer(0)

+ , m_scriptCode(0)

+ , m_scriptCodeSize(0)

+ , m_scriptCodeCapacity(0)

+ , m_scriptCodeResync(0)

+ , m_executingScript(0)

+ , m_requestingScript(false)

+ , m_hasScriptsWaitingForStylesheets(false)

+ , m_timer(this, &HTMLTokenizer::timerFired)

+ , m_doc(frag->document())

+ , m_parser(new HTMLParser(frag))

+ , m_inWrite(false)

+ , m_fragment(true)

+ begin();

+void HTMLTokenizer::reset()

+ ASSERT(m_executingScript == 0);

+ while (!m_pendingScripts.isEmpty()) {

+ CachedScript* cs = m_pendingScripts.first().get();

+ m_pendingScripts.removeFirst();

+ ASSERT(cache()->disabled() || cs->accessCount() > 0);

+ cs->removeClient(this);

+ }

+ fastFree(m_buffer);

+ m_buffer = m_dest = 0;

+ m_bufferSize = 0;

+ fastFree(m_scriptCode);

+ m_scriptCode = 0;

+ m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;

+ m_timer.stop();

+ m_state.setAllowYield(false);

+ m_state.setForceSynchronous(false);

+ m_currentToken.reset();

+ m_doctypeToken.reset();

+ m_doctypeSearchCount = 0;

+ m_doctypeSecondarySearchCount = 0;

+ m_hasScriptsWaitingForStylesheets = false;

+void HTMLTokenizer::begin()

+ m_executingScript = 0;

+ m_requestingScript = false;

+ m_hasScriptsWaitingForStylesheets = false;

+ m_state.setLoadingExtScript(false);

+ reset();

+ m_bufferSize = 254;

+ m_buffer = static_cast<UChar*>(fastMalloc(sizeof(UChar) * 254));

+ m_dest = m_buffer;

+ tquote = NoQuote;

+ searchCount = 0;

+ m_state.setEntityState(NoEntity);

+ m_scriptTagSrcAttrValue = String();

+ m_pendingSrc.clear();

+ m_currentPrependingSrc = 0;

+ m_noMoreData = false;

+ m_brokenComments = false;

+ m_brokenServer = false;

+ m_lineNumber = 0;

+ m_currentScriptTagStartLineNumber = 0;

+ m_currentTagStartLineNumber = 0;

+ m_state.setForceSynchronous(false);

+ Page* page = m_doc->page();

+ if (page && page->hasCustomHTMLTokenizerTimeDelay())

+ m_tokenizerTimeDelay = page->customHTMLTokenizerTimeDelay();

+ else

+ m_tokenizerTimeDelay = defaultTokenizerTimeDelay;

+ if (page && page->hasCustomHTMLTokenizerChunkSize())

+ m_tokenizerChunkSize = page->customHTMLTokenizerChunkSize();

+ else

+ m_tokenizerChunkSize = defaultTokenizerChunkSize;

+void HTMLTokenizer::setForceSynchronous(bool force)

+ m_state.setForceSynchronous(force);

+HTMLTokenizer::State HTMLTokenizer::processListing(SegmentedString list, State state)

+ // This function adds the listing 'list' as

+ // preformatted text-tokens to the token-collection

+ while (!list.isEmpty()) {

+ if (state.skipLF()) {

+ state.setSkipLF(false);

+ if (*list == '\n') {

+ list.advance();

+ continue;

+ }

+ checkBuffer();

+ if (*list == '\n' || *list == '\r') {

+ if (state.discardLF())

+ // Ignore this LF

+ state.setDiscardLF(false); // We have discarded 1 LF

+ else

+ *m_dest++ = '\n';

+ /* Check for MS-DOS CRLF sequence */

+ if (*list == '\r')

+ state.setSkipLF(true);

+ list.advance();

+ } else {

+ state.setDiscardLF(false);

+ *m_dest++ = *list;

+ list.advance();

+ }

+ return state;

// Accumulates the raw content of a "special" element into m_scriptCode until
// the end tag held in m_searchStopper is seen. Exactly one of the xmp /
// textarea / title / style / script / iframe state flags must be set (asserted
// below). Returns the updated state; returns early once the end tag has been
// fully consumed.
+HTMLTokenizer::State HTMLTokenizer::parseSpecial(SegmentedString& src, State state)

+ ASSERT(state.inTextArea() || state.inTitle() || state.inIFrame() || !state.hasEntityState());

+ ASSERT(!state.hasTagState());

+ ASSERT(state.inXmp() + state.inTextArea() + state.inTitle() + state.inStyle() + state.inScript() + state.inIFrame() == 1 );

+ if (state.inScript() && !m_currentScriptTagStartLineNumber)

+ m_currentScriptTagStartLineNumber = m_lineNumber; // remember the line on which this script's content starts

+ if (state.inComment())

+ state = parseComment(src, state);

+ int lastDecodedEntityPosition = -1; // m_scriptCode index recorded after an entity parse that wrote nothing; -1 = none

+ while (!src.isEmpty()) {

+ checkScriptBuffer();

+ UChar ch = *src;

+ if (!m_scriptCodeResync && !m_brokenComments && // buffer ends in "<!-" and the next char is '-': a comment starts here

+ !state.inXmp() && ch == '-' && m_scriptCodeSize >= 3 && !src.escaped() &&

+ m_scriptCode[m_scriptCodeSize - 3] == '<' && m_scriptCode[m_scriptCodeSize - 2] == '!' && m_scriptCode[m_scriptCodeSize - 1] == '-' &&

+ (lastDecodedEntityPosition < m_scriptCodeSize - 3)) {

+ state.setInComment(true);

+ state = parseComment(src, state);

+ continue;

+ }

+ if (m_scriptCodeResync && !tquote && ch == '>') { // end tag name already matched and an unquoted '>' closes it

+ src.advancePastNonNewline();

+ m_scriptCodeSize = m_scriptCodeResync - 1; // truncate the buffer to the content before the end tag

+ m_scriptCodeResync = 0;

+ m_scriptCode[m_scriptCodeSize] = m_scriptCode[m_scriptCodeSize + 1] = 0;

+ if (state.inScript())

+ state = scriptHandler(state);

+ else {

+ state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);

+ processToken();

+ if (state.inStyle()) { // build the matching close-tag token for whichever element we are in

+ m_currentToken.tagName = styleTag.localName();

+ m_currentToken.beginTag = false;

+ } else if (state.inTextArea()) {

+ m_currentToken.tagName = textareaTag.localName();

+ m_currentToken.beginTag = false;

+ } else if (state.inTitle()) {

+ m_currentToken.tagName = titleTag.localName();

+ m_currentToken.beginTag = false;

+ } else if (state.inXmp()) {

+ m_currentToken.tagName = xmpTag.localName();

+ m_currentToken.beginTag = false;

+ } else if (state.inIFrame()) {

+ m_currentToken.tagName = iframeTag.localName();

+ m_currentToken.beginTag = false;

+ }

+ processToken();

+ state.setInStyle(false);

+ state.setInScript(false);

+ state.setInTextArea(false);

+ state.setInTitle(false);

+ state.setInXmp(false);

+ state.setInIFrame(false);

+ tquote = NoQuote;

+ m_scriptCodeSize = m_scriptCodeResync = 0;

+ }

+ return state;

+ }

+ // possible end of tagname, lets check.

+ if (!m_scriptCodeResync && !state.escaped() && !src.escaped() && (ch == '>' || ch == '/' || isASCIISpace(ch)) &&

+ m_scriptCodeSize >= m_searchStopperLength &&

+ tagMatch(m_searchStopper, m_scriptCode + m_scriptCodeSize - m_searchStopperLength, m_searchStopperLength) &&

+ (lastDecodedEntityPosition < m_scriptCodeSize - m_searchStopperLength)) {

+ m_scriptCodeResync = m_scriptCodeSize-m_searchStopperLength+1; // 1-based index of the end tag's first char; non-zero marks "inside the end tag"

+ tquote = NoQuote;

+ continue;

+ }

+ if (m_scriptCodeResync && !state.escaped()) { // track quoting inside the end tag so a quoted '>' does not close it

+ if (ch == '\"')

+ tquote = (tquote == NoQuote) ? DoubleQuote : ((tquote == SingleQuote) ? SingleQuote : NoQuote);

+ else if (ch == '\'')

+ tquote = (tquote == NoQuote) ? SingleQuote : (tquote == DoubleQuote) ? DoubleQuote : NoQuote;

+ else if (tquote != NoQuote && (ch == '\r' || ch == '\n'))

+ tquote = NoQuote;

+ }

+ state.setEscaped(!state.escaped() && ch == '\\');

+ if (!m_scriptCodeResync && (state.inTextArea() || state.inTitle() || state.inIFrame()) && !src.escaped() && ch == '&') { // entities are only decoded in textarea / title / iframe content

+ UChar* scriptCodeDest = m_scriptCode + m_scriptCodeSize;

+ src.advancePastNonNewline();

+ state = parseEntity(src, scriptCodeDest, state, m_cBufferPos, true, false);

+ if (scriptCodeDest == m_scriptCode + m_scriptCodeSize) // parseEntity left the destination pointer untouched

+ lastDecodedEntityPosition = m_scriptCodeSize;

+ else

+ m_scriptCodeSize = scriptCodeDest - m_scriptCode;

+ } else {

+ m_scriptCode[m_scriptCodeSize++] = ch;

+ src.advance(m_lineNumber);

+ }

+ return state;

// Called by parseSpecial() once the end tag of a script element has been
// consumed. Emits the buffered script text and the closing script token, then
// either requests the external script named by m_scriptTagSrcAttrValue or
// executes the inline text via scriptExecution(). Input that must wait is
// parked in m_pendingSrc / the current prepending source. Returns the updated
// state.
+HTMLTokenizer::State HTMLTokenizer::scriptHandler(State state)

+ // We are inside a <script>

+ bool doScriptExec = false;

+ int startLine = m_currentScriptTagStartLineNumber + 1; // Script line numbers are 1 based, HTMLTokenizer line numbers are 0 based

+ // Reset m_currentScriptTagStartLineNumber to indicate that we've finished parsing the current script element

+ m_currentScriptTagStartLineNumber = 0;

+ // (Bugzilla 3837) Scripts following a frameset element should not execute or,

+ // in the case of extern scripts, even load.

+ bool followingFrameset = (m_doc->body() && m_doc->body()->hasTagName(framesetTag));

+ CachedScript* cs = 0; // stays 0 unless an external script request succeeds below

+ // don't load external scripts for standalone documents (for now)

+ if (!inViewSourceMode()) {

+ if (!m_scriptTagSrcAttrValue.isEmpty() && m_doc->frame()) {

+ // forget what we just got; load from src url instead

+ if (!m_parser->skipMode() && !followingFrameset) {

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("Requesting script at time %d\n", m_doc->elapsedTime());

+#endif

+ // The parser might have been stopped by for example a window.close call in an earlier script.

+ // If so, we don't want to load scripts.

+ if (!m_parserStopped && (cs = m_doc->docLoader()->requestScript(m_scriptTagSrcAttrValue, m_scriptTagCharsetAttrValue)))

+ m_pendingScripts.append(cs);

+ else

+ m_scriptNode = 0;

+ } else

+ m_scriptNode = 0;

+ m_scriptTagSrcAttrValue = String();

+ } else {

+ // Parse m_scriptCode containing <script> info

+#if USE(LOW_BANDWIDTH_DISPLAY)

+ if (m_doc->inLowBandwidthDisplay()) {

+ // ideal solution is only skipping internal JavaScript if there is external JavaScript.

+ // but internal JavaScript can use document.write() to create an external JavaScript,

+ // so we have to skip internal JavaScript all the time.

+ m_doc->frame()->loader()->needToSwitchOutLowBandwidthDisplay();

+ doScriptExec = false;

+ } else

+#endif

+ doScriptExec = m_scriptNode->shouldExecuteAsJavaScript();

+ m_scriptNode = 0;

+ }

+ state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize), state);

+ RefPtr<Node> node = processToken();

+ String scriptString = node ? node->textContent() : ""; // script text is taken from the node returned by processToken(), if any

+ m_currentToken.tagName = scriptTag.localName();

+ m_currentToken.beginTag = false;

+ processToken();

+ state.setInScript(false);

+ m_scriptCodeSize = m_scriptCodeResync = 0;

+ // FIXME: The script should be syntax highlighted.

+ if (inViewSourceMode())

+ return state;

+ SegmentedString* savedPrependingSrc = m_currentPrependingSrc; // restored before every return below

+ SegmentedString prependingSrc;

+ m_currentPrependingSrc = &prependingSrc;

+ if (!m_parser->skipMode() && !followingFrameset) {

+ if (cs) {

+ if (savedPrependingSrc)

+ savedPrependingSrc->append(m_src);

+ else

+ m_pendingSrc.prepend(m_src);

+ setSrc(SegmentedString());

+ // the addClient() call below may call notifyFinished if the script is already in cache,

+ // and that mucks with the state directly, so we must write it back to the object.

+ m_state = state;

+ bool savedRequestingScript = m_requestingScript;

+ m_requestingScript = true;

+ cs->addClient(this);

+ m_requestingScript = savedRequestingScript;

+ state = m_state;

+ // m_pendingScripts will be empty if the script was already loaded and addClient() executed it

+ if (!m_pendingScripts.isEmpty())

+ state.setLoadingExtScript(true);

+ } else if (!m_fragment && doScriptExec) {

+ if (!m_executingScript)

+ m_pendingSrc.prepend(m_src);

+ else

+ prependingSrc = m_src;

+ setSrc(SegmentedString());

+ state = scriptExecution(ScriptSourceCode(scriptString, m_doc->frame() ? m_doc->frame()->document()->url() : KURL(), startLine), state);

+ }

+ if (!m_executingScript && !state.loadingExtScript()) { // nothing is blocking: resume with the parked input

+ m_src.append(m_pendingSrc);

+ m_pendingSrc.clear();

+ } else if (!prependingSrc.isEmpty()) {

+ // restore first so that the write appends in the right place

+ // (does not hurt to do it again below)

+ m_currentPrependingSrc = savedPrependingSrc;

+ // we need to do this slightly modified bit of one of the write() cases

+ // because we want to prepend to m_pendingSrc rather than appending

+ // if there's no previous prependingSrc

+ if (!m_pendingScripts.isEmpty()) {

+ if (m_currentPrependingSrc)

+ m_currentPrependingSrc->append(prependingSrc);

+ else

+ m_pendingSrc.prepend(prependingSrc);

+ } else {

+ m_state = state;

+ write(prependingSrc, false);

+ state = m_state;

+ }

+#if PRELOAD_SCANNER_ENABLED

+ if (!m_pendingScripts.isEmpty() && !m_executingScript) { // still waiting on a script: scan the pending input with the preload scanner

+ if (!m_preloadScanner)

+ m_preloadScanner.set(new PreloadScanner(m_doc));

+ if (!m_preloadScanner->inProgress()) {

+ m_preloadScanner->begin();

+ m_preloadScanner->write(m_pendingSrc);

+ }

+#endif

+ m_currentPrependingSrc = savedPrependingSrc;

+ return state;

// Executes sourceCode through the frame's loader with m_executingScript
// incremented for the duration. While the script runs m_currentPrependingSrc
// points at a local SegmentedString (presumably filled by write() calls made
// from the script, e.g. document.write -- confirm in write()); afterwards that
// text is either merged back into the input or parked behind pending scripts.
// Does nothing for fragments or documents without a frame.
+HTMLTokenizer::State HTMLTokenizer::scriptExecution(const ScriptSourceCode& sourceCode, State state)

+ if (m_fragment || !m_doc->frame())

+ return state;

+ m_executingScript++;

+ SegmentedString* savedPrependingSrc = m_currentPrependingSrc; // restored before returning

+ SegmentedString prependingSrc;

+ m_currentPrependingSrc = &prependingSrc;

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("beginning script execution at %d\n", m_doc->elapsedTime());

+#endif

+ m_state = state; // publish the local state to the member around executeScript(), then read it back

+ m_doc->frame()->loader()->executeScript(sourceCode);

+ state = m_state;

+ state.setAllowYield(true);

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("ending script execution at %d\n", m_doc->elapsedTime());

+#endif

+ m_executingScript--;

+ if (!m_executingScript && !state.loadingExtScript()) { // outermost script finished and nothing is loading: resume with collected + pending input

+ m_pendingSrc.prepend(prependingSrc);

+ m_src.append(m_pendingSrc);

+ m_pendingSrc.clear();

+ } else if (!prependingSrc.isEmpty()) {

+ // restore first so that the write appends in the right place

+ // (does not hurt to do it again below)

+ m_currentPrependingSrc = savedPrependingSrc;

+ // we need to do this slightly modified bit of one of the write() cases

+ // because we want to prepend to m_pendingSrc rather than appending

+ // if there's no previous prependingSrc

+ if (!m_pendingScripts.isEmpty()) {

+ if (m_currentPrependingSrc)

+ m_currentPrependingSrc->append(prependingSrc);

+ else

+ m_pendingSrc.prepend(prependingSrc);

+#if PRELOAD_SCANNER_ENABLED

+ // We are stuck waiting for another script. Lets check the source that

+ // was just document.write()n for anything to load.

+ PreloadScanner documentWritePreloadScanner(m_doc);

+ documentWritePreloadScanner.begin();

+ documentWritePreloadScanner.write(prependingSrc);

+ documentWritePreloadScanner.end();

+#endif

+ } else {

+ m_state = state;

+ write(prependingSrc, false);

+ state = m_state;

+ }

+ m_currentPrependingSrc = savedPrependingSrc;

+ return state;

+HTMLTokenizer::State HTMLTokenizer::parseComment(SegmentedString& src, State state)

+ // FIXME: Why does this code even run for comments inside <script> and <style>? This seems bogus.

+ checkScriptBuffer(src.length());

+ while (!src.isEmpty()) {

+ UChar ch = *src;

+ m_scriptCode[m_scriptCodeSize++] = ch;

+ if (ch == '>') {

+ bool handleBrokenComments = m_brokenComments && !(state.inScript() || state.inStyle());

+ int endCharsCount = 1; // start off with one for the '>' character

+ if (m_scriptCodeSize > 2 && m_scriptCode[m_scriptCodeSize-3] == '-' && m_scriptCode[m_scriptCodeSize-2] == '-') {

+ endCharsCount = 3;

+ } else if (m_scriptCodeSize > 3 && m_scriptCode[m_scriptCodeSize-4] == '-' && m_scriptCode[m_scriptCodeSize-3] == '-' &&

+ m_scriptCode[m_scriptCodeSize-2] == '!') {

+ // Other browsers will accept --!> as a close comment, even though it's

+ // not technically valid.

+ endCharsCount = 4;

+ }

+ if (handleBrokenComments || endCharsCount > 1) {

+ src.advancePastNonNewline();

+ if (!(state.inTitle() || state.inScript() || state.inXmp() || state.inTextArea() || state.inStyle() || state.inIFrame())) {

+ checkScriptBuffer();

+ m_scriptCode[m_scriptCodeSize] = 0;

+ m_scriptCode[m_scriptCodeSize + 1] = 0;

+ m_currentToken.tagName = commentAtom;

+ m_currentToken.beginTag = true;

+ state = processListing(SegmentedString(m_scriptCode, m_scriptCodeSize - endCharsCount), state);

+ processToken();

+ m_currentToken.tagName = commentAtom;

+ m_currentToken.beginTag = false;

+ processToken();

+ m_scriptCodeSize = 0;

+ }

+ state.setInComment(false);

+ return state; // Finished parsing comment

+ }

+ src.advance(m_lineNumber);

+ }

+ return state;

+HTMLTokenizer::State HTMLTokenizer::parseServer(SegmentedString& src, State state)

+ checkScriptBuffer(src.length());

+ while (!src.isEmpty()) {

+ UChar ch = *src;

+ m_scriptCode[m_scriptCodeSize++] = ch;

+ if (ch == '>' && m_scriptCodeSize > 1 && m_scriptCode[m_scriptCodeSize - 2] == '%') {

+ src.advancePastNonNewline();

+ state.setInServer(false);

+ m_scriptCodeSize = 0;

+ return state; // Finished parsing server include

+ }

+ src.advance(m_lineNumber);

+ }

+ return state;

+HTMLTokenizer::State HTMLTokenizer::parseProcessingInstruction(SegmentedString& src, State state)

+ UChar oldchar = 0;

+ while (!src.isEmpty()) {

+ UChar chbegin = *src;

+ if (chbegin == '\'')

+ tquote = tquote == SingleQuote ? NoQuote : SingleQuote;

+ else if (chbegin == '\"')

+ tquote = tquote == DoubleQuote ? NoQuote : DoubleQuote;

+ // Look for '?>'

+ // Some crappy sites omit the "?" before it, so

+ // we look for an unquoted '>' instead. (IE compatible)

+ else if (chbegin == '>' && (!tquote || oldchar == '?')) {

+ // We got a '?>' sequence

+ state.setInProcessingInstruction(false);

+ src.advancePastNonNewline();

+ state.setDiscardLF(true);

+ return state; // Finished parsing comment!

+ }

+ src.advance(m_lineNumber);

+ oldchar = chbegin;

+ }

+ return state;

+HTMLTokenizer::State HTMLTokenizer::parseText(SegmentedString& src, State state)

+ while (!src.isEmpty()) {

+ UChar cc = *src;

+ if (state.skipLF()) {

+ state.setSkipLF(false);

+ if (cc == '\n') {

+ src.advancePastNewline(m_lineNumber);

+ continue;

+ }

+ // do we need to enlarge the buffer?

+ checkBuffer();

+ if (cc == '\r') {

+ state.setSkipLF(true);

+ *m_dest++ = '\n';

+ } else

+ *m_dest++ = cc;

+ src.advance(m_lineNumber);

+ }

+ return state;

// Incremental decoder for a character reference; the caller has already
// consumed the '&'. When `start` is true the entity state, cBufferPos and
// EntityUnicodeValue are reinitialised; otherwise parsing resumes from
// state.entityState() with the characters collected so far in m_cBuffer.
// A successfully decoded value is pushed onto src (non view-source mode) or
// written verbatim to dest (view-source mode); an unrecognised sequence is
// written to dest as plain text. Returns the updated state.
+HTMLTokenizer::State HTMLTokenizer::parseEntity(SegmentedString& src, UChar*& dest, State state, unsigned& cBufferPos, bool start, bool parsingTag)

+ if (start) {

+ cBufferPos = 0;

+ state.setEntityState(SearchEntity);

+ EntityUnicodeValue = 0;

+ }

+ while(!src.isEmpty()) {

+ UChar cc = *src;

+ switch(state.entityState()) {

+ case NoEntity:

+ ASSERT(state.entityState() != NoEntity); // always fails here: this case is not expected to be reached

+ return state;

+ case SearchEntity:

+ if (cc == '#') { // '#' introduces a numeric reference

+ m_cBuffer[cBufferPos++] = cc;

+ src.advancePastNonNewline();

+ state.setEntityState(NumericSearch);

+ } else

+ state.setEntityState(EntityName);

+ break;

+ case NumericSearch:

+ if (cc == 'x' || cc == 'X') {

+ m_cBuffer[cBufferPos++] = cc;

+ src.advancePastNonNewline();

+ state.setEntityState(Hexadecimal);

+ } else if (cc >= '0' && cc <= '9')

+ state.setEntityState(Decimal);

+ else

+ state.setEntityState(SearchSemicolon);

+ break;

+ case Hexadecimal: {

+ int ll = min(src.length(), 10 - cBufferPos); // never buffer more than 10 characters in total

+ while (ll--) {

+ cc = *src;

+ if (!((cc >= '0' && cc <= '9') || (cc >= 'a' && cc <= 'f') || (cc >= 'A' && cc <= 'F'))) {

+ state.setEntityState(SearchSemicolon);

+ break;

+ }

+ int digit;

+ if (cc < 'A')

+ digit = cc - '0';

+ else

+ digit = (cc - 'A' + 10) & 0xF; // handle both upper and lower case without a branch

+ EntityUnicodeValue = EntityUnicodeValue * 16 + digit;

+ m_cBuffer[cBufferPos++] = cc;

+ src.advancePastNonNewline();

+ }

+ if (cBufferPos == 10)

+ state.setEntityState(SearchSemicolon);

+ break;

+ }

+ case Decimal:

+ {

+ int ll = min(src.length(), 9-cBufferPos); // never buffer more than 9 characters in total

+ while(ll--) {

+ cc = *src;

+ if (!(cc >= '0' && cc <= '9')) {

+ state.setEntityState(SearchSemicolon);

+ break;

+ }

+ EntityUnicodeValue = EntityUnicodeValue * 10 + (cc - '0');

+ m_cBuffer[cBufferPos++] = cc;

+ src.advancePastNonNewline();

+ }

+ if (cBufferPos == 9)

+ state.setEntityState(SearchSemicolon);

+ break;

+ }

+ case EntityName:

+ {

+ int ll = min(src.length(), 9-cBufferPos);

+ while(ll--) {

+ cc = *src;

+ if (!((cc >= 'a' && cc <= 'z') || (cc >= '0' && cc <= '9') || (cc >= 'A' && cc <= 'Z'))) {

+ state.setEntityState(SearchSemicolon);

+ break;

+ }

+ m_cBuffer[cBufferPos++] = cc;

+ src.advancePastNonNewline();

+ }

+ if (cBufferPos == 9)

+ state.setEntityState(SearchSemicolon);

+ if (state.entityState() == SearchSemicolon) {

+ if(cBufferPos > 1) {

+ // Since the maximum length of entity name is 9,

+ // so a single char array which is allocated on

+ // the stack, its length is 10, should be OK.

+ // Also if we have an illegal character, we treat it

+ // as illegal entity name.

+ unsigned testedEntityNameLen = 0;

+ char tmpEntityNameBuffer[10];

+ ASSERT(cBufferPos < 10);

+ for (; testedEntityNameLen < cBufferPos; ++testedEntityNameLen) {

+ if (m_cBuffer[testedEntityNameLen] > 0x7e)

+ break;

+ tmpEntityNameBuffer[testedEntityNameLen] = m_cBuffer[testedEntityNameLen];

+ }

+ const Entity *e;

+ if (testedEntityNameLen == cBufferPos)

+ e = findEntity(tmpEntityNameBuffer, cBufferPos);

+ else

+ e = 0;

+ if(e)

+ EntityUnicodeValue = e->code;

+ // be IE compatible

+ if(parsingTag && EntityUnicodeValue > 255 && *src != ';')

+ EntityUnicodeValue = 0;

+ }

+ else

+ break; // name not terminated yet; when it is, control falls through into SearchSemicolon below

+ }

+ case SearchSemicolon:

+ // Don't allow values that are more than 21 bits.

+ if (EntityUnicodeValue > 0 && EntityUnicodeValue <= 0x10FFFF) {

+ if (!inViewSourceMode()) {

+ if (*src == ';')

+ src.advancePastNonNewline();

+ if (EntityUnicodeValue <= 0xFFFF) {

+ checkBuffer();

+ src.push(fixUpChar(EntityUnicodeValue)); // decoded character goes back onto src rather than into dest

+ } else {

+ // Convert to UTF-16, using surrogate code points.

+ checkBuffer(2);

+ src.push(U16_LEAD(EntityUnicodeValue));

+ src.push(U16_TRAIL(EntityUnicodeValue));

+ }

+ } else {

+ // FIXME: We should eventually colorize entities by sending them as a special token.

+ checkBuffer(11);

+ *dest++ = '&';

+ for (unsigned i = 0; i < cBufferPos; i++)

+ dest[i] = m_cBuffer[i];

+ dest += cBufferPos;

+ if (*src == ';') {

+ *dest++ = ';';

+ src.advancePastNonNewline();

+ }

+ } else {

+ checkBuffer(10);

+ // ignore the sequence, add it to the buffer as plaintext

+ *dest++ = '&';

+ for (unsigned i = 0; i < cBufferPos; i++)

+ dest[i] = m_cBuffer[i];

+ dest += cBufferPos;

+ }

+ state.setEntityState(NoEntity);

+ return state;

+ }

+ return state;

// Character-at-a-time state machine, driven by m_doctypeToken.state(), that
// consumes a doctype declaration and collects its name, public ID and system
// ID in m_doctypeToken. Runs until input is exhausted or the terminating '>'
// clears state.inDoctype(). In view-source mode the consumed characters
// (other than the terminating '>') are also appended to
// m_doctypeToken.m_source. Returns the updated state.
+HTMLTokenizer::State HTMLTokenizer::parseDoctype(SegmentedString& src, State state)

+ ASSERT(state.inDoctype());

+ while (!src.isEmpty() && state.inDoctype()) {

+ UChar c = *src;

+ bool isWhitespace = c == '\r' || c == '\n' || c == '\t' || c == ' ';

+ switch (m_doctypeToken.state()) {

+ case DoctypeBegin: {

+ m_doctypeToken.setState(DoctypeBeforeName); // advance unconditionally; a non-whitespace char is re-examined in the next state

+ if (isWhitespace) {

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ }

+ break;

+ }

+ case DoctypeBeforeName: {

+ if (c == '>') {

+ // Malformed. Just exit.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ if (inViewSourceMode())

+ processDoctypeToken();

+ } else if (isWhitespace) {

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else

+ m_doctypeToken.setState(DoctypeName);

+ break;

+ }

+ case DoctypeName: {

+ if (c == '>') {

+ // Valid doctype. Emit it.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ processDoctypeToken();

+ } else if (isWhitespace) {

+ m_doctypeSearchCount = 0; // Used now to scan for PUBLIC

+ m_doctypeSecondarySearchCount = 0; // Used now to scan for SYSTEM

+ m_doctypeToken.setState(DoctypeAfterName);

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else {

+ src.advancePastNonNewline();

+ m_doctypeToken.m_name.append(c);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ }

+ break;

+ }

+ case DoctypeAfterName: {

+ if (c == '>') {

+ // Valid doctype. Emit it.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ processDoctypeToken();

+ } else if (!isWhitespace) {

+ src.advancePastNonNewline();

+ if (toASCIILower(c) == publicStart[m_doctypeSearchCount]) { // case-insensitive, incremental match of "public"

+ m_doctypeSearchCount++;

+ if (m_doctypeSearchCount == 6)

+ // Found 'PUBLIC' sequence

+ m_doctypeToken.setState(DoctypeBeforePublicID);

+ } else if (m_doctypeSearchCount > 0) { // mismatch after a partial "public" match

+ m_doctypeSearchCount = 0;

+ m_doctypeToken.setState(DoctypeBogus);

+ } else if (toASCIILower(c) == systemStart[m_doctypeSecondarySearchCount]) { // case-insensitive, incremental match of "system"

+ m_doctypeSecondarySearchCount++;

+ if (m_doctypeSecondarySearchCount == 6)

+ // Found 'SYSTEM' sequence

+ m_doctypeToken.setState(DoctypeBeforeSystemID);

+ } else {

+ m_doctypeSecondarySearchCount = 0;

+ m_doctypeToken.setState(DoctypeBogus);

+ }

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else {

+ src.advance(m_lineNumber); // Whitespace keeps us in the after name state.

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ }

+ break;

+ }

+ case DoctypeBeforePublicID: {

+ if (c == '\"' || c == '\'') {

+ tquote = c == '\"' ? DoubleQuote : SingleQuote; // remember which quote must close the ID

+ m_doctypeToken.setState(DoctypePublicID);

+ src.advancePastNonNewline();

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else if (c == '>') {

+ // Considered bogus. Don't process the doctype.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ if (inViewSourceMode())

+ processDoctypeToken();

+ } else if (isWhitespace) {

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else

+ m_doctypeToken.setState(DoctypeBogus);

+ break;

+ }

+ case DoctypePublicID: {

+ if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {

+ src.advancePastNonNewline();

+ m_doctypeToken.setState(DoctypeAfterPublicID);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else if (c == '>') {

+ // Considered bogus. Don't process the doctype.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ if (inViewSourceMode())

+ processDoctypeToken();

+ } else {

+ m_doctypeToken.m_publicID.append(c);

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ }

+ break;

+ }

+ case DoctypeAfterPublicID:

+ if (c == '\"' || c == '\'') {

+ tquote = c == '\"' ? DoubleQuote : SingleQuote;

+ m_doctypeToken.setState(DoctypeSystemID);

+ src.advancePastNonNewline();

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else if (c == '>') {

+ // Valid doctype. Emit it now.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ processDoctypeToken();

+ } else if (isWhitespace) {

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else

+ m_doctypeToken.setState(DoctypeBogus);

+ break;

+ case DoctypeBeforeSystemID:

+ if (c == '\"' || c == '\'') {

+ tquote = c == '\"' ? DoubleQuote : SingleQuote;

+ m_doctypeToken.setState(DoctypeSystemID);

+ src.advancePastNonNewline();

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else if (c == '>') {

+ // Considered bogus. Don't process the doctype.

+ // NOTE(review): unlike the other bogus '>' branches, this one does not call processDoctypeToken() in view-source mode -- confirm this is intentional.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ } else if (isWhitespace) {

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else

+ m_doctypeToken.setState(DoctypeBogus);

+ break;

+ case DoctypeSystemID:

+ if ((c == '\"' && tquote == DoubleQuote) || (c == '\'' && tquote == SingleQuote)) {

+ src.advancePastNonNewline();

+ m_doctypeToken.setState(DoctypeAfterSystemID);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else if (c == '>') {

+ // Considered bogus. Don't process the doctype.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ if (inViewSourceMode())

+ processDoctypeToken();

+ } else {

+ m_doctypeToken.m_systemID.append(c);

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ }

+ break;

+ case DoctypeAfterSystemID:

+ if (c == '>') {

+ // Valid doctype. Emit it now.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ processDoctypeToken();

+ } else if (isWhitespace) {

+ src.advance(m_lineNumber);

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ } else

+ m_doctypeToken.setState(DoctypeBogus);

+ break;

+ case DoctypeBogus:

+ if (c == '>') {

+ // Done with the bogus doctype.

+ src.advancePastNonNewline();

+ state.setInDoctype(false);

+ if (inViewSourceMode())

+ processDoctypeToken();

+ } else {

+ src.advance(m_lineNumber); // Just keep scanning for '>'

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(c);

+ }

+ break;

+ default:

+ break;

+ }

+ return state;

// Tag-level state machine of the tokenizer.
// Consumes characters from `src` for as long as it is non-empty, dispatching on
// state.tagState() (TagName, SearchAttribute, AttributeName, SearchEqual, SearchValue,
// QuotedValue, Value, SearchEnd) and returns the updated state.
// m_cBufferPos is mirrored in the local cBufferPos and written back before each return.
// When a tag is complete (SearchEnd), the token is handed to processToken() and, for
// script/style/textarea/title/xmp/iframe start tags, parseSpecial() is entered directly.
// NOTE(review): this listing is a diff rendering in which some brace-only lines
// (including the function's own braces) are not shown, so the visible nesting is incomplete.
+HTMLTokenizer::State HTMLTokenizer::parseTag(SegmentedString& src, State state)

+ ASSERT(!state.hasEntityState());

+ unsigned cBufferPos = m_cBufferPos;

+ bool lastIsSlash = false;

+ while (!src.isEmpty()) {

+ checkBuffer();

+ switch(state.tagState()) {

+ case NoTag:

+ {

+ m_cBufferPos = cBufferPos;

+ return state;

+ }

+ case TagName:

+ {

// searchCount > 0: still matching the '<!--' prefix held in commentStart.
+ if (searchCount > 0) {

+ if (*src == commentStart[searchCount]) {

+ searchCount++;

+ if (searchCount == 2)

+ m_doctypeSearchCount++; // A '!' is also part of a doctype, so we are moving through that still as well.

+ else

+ m_doctypeSearchCount = 0;

+ if (searchCount == 4) {

+ // Found '<!--' sequence

+ src.advancePastNonNewline();

+ m_dest = m_buffer; // ignore the previous part of this tag

+ state.setInComment(true);

+ state.setTagState(NoTag);

+ // Fix bug 34302 at kde.bugs.org. Go ahead and treat

+ // <!--> as a valid comment, since both mozilla and IE on windows

+ // can handle this case. Only do this in quirks mode. -dwh

+ if (!src.isEmpty() && *src == '>' && m_doc->inCompatMode()) {

+ state.setInComment(false);

+ src.advancePastNonNewline();

+ if (!src.isEmpty())

+ m_cBuffer[cBufferPos++] = *src;

+ } else

+ state = parseComment(src, state);

+ m_cBufferPos = cBufferPos;

+ return state; // Finished parsing tag!

+ }

+ m_cBuffer[cBufferPos++] = *src;

+ src.advancePastNonNewline();

+ break;

+ } else

+ searchCount = 0; // Stop looking for '<!--' sequence

+ }

// m_doctypeSearchCount > 0: still matching (case-insensitively) the prefix held in doctypeStart.
+ if (m_doctypeSearchCount > 0) {

+ if (toASCIILower(*src) == doctypeStart[m_doctypeSearchCount]) {

+ m_doctypeSearchCount++;

+ m_cBuffer[cBufferPos++] = *src;

+ src.advancePastNonNewline();

+ if (m_doctypeSearchCount == 9) {

+ // Found '<!DOCTYPE' sequence

+ state.setInDoctype(true);

+ state.setTagState(NoTag);

+ m_doctypeToken.reset();

+ if (inViewSourceMode())

+ m_doctypeToken.m_source.append(m_cBuffer, cBufferPos);

+ state = parseDoctype(src, state);

+ m_cBufferPos = cBufferPos;

+ return state;

+ }

+ break;

+ } else

+ m_doctypeSearchCount = 0; // Stop looking for '<!DOCTYPE' sequence

+ }

+ bool finish = false;

// Copy at most as many characters as still fit into m_cBuffer (CBUFLEN - cBufferPos).
+ unsigned int ll = min(src.length(), CBUFLEN - cBufferPos);

+ while (ll--) {

+ UChar curchar = *src;

+ if (isASCIISpace(curchar) || curchar == '>' || curchar == '<') {

+ finish = true;

+ break;

+ }

+ // tolower() shows up on profiles. This is faster!

+ if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())

+ m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');

+ else

+ m_cBuffer[cBufferPos++] = curchar;

+ src.advancePastNonNewline();

+ }

+ // Disadvantage: we add the possible rest of the tag

+ // as attribute names. ### judge if this causes problems

+ if (finish || CBUFLEN == cBufferPos) {

+ bool beginTag;

+ UChar* ptr = m_cBuffer;

+ unsigned int len = cBufferPos;

+ m_cBuffer[cBufferPos] = '\0';

+ if ((cBufferPos > 0) && (*ptr == '/')) {

+ // End Tag

+ beginTag = false;

+ ptr++;

+ len--;

+ }

+ else

+ // Start Tag

+ beginTag = true;

+ // Ignore the / in fake xml tags like <br/>. We trim off the "/" so that we'll get "br" as the tag name and not "br/".

+ if (len > 1 && ptr[len-1] == '/' && !inViewSourceMode())

+ ptr[--len] = '\0';

+ // Now that we've shaved off any invalid / that might have followed the name), make the tag.

+ // FIXME: FireFox and WinIE turn !foo nodes into comments, we ignore comments. (fast/parser/tag-with-exclamation-point.html)

+ if (ptr[0] != '!' || inViewSourceMode()) {

+ m_currentToken.tagName = AtomicString(ptr);

+ m_currentToken.beginTag = beginTag;

+ }

+ m_dest = m_buffer;

+ state.setTagState(SearchAttribute);

+ cBufferPos = 0;

+ }

+ break;

+ }

+ case SearchAttribute:

+ while(!src.isEmpty()) {

+ UChar curchar = *src;

+ // In this mode just ignore any quotes we encounter and treat them like spaces.

+ if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"') {

+ if (curchar == '<' || curchar == '>')

+ state.setTagState(SearchEnd);

+ else

+ state.setTagState(AttributeName);

+ cBufferPos = 0;

+ break;

+ }

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar(curchar);

+ src.advance(m_lineNumber);

+ }

+ break;

+ case AttributeName:

+ {

+ int ll = min(src.length(), CBUFLEN - cBufferPos);

+ while (ll--) {

+ UChar curchar = *src;

+ // If we encounter a "/" when scanning an attribute name, treat it as a delimiter. This allows the

+ // cases like <input type=checkbox checked/> to work (and accommodates XML-style syntax as per HTML5).

// The test below matches '<', '=', '>' (the contiguous range '<'..'>'), ASCII whitespace, or '/'.
+ if (curchar <= '>' && (curchar >= '<' || isASCIISpace(curchar) || curchar == '/')) {

+ m_cBuffer[cBufferPos] = '\0';

+ m_attrName = AtomicString(m_cBuffer);

+ m_dest = m_buffer;

+ *m_dest++ = 0;

+ state.setTagState(SearchEqual);

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar('a');

+ break;

+ }

+ // tolower() shows up on profiles. This is faster!

+ if (curchar >= 'A' && curchar <= 'Z' && !inViewSourceMode())

+ m_cBuffer[cBufferPos++] = curchar + ('a' - 'A');

+ else

+ m_cBuffer[cBufferPos++] = curchar;

+ src.advance(m_lineNumber);

+ }

// Buffer full: end the attribute name here, exactly as if a delimiter had been seen.
+ if (cBufferPos == CBUFLEN) {

+ m_cBuffer[cBufferPos] = '\0';

+ m_attrName = AtomicString(m_cBuffer);

+ m_dest = m_buffer;

+ *m_dest++ = 0;

+ state.setTagState(SearchEqual);

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar('a');

+ }

+ break;

+ }

+ case SearchEqual:

+ while (!src.isEmpty()) {

+ UChar curchar = *src;

+ if (lastIsSlash && curchar == '>') {

+ // This is a quirk (with a long sad history). We have to do this

+ // since widgets do <script src="foo.js"/> and expect the tag to close.

+ if (m_currentToken.tagName == scriptTag)

+ m_currentToken.selfClosingTag = true;

+ m_currentToken.brokenXMLStyle = true;

+ }

+ // In this mode just ignore any quotes or slashes we encounter and treat them like spaces.

+ if (!isASCIISpace(curchar) && curchar != '\'' && curchar != '"' && curchar != '/') {

+ if (curchar == '=') {

+ state.setTagState(SearchValue);

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar(curchar);

+ src.advancePastNonNewline();

+ } else {

+ m_currentToken.addAttribute(m_attrName, emptyAtom, inViewSourceMode());

+ m_dest = m_buffer;

+ state.setTagState(SearchAttribute);

+ lastIsSlash = false;

+ }

+ break;

+ }

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar(curchar);

+ lastIsSlash = curchar == '/';

+ src.advance(m_lineNumber);

+ }

+ break;

+ case SearchValue:

+ while (!src.isEmpty()) {

+ UChar curchar = *src;

+ if (!isASCIISpace(curchar)) {

+ if (curchar == '\'' || curchar == '\"') {

+ tquote = curchar == '\"' ? DoubleQuote : SingleQuote;

+ state.setTagState(QuotedValue);

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar(curchar);

+ src.advancePastNonNewline();

+ } else

+ state.setTagState(Value);

+ break;

+ }

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar(curchar);

+ src.advance(m_lineNumber);

+ }

+ break;

+ case QuotedValue:

+ while (!src.isEmpty()) {

+ checkBuffer();

+ UChar curchar = *src;

+ if (curchar <= '>' && !src.escaped()) {

+ if (curchar == '>' && m_attrName.isEmpty()) {

+ // Handle a case like <img '>. Just go ahead and be willing

+ // to close the whole tag. Don't consume the character and

+ // just go back into SearchEnd while ignoring the whole

+ // value.

+ // FIXME: Note that this is actually not a very good solution.

+ // It doesn't handle the general case of

+ // unmatched quotes among attributes that have names. -dwh

+ while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))

+ m_dest--; // remove trailing newlines

+ AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);

+ if (!attributeValue.contains('/'))

+ m_attrName = attributeValue; // Just make the name/value match. (FIXME: Is this some WinIE quirk?)

+ m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar('x');

+ state.setTagState(SearchAttribute);

+ m_dest = m_buffer;

+ tquote = NoQuote;

+ break;

+ }

+ if (curchar == '&') {

+ src.advancePastNonNewline();

+ state = parseEntity(src, m_dest, state, cBufferPos, true, true);

+ break;

+ }

+ if ((tquote == SingleQuote && curchar == '\'') || (tquote == DoubleQuote && curchar == '\"')) {

+ // some <input type=hidden> rely on trailing spaces. argh

+ while (m_dest > m_buffer + 1 && (m_dest[-1] == '\n' || m_dest[-1] == '\r'))

+ m_dest--; // remove trailing newlines

+ AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);

+ if (m_attrName.isEmpty() && !attributeValue.contains('/')) {

+ m_attrName = attributeValue; // Make the name match the value. (FIXME: Is this a WinIE quirk?)

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar('x');

+ } else if (inViewSourceMode())

+ m_currentToken.addViewSourceChar('v');

+ m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());

+ m_dest = m_buffer;

+ state.setTagState(SearchAttribute);

+ tquote = NoQuote;

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar(curchar);

+ src.advancePastNonNewline();

+ break;

+ }

+ *m_dest++ = curchar;

+ src.advance(m_lineNumber);

+ }

+ break;

+ case Value:

+ while(!src.isEmpty()) {

+ checkBuffer();

+ UChar curchar = *src;

+ if (curchar <= '>' && !src.escaped()) {

+ // parse Entities

+ if (curchar == '&') {

+ src.advancePastNonNewline();

+ state = parseEntity(src, m_dest, state, cBufferPos, true, true);

+ break;

+ }

+ // no quotes. Every space means end of value

+ // '/' does not delimit in IE!

+ if (isASCIISpace(curchar) || curchar == '>') {

+ AtomicString attributeValue(m_buffer + 1, m_dest - m_buffer - 1);

+ m_currentToken.addAttribute(m_attrName, attributeValue, inViewSourceMode());

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar('v');

+ m_dest = m_buffer;

+ state.setTagState(SearchAttribute);

+ break;

+ }

+ *m_dest++ = curchar;

+ src.advance(m_lineNumber);

+ }

+ break;

+ case SearchEnd:

+ {

+ while (!src.isEmpty()) {

+ UChar ch = *src;

+ if (ch == '>' || ch == '<')

+ break;

+ if (ch == '/')

+ m_currentToken.selfClosingTag = true;

+ if (inViewSourceMode())

+ m_currentToken.addViewSourceChar(ch);

+ src.advance(m_lineNumber);

+ }

+ if (src.isEmpty())

+ break;

+ searchCount = 0; // Stop looking for '<!--' sequence

+ state.setTagState(NoTag);

+ tquote = NoQuote;

// A terminating '>' is consumed; a '<' is left in place so it can start the next tag.
+ if (*src != '<')

+ src.advance(m_lineNumber);

+ if (m_currentToken.tagName == nullAtom) { //stop if tag is unknown

+ m_cBufferPos = cBufferPos;

+ return state;

+ }

// Snapshot token fields now; the token is handed to processToken() further down.
+ AtomicString tagName = m_currentToken.tagName;

+ // Handle <script src="foo"/> like Mozilla/Opera. We have to do this now for Dashboard

+ // compatibility.

+ bool isSelfClosingScript = m_currentToken.selfClosingTag && m_currentToken.beginTag && m_currentToken.tagName == scriptTag;

+ bool beginTag = !m_currentToken.selfClosingTag && m_currentToken.beginTag;

+ if (m_currentToken.beginTag && m_currentToken.tagName == scriptTag && !inViewSourceMode() && !m_parser->skipMode()) {

+ Attribute* a = 0;

+ m_scriptTagSrcAttrValue = String();

+ m_scriptTagCharsetAttrValue = String();

+ if (m_currentToken.attrs && !m_fragment) {

+ if (m_doc->frame() && m_doc->frame()->script()->isEnabled()) {

+ if ((a = m_currentToken.attrs->getAttributeItem(srcAttr)))

+ m_scriptTagSrcAttrValue = m_doc->completeURL(parseURL(a->value())).string();

+ }

+ RefPtr<Node> n = processToken();

+ m_cBufferPos = cBufferPos;

+ if (n || inViewSourceMode()) {

+ if ((tagName == preTag || tagName == listingTag) && !inViewSourceMode()) {

+ if (beginTag)

+ state.setDiscardLF(true); // Discard the first LF after we open a pre.

+ } else if (tagName == scriptTag) {

+ ASSERT(!m_scriptNode);

+ m_scriptNode = static_pointer_cast<HTMLScriptElement>(n);

+ if (m_scriptNode)

+ m_scriptTagCharsetAttrValue = m_scriptNode->scriptCharset();

+ if (beginTag) {

+ m_searchStopper = scriptEnd;

+ m_searchStopperLength = 8;

+ state.setInScript(true);

+ state = parseSpecial(src, state);

+ } else if (isSelfClosingScript) { // Handle <script src="foo"/>

+ state.setInScript(true);

+ state = scriptHandler(state);

+ }

+ } else if (tagName == styleTag) {

+ if (beginTag) {

+ m_searchStopper = styleEnd;

+ m_searchStopperLength = 7;

+ state.setInStyle(true);

+ state = parseSpecial(src, state);

+ }

+ } else if (tagName == textareaTag) {

+ if (beginTag) {

+ m_searchStopper = textareaEnd;

+ m_searchStopperLength = 10;

+ state.setInTextArea(true);

+ state = parseSpecial(src, state);

+ }

+ } else if (tagName == titleTag) {

+ if (beginTag) {

+ m_searchStopper = titleEnd;

+ m_searchStopperLength = 7;

// Keep copies so tokenizing can be rewound if no closing title tag is found (see below).
+ State savedState = state;

+ SegmentedString savedSrc = src;

+ long savedLineno = m_lineNumber;

+ state.setInTitle(true);

+ state = parseSpecial(src, state);

+ if (state.inTitle() && src.isEmpty()) {

+ // We just ate the rest of the document as the title #text node!

+ // Reset the state then retokenize without special title handling.

+ // Let the parser clean up the missing </title> tag.

+ // FIXME: This is incorrect, because src.isEmpty() doesn't mean we're

+ // at the end of the document unless m_noMoreData is also true. We need

+ // to detect this case elsewhere, and save the state somewhere other

+ // than a local variable.

+ state = savedState;

+ src = savedSrc;

+ m_lineNumber = savedLineno;

+ m_scriptCodeSize = 0;

+ }

+ } else if (tagName == xmpTag) {

+ if (beginTag) {

+ m_searchStopper = xmpEnd;

+ m_searchStopperLength = 5;

+ state.setInXmp(true);

+ state = parseSpecial(src, state);

+ }

+ } else if (tagName == iframeTag) {

+ if (beginTag) {

+ m_searchStopper = iframeEnd;

+ m_searchStopperLength = 8;

+ state.setInIFrame(true);

+ state = parseSpecial(src, state);

+ }

+ if (tagName == plaintextTag)

+ state.setInPlainText(beginTag);

+ return state; // Finished parsing tag!

+ }

+ } // end switch

+ }

+ m_cBufferPos = cBufferPos;

+ return state;

// Decides whether the tokenizing loop in write() may keep going.
// processedCount: running counter owned by the caller; reset to 0 here whenever the
//                 time check is performed, otherwise incremented.
// startTime:      currentTime() value captured by the caller when it began.
// state:          its allowYield flag is read and then always cleared.
// Returns false after arming m_timer with a zero delay when more than
// m_tokenizerTimeDelay has elapsed since startTime; returns true otherwise.
+inline bool HTMLTokenizer::continueProcessing(int& processedCount, double startTime, State &state)

+ // We don't want to be checking elapsed time with every character, so we only check after we've

+ // processed a certain number of characters.

+ bool allowedYield = state.allowYield();

+ state.setAllowYield(false);

// Never yield while an external script is loading, in forced-synchronous mode, or while a script executes.
+ if (!state.loadingExtScript() && !state.forceSynchronous() && !m_executingScript && (processedCount > m_tokenizerChunkSize || allowedYield)) {

+ processedCount = 0;

+ if (currentTime() - startTime > m_tokenizerTimeDelay) {

+ /* FIXME: We'd like to yield aggressively to give stylesheets the opportunity to

+ load, but this hurts overall performance on slower machines. For now turn this

+ off.

+ || (!m_doc->haveStylesheetsLoaded() &&

+ (m_doc->documentElement()->id() != ID_HTML || m_doc->body()))) {*/

+ // Schedule the timer to keep processing as soon as possible.

+ m_timer.startOneShot(0);

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

// NOTE(review): the message hard-codes "500ms" while the threshold is m_tokenizerTimeDelay — confirm they agree.
+ if (currentTime() - startTime > m_tokenizerTimeDelay)

+ printf("Deferring processing of data because 500ms elapsed away from event loop.\n");

+#endif

+ return false;

+ }

+ processedCount++;

+ return true;

// Feeds `str` to the tokenizer and tokenizes as much of m_src as currently allowed.
// str:        new input; copied into a local SegmentedString before use.
// appendData: when true and a script is executing, the input is queued instead of parsed;
//             also gates the preload-scanner calls.
// Input is queued (m_currentPrependingSrc / m_pendingSrc) rather than parsed while a script
// is executing with appendData set, or while m_pendingScripts is non-empty.
// Returns true only when end() was called from here (the comment on that call says this
// deletes the tokenizer); every other path returns false.
// NOTE(review): this listing is a diff rendering in which some brace-only lines are not
// shown, so the visible nesting is incomplete.
+bool HTMLTokenizer::write(const SegmentedString& str, bool appendData)

+ if (!m_buffer)

+ return false;

+ if (m_parserStopped)

+ return false;

+ SegmentedString source(str);

+ if (m_executingScript)

+ source.setExcludeLineNumbers();

+ if ((m_executingScript && appendData) || !m_pendingScripts.isEmpty()) {

+ // don't parse; we will do this later

+ if (m_currentPrependingSrc)

+ m_currentPrependingSrc->append(source);

+ else {

+ m_pendingSrc.append(source);

+#if PRELOAD_SCANNER_ENABLED

+ if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)

+ m_preloadScanner->write(source);

+#endif

+ }

+ return false;

+ }

+#if PRELOAD_SCANNER_ENABLED

+ if (m_preloadScanner && m_preloadScanner->inProgress() && appendData)

+ m_preloadScanner->end();

+#endif

+ if (!m_src.isEmpty())

+ m_src.append(source);

+ else

+ setSrc(source);

+ // Once a timer is set, it has control of when the tokenizer continues.

+ if (m_timer.isActive())

+ return false;

// Remember the previous m_inWrite value so it can be restored after the loop.
+ bool wasInWrite = m_inWrite;

+ m_inWrite = true;

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("Beginning write at time %d\n", m_doc->elapsedTime());

+#endif

+ int processedCount = 0;

+ double startTime = currentTime();

+ Frame* frame = m_doc->frame();

// Work on a local copy of the state; it is stored back into m_state after the loop.
+ State state = m_state;

+ while (!m_src.isEmpty() && (!frame || !frame->loader()->isScheduledLocationChangePending())) {

+ if (!continueProcessing(processedCount, startTime, state))

+ break;

+ // do we need to enlarge the buffer?

+ checkBuffer();

+ UChar cc = *m_src;

+ bool wasSkipLF = state.skipLF();

+ if (wasSkipLF)

+ state.setSkipLF(false);

+ if (wasSkipLF && (cc == '\n'))

+ m_src.advance();

+ else if (state.needsSpecialWriteHandling()) {

+ // it's important to keep needsSpecialWriteHandling with the flags this block tests

+ if (state.hasEntityState())

+ state = parseEntity(m_src, m_dest, state, m_cBufferPos, false, state.hasTagState());

+ else if (state.inPlainText())

+ state = parseText(m_src, state);

+ else if (state.inAnySpecial())

+ state = parseSpecial(m_src, state);

+ else if (state.inComment())

+ state = parseComment(m_src, state);

+ else if (state.inDoctype())

+ state = parseDoctype(m_src, state);

+ else if (state.inServer())

+ state = parseServer(m_src, state);

+ else if (state.inProcessingInstruction())

+ state = parseProcessingInstruction(m_src, state);

+ else if (state.hasTagState())

+ state = parseTag(m_src, state);

+ else if (state.startTag()) {

+ state.setStartTag(false);

// cc is the first character after '<'.
+ switch(cc) {

+ case '/':

+ break;

+ case '!': {

+ // <!-- comment --> or <!DOCTYPE ...>

+ searchCount = 1; // Look for '<!--' sequence to start comment or '<!DOCTYPE' sequence to start doctype

+ m_doctypeSearchCount = 1;

+ break;

+ }

+ case '?': {

+ // xml processing instruction

+ state.setInProcessingInstruction(true);

+ tquote = NoQuote;

+ state = parseProcessingInstruction(m_src, state);

+ continue;

// NOTE(review): the break below is unreachable after the continue above.
+ break;

+ }

+ case '%':

+ if (!m_brokenServer) {

+ // <% server stuff, handle as comment %>

+ state.setInServer(true);

+ tquote = NoQuote;

+ state = parseServer(m_src, state);

+ continue;

+ }

+ // else fall through

+ default: {

+ if( ((cc >= 'a') && (cc <= 'z')) || ((cc >= 'A') && (cc <= 'Z'))) {

+ // Start of a Start-Tag

+ } else {

+ // Invalid tag

+ // Add as is

+ *m_dest = '<';

+ m_dest++;

+ continue;

+ }

+ }; // end case

// Hand over whatever is buffered so far before starting on the tag name.
+ processToken();

+ m_cBufferPos = 0;

+ state.setTagState(TagName);

+ state = parseTag(m_src, state);

+ }

+ } else if (cc == '&' && !m_src.escaped()) {

+ m_src.advancePastNonNewline();

+ state = parseEntity(m_src, m_dest, state, m_cBufferPos, true, state.hasTagState());

+ } else if (cc == '<' && !m_src.escaped()) {

+ m_currentTagStartLineNumber = m_lineNumber;

+ m_src.advancePastNonNewline();

+ state.setStartTag(true);

+ state.setDiscardLF(false);

+ } else if (cc == '\n' || cc == '\r') {

+ if (state.discardLF())

+ // Ignore this LF

+ state.setDiscardLF(false); // We have discarded 1 LF

+ else {

+ // Process this LF

+ *m_dest++ = '\n';

+ if (cc == '\r' && !m_src.excludeLineNumbers())

+ m_lineNumber++;

+ }

+ /* Check for MS-DOS CRLF sequence */

+ if (cc == '\r')

+ state.setSkipLF(true);

+ m_src.advance(m_lineNumber);

+ } else {

+ state.setDiscardLF(false);

+ *m_dest++ = cc;

+ m_src.advancePastNonNewline();

+ }

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("Ending write at time %d\n", m_doc->elapsedTime());

+#endif

+ m_inWrite = wasInWrite;

+ m_state = state;

+ if (m_noMoreData && !m_inWrite && !state.loadingExtScript() && !m_executingScript && !m_timer.isActive()) {

+ end(); // this actually causes us to be deleted

+ return true;

+ }

+ return false;

// Stops parsing: delegates to Tokenizer::stopParsing(), cancels the continuation timer and,
// for non-fragment parsing with a frame present, tells the frame loader the data is processed.
+void HTMLTokenizer::stopParsing()

+ Tokenizer::stopParsing();

+ m_timer.stop();

+ // The part needs to know that the tokenizer has finished with its data,

+ // regardless of whether it happened naturally or due to manual intervention.

+ if (!m_fragment && m_doc->frame())

+ m_doc->frame()->loader()->tokenizerProcessedData();

// Returns true while the continuation timer is armed or a write() call is in progress.
+bool HTMLTokenizer::processingData() const

+ return m_timer.isActive() || m_inWrite;

// Callback for m_timer: resumes tokenizing by calling write() with empty input,
// unless a layout is pending with no minimum layout delay, in which case the timer is re-armed.
// The Timer argument is unused.
+void HTMLTokenizer::timerFired(Timer<HTMLTokenizer>*)

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("Beginning timer write at time %d\n", m_doc->elapsedTime());

+#endif

+ if (m_doc->view() && m_doc->view()->layoutPending() && !m_doc->minimumLayoutDelay()) {

+ // Restart the timer and let layout win. This is basically a way of ensuring that the layout

+ // timer has higher priority than our timer.

+ m_timer.startOneShot(0);

+ return;

+ }

+ // Invoke write() as though more data came in. This might cause us to get deleted.

+ write(SegmentedString(), true);

// Finishes tokenizing: flushes the last token (unless a tag is mid-parse), frees
// m_scriptCode and m_buffer, then notifies the parser, or the document in view-source mode.
// Callers in this file annotate the call with "this actually causes us to be deleted".
+void HTMLTokenizer::end()

+ ASSERT(!m_timer.isActive());

+ m_timer.stop(); // Only helps if assertion above fires, but do it anyway.

+ if (m_buffer) {

+ // parseTag is using the buffer for different matters

+ if (!m_state.hasTagState())

+ processToken();

+ fastFree(m_scriptCode);

+ m_scriptCode = 0;

+ m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;

+ fastFree(m_buffer);

+ m_buffer = 0;

+ }

+ if (!inViewSourceMode())

+ m_parser->finished();

+ else

+ m_doc->finishedParsing();

// Called when no more input will arrive.
// While the tokenizer is still inside an unterminated comment or server block with buffered
// text, it marks the construct as broken (m_brokenComments / m_brokenServer), frees the
// buffer, clears both flags and re-feeds the buffered text through write().
// Afterwards sets m_noMoreData and calls end() unless a write, an external script load,
// a script execution or the timer is still outstanding.
+void HTMLTokenizer::finish()

+ // do this as long as we don't find matching comment ends

+ while ((m_state.inComment() || m_state.inServer()) && m_scriptCode && m_scriptCodeSize) {

+ // we've found an unmatched comment start

+ if (m_state.inComment())

+ m_brokenComments = true;

+ else

+ m_brokenServer = true;

+ checkScriptBuffer();

+ m_scriptCode[m_scriptCodeSize] = 0;

+ m_scriptCode[m_scriptCodeSize + 1] = 0;

+ int pos;

+ String food;

+ if (m_state.inScript() || m_state.inStyle() || m_state.inTextArea())

+ food = String(m_scriptCode, m_scriptCodeSize);

+ else if (m_state.inServer()) {

+ food = "<";

+ food.append(m_scriptCode, m_scriptCodeSize);

+ } else {

// Re-feed only the text that follows the first '>'.
// NOTE(review): presumably find() returns -1 when there is no '>', which would make the
// expression below take the whole buffer — confirm against find()'s contract.
+ pos = find(m_scriptCode, m_scriptCodeSize, '>');

+ food = String(m_scriptCode + pos + 1, m_scriptCodeSize - pos - 1);

+ }

+ fastFree(m_scriptCode);

+ m_scriptCode = 0;

+ m_scriptCodeSize = m_scriptCodeCapacity = m_scriptCodeResync = 0;

+ m_state.setInComment(false);

+ m_state.setInServer(false);

+ if (!food.isEmpty())

+ write(food, true);

+ }

+ // this indicates we will not receive any more data... but if we are waiting on

+ // an external script to load, we can't finish parsing until that is done

+ m_noMoreData = true;

+ if (!m_inWrite && !m_state.loadingExtScript() && !m_executingScript && !m_timer.isActive())

+ end(); // this actually causes us to be deleted

// Emits the current token.
// If text has accumulated in m_buffer it becomes the token's text (tagName is set to textAtom
// unless the token is a comment). If there is neither text nor a tag name, the token is reset
// and 0 is returned. Otherwise, unless parsing was stopped, the token goes to the view-source
// document or to m_parser->parseToken(). The token is reset afterwards.
// Returns the node produced by the parser, or null when none was produced.
+PassRefPtr<Node> HTMLTokenizer::processToken()

// No script controller for fragment parsing or when the document has no frame.
+ ScriptController* scriptController = (!m_fragment && m_doc->frame()) ? m_doc->frame()->script() : 0;

+ if (scriptController && scriptController->isEnabled())

+ // FIXME: Why isn't this m_currentScriptTagStartLineNumber? I suspect this is wrong.

+ scriptController->setEventHandlerLineno(m_currentTagStartLineNumber + 1); // Script line numbers are 1 based.

+ if (m_dest > m_buffer) {

+ m_currentToken.text = StringImpl::createStrippingNullCharacters(m_buffer, m_dest - m_buffer);

+ if (m_currentToken.tagName != commentAtom)

+ m_currentToken.tagName = textAtom;

+ } else if (m_currentToken.tagName == nullAtom) {

+ m_currentToken.reset();

+ if (scriptController)

+ scriptController->setEventHandlerLineno(m_lineNumber + 1); // Script line numbers are 1 based.

+ return 0;

+ }

// The text buffer has been consumed; rewind the write pointer.
+ m_dest = m_buffer;

+ RefPtr<Node> n;

+ if (!m_parserStopped) {

+ if (NamedMappedAttrMap* map = m_currentToken.attrs.get())

+ map->shrinkToLength();

+ if (inViewSourceMode())

+ static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceToken(&m_currentToken);

+ else

+ // pass the token over to the parser, the parser DOES NOT delete the token

+ n = m_parser->parseToken(&m_currentToken);

+ }

+ m_currentToken.reset();

+ if (scriptController)

+ scriptController->setEventHandlerLineno(0);

+ return n.release();

// Hands m_doctypeToken to the view-source document in view-source mode, otherwise to the parser.
+void HTMLTokenizer::processDoctypeToken()

+ if (inViewSourceMode())

+ static_cast<HTMLViewSourceDocument*>(m_doc)->addViewSourceDoctypeToken(&m_doctypeToken);

+ else

+ m_parser->parseDoctypeToken(&m_doctypeToken);

// Destructor: asserts that no write() is in progress, then calls reset().
+HTMLTokenizer::~HTMLTokenizer()

+ ASSERT(!m_inWrite);

+ reset();

// Grows m_buffer to max(2 * m_bufferSize, m_bufferSize + len) UChars via fastRealloc and
// re-bases m_dest so it keeps the same offset into the (possibly moved) buffer.
// NOTE(review): the size arithmetic is done in int with no visible overflow check — confirm
// callers bound len.
+void HTMLTokenizer::enlargeBuffer(int len)

+ int newSize = max(m_bufferSize * 2, m_bufferSize + len);

+ int oldOffset = m_dest - m_buffer;

+ m_buffer = static_cast<UChar*>(fastRealloc(m_buffer, newSize * sizeof(UChar)));

+ m_dest = m_buffer + oldOffset;

+ m_bufferSize = newSize;

// Grows m_scriptCode to max(2 * m_scriptCodeCapacity, m_scriptCodeCapacity + len) UChars
// via fastRealloc and records the new capacity.
+void HTMLTokenizer::enlargeScriptBuffer(int len)

+ int newSize = max(m_scriptCodeCapacity * 2, m_scriptCodeCapacity + len);

+ m_scriptCode = static_cast<UChar*>(fastRealloc(m_scriptCode, newSize * sizeof(UChar)));

+ m_scriptCodeCapacity = newSize;

// Expected to be called once stylesheets have loaded (asserted). If scripts were held back
// waiting for stylesheets, re-enters notifyFinished() with a null resource to run them.
+void HTMLTokenizer::executeScriptsWaitingForStylesheets()

+ ASSERT(m_doc->haveStylesheetsLoaded());

+ if (m_hasScriptsWaitingForStylesheets)

+ notifyFinished(0);

// Runs pending external scripts whose resources have finished loading.
// Bails out (setting m_hasScriptsWaitingForStylesheets) while stylesheets are still loading.
// Otherwise, for each loaded script at the front of m_pendingScripts: removes it from the
// queue, dispatches an error event or executes it and dispatches a load event, and, unless
// called synchronously from scriptHandler() (m_requestingScript), feeds m_pendingSrc back
// through write(). The CachedResource argument is unused.
// NOTE(review): this listing is a diff rendering in which some brace-only lines are not
// shown, so the visible nesting is incomplete.
+void HTMLTokenizer::notifyFinished(CachedResource*)

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("script loaded at %d\n", m_doc->elapsedTime());

+#endif

+ ASSERT(!m_pendingScripts.isEmpty());

+ // Make external scripts wait for external stylesheets.

+ // FIXME: This needs to be done for inline scripts too.

+ m_hasScriptsWaitingForStylesheets = !m_doc->haveStylesheetsLoaded();

+ if (m_hasScriptsWaitingForStylesheets)

+ return;

+ bool finished = false;

+ while (!finished && m_pendingScripts.first()->isLoaded()) {

+ CachedScript* cs = m_pendingScripts.first().get();

+ m_pendingScripts.removeFirst();

+ ASSERT(cache()->disabled() || cs->accessCount() > 0);

+ setSrc(SegmentedString());

+ // make sure we forget about the script before we execute the new one

+ // infinite recursion might happen otherwise

+ ScriptSourceCode sourceCode(cs);

+ bool errorOccurred = cs->errorOccurred();

+ cs->removeClient(this);

// NOTE(review): n is dereferenced below without a null check — confirm m_scriptNode is always set here.
+ RefPtr<Node> n = m_scriptNode.release();

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("external script beginning execution at %d\n", m_doc->elapsedTime());

+#endif

+ if (errorOccurred)

+ n->dispatchEventForType(eventNames().errorEvent, true, false);

+ else {

+ if (static_cast<HTMLScriptElement*>(n.get())->shouldExecuteAsJavaScript())

+ m_state = scriptExecution(sourceCode, m_state);

+ n->dispatchEventForType(eventNames().loadEvent, false, false);

+ }

+ // The state of m_pendingScripts.isEmpty() can change inside the scriptExecution()

+ // call above, so test afterwards.

+ finished = m_pendingScripts.isEmpty();

+ if (finished) {

+ ASSERT(!m_hasScriptsWaitingForStylesheets);

+ m_state.setLoadingExtScript(false);

+#ifdef INSTRUMENT_LAYOUT_SCHEDULING

+ if (!m_doc->ownerElement())

+ printf("external script finished execution at %d\n", m_doc->elapsedTime());

+#endif

+ } else if (m_hasScriptsWaitingForStylesheets) {

+ // m_hasScriptsWaitingForStylesheets flag might have changed during the script execution.

+ // If it did we are now blocked waiting for stylesheets and should not execute more scripts until they arrive.

+ finished = true;

+ }

+ // 'm_requestingScript' is true when we are called synchronously from

+ // scriptHandler(). In that case scriptHandler() will take care

+ // of m_pendingSrc.

+ if (!m_requestingScript) {

// Take a copy and clear the member before calling write().
+ SegmentedString rest = m_pendingSrc;

+ m_pendingSrc.clear();

+ write(rest, false);

+ // we might be deleted at this point, do not access any members.

+ }

// Returns true while the tokenizer state is flagged as loading an external script.
+bool HTMLTokenizer::isWaitingForScripts() const

+ return m_state.loadingExtScript();

// Replaces the current input (m_src) with a copy of `source`.
+void HTMLTokenizer::setSrc(const SegmentedString& source)

+ m_src = source;

// Parses `source` into `fragment` using a temporary tokenizer in forced-synchronous
// mode: one write() of the whole string followed by finish().
+void parseHTMLDocumentFragment(const String& source, DocumentFragment* fragment)

+ HTMLTokenizer tok(fragment);

+ tok.setForceSynchronous(true);

+ tok.write(source, true);

+ tok.finish();

+ ASSERT(!tok.processingData()); // make sure we're done (see 3963151)

// Looks up a named entity via findEntity() and returns its code, or 0 when the name is unknown.
// name is passed to strlen(), so it must be a non-null, NUL-terminated string.
+UChar decodeNamedEntity(const char* name)

+ const Entity* e = findEntity(name, strlen(name));

+ return e ? e->code : 0;

« no previous file with comments | « third_party/WebKit/WebCore/html/HTMLParser.cpp ('k') | third_party/WebKit/WebCore/loader/EmptyClients.h » ('j') | no next file with comments »