| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. | 2 * Copyright (C) 2008 Apple Inc. All Rights Reserved. |
| 3 * Copyright (C) 2010 Google, Inc. All Rights Reserved. | 3 * Copyright (C) 2010 Google, Inc. All Rights Reserved. |
| 4 * | 4 * |
| 5 * Redistribution and use in source and binary forms, with or without | 5 * Redistribution and use in source and binary forms, with or without |
| 6 * modification, are permitted provided that the following conditions | 6 * modification, are permitted provided that the following conditions |
| 7 * are met: | 7 * are met: |
| 8 * 1. Redistributions of source code must retain the above copyright | 8 * 1. Redistributions of source code must retain the above copyright |
| 9 * notice, this list of conditions and the following disclaimer. | 9 * notice, this list of conditions and the following disclaimer. |
| 10 * 2. Redistributions in binary form must reproduce the above copyright | 10 * 2. Redistributions in binary form must reproduce the above copyright |
| (...skipping 29 matching lines...) Expand all Loading... |
| 40 static PassOwnPtr<HTMLTokenizer> create() { return adoptPtr(new HTMLTokenizer()); } | 40 static PassOwnPtr<HTMLTokenizer> create() { return adoptPtr(new HTMLTokenizer()); } |
| 41 ~HTMLTokenizer(); | 41 ~HTMLTokenizer(); |
| 42 | 42 |
| 43 void reset(); | 43 void reset(); |
| 44 | 44 |
| 45 enum State { | 45 enum State { |
| 46 DataState, | 46 DataState, |
| 47 CharacterReferenceInDataState, | 47 CharacterReferenceInDataState, |
| 48 RAWTEXTState, | 48 RAWTEXTState, |
| 49 ScriptDataState, | 49 ScriptDataState, |
| 50 PLAINTEXTState, | |
| 51 TagOpenState, | 50 TagOpenState, |
| 52 EndTagOpenState, | 51 EndTagOpenState, |
| 53 TagNameState, | 52 TagNameState, |
| 54 RAWTEXTLessThanSignState, | 53 RAWTEXTLessThanSignState, |
| 55 RAWTEXTEndTagOpenState, | 54 RAWTEXTEndTagOpenState, |
| 56 RAWTEXTEndTagNameState, | 55 RAWTEXTEndTagNameState, |
| 57 ScriptDataLessThanSignState, | 56 ScriptDataLessThanSignState, |
| 58 ScriptDataEndTagOpenState, | 57 ScriptDataEndTagOpenState, |
| 59 ScriptDataEndTagNameState, | 58 ScriptDataEndTagNameState, |
| 60 ScriptDataEscapeStartState, | 59 ScriptDataEscapeStartState, |
| (...skipping 32 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 93 CommentEndDashState, | 92 CommentEndDashState, |
| 94 CommentEndState, | 93 CommentEndState, |
| 95 CommentEndBangState, | 94 CommentEndBangState, |
| 96 }; | 95 }; |
| 97 | 96 |
| 98 // This function returns true if it emits a token. Otherwise, callers | 97 // This function returns true if it emits a token. Otherwise, callers |
| 99 // must provide the same (in progress) token on the next call (unless | 98 // must provide the same (in progress) token on the next call (unless |
| 100 // they call reset() first). | 99 // they call reset() first). |
| 101 bool nextToken(SegmentedString&, HTMLToken&); | 100 bool nextToken(SegmentedString&, HTMLToken&); |
| 102 | 101 |
| 103 // Returns a copy of any characters buffered internally by the tokenizer. | |
| 104 // The tokenizer buffers characters when searching for the </script> token | |
| 105 // that terminates a script element. | |
| 106 String bufferedCharacters() const; | |
| 107 | |
| 108 size_t numberOfBufferedCharacters() const | |
| 109 { | |
| 110 // Notice that we add 2 to the length of the m_temporaryBuffer to | |
| 111 // account for the "</" characters, which are effectively buffered in | |
| 112 // the tokenizer's state machine. | |
| 113 return m_temporaryBuffer.size() ? m_temporaryBuffer.size() + 2 : 0; | |
| 114 } | |
| 115 | |
| 116 // Updates the tokenizer's state according to the given tag name. This is | |
| 117 // an approximation of how the tree builder would update the tokenizer's | |
| 118 // state. This method is useful for approximating HTML tokenization. To | |
| 119 // get exactly the correct tokenization, you need the real tree builder. | |
| 120 // | |
| 121 // The main failures in the approximation are as follows: | |
| 122 // | |
| 123 // * The first set of character tokens emitted for a <pre> element might | |
| 124 // contain an extra leading newline. | |
| 125 // * The replacement of U+0000 with U+FFFD will not be sensitive to the | |
| 126 // tree builder's insertion mode. | |
| 127 // * CDATA sections in foreign content will be tokenized as bogus comments | |
| 128 // instead of as character tokens. | |
| 129 // | |
| 130 void updateStateFor(const String& tagName); | |
| 131 | |
| 132 State state() const { return m_state; } | 102 State state() const { return m_state; } |
| 133 void setState(State state) { m_state = state; } | 103 void setState(State state) { m_state = state; } |
| 134 | 104 |
| 135 private: | 105 private: |
| 136 HTMLTokenizer(); | 106 HTMLTokenizer(); |
| 137 | 107 |
| 138 inline bool processEntity(SegmentedString&); | 108 inline bool processEntity(SegmentedString&); |
| 139 | 109 |
| 140 inline void parseError(); | 110 inline void parseError(); |
| 141 | 111 |
| (...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 217 | 187 |
| 218 // We occasionally want to emit both a character token and an end tag | 188 // We occasionally want to emit both a character token and an end tag |
| 219 // token (e.g., when lexing script). We buffer the name of the end tag | 189 // token (e.g., when lexing script). We buffer the name of the end tag |
| 220 // token here so we remember it next time we re-enter the tokenizer. | 190 // token here so we remember it next time we re-enter the tokenizer. |
| 221 Vector<LChar, 32> m_bufferedEndTagName; | 191 Vector<LChar, 32> m_bufferedEndTagName; |
| 222 }; | 192 }; |
| 223 | 193 |
| 224 } | 194 } |
| 225 | 195 |
| 226 #endif | 196 #endif |
| OLD | NEW |