OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2010 Google Inc. All Rights Reserved. | 2 * Copyright (C) 2010 Google Inc. All Rights Reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
44 , m_assumedCodec(newTextCodec(Latin1Encoding())) | 44 , m_assumedCodec(newTextCodec(Latin1Encoding())) |
45 , m_inHeadSection(true) | 45 , m_inHeadSection(true) |
46 , m_doneChecking(false) | 46 , m_doneChecking(false) |
47 { | 47 { |
48 } | 48 } |
49 | 49 |
// Destructor. The body is empty and identical in the OLD and NEW columns.
50 HTMLMetaCharsetParser::~HTMLMetaCharsetParser() | 50 HTMLMetaCharsetParser::~HTMLMetaCharsetParser() |
51 { | 51 { |
52 } | 52 } |
53 | 53 |
// Search key used by extractCharset(), and its length in characters.
// sizeof counts the terminating NUL, hence the subtraction of one.
// These two constants appear in the OLD column only.
54 static const char charsetString[] = "charset"; | |
55 static const size_t charsetLength = sizeof("charset") - 1; | |
56 | |
// Extracts a charset name from |value| (OLD column only; the NEW column has no
// counterpart for these lines in this view).
//
// Repeatedly searches for charsetString, then expects optional whitespace, an
// equals sign, optional whitespace and an optionally quoted token.
//
// Returns the token text (without quotes) for the first occurrence that parses.
// Returns an empty string when no occurrence is found, when nothing follows the
// equals sign, or when an opening quote is never closed.
// NOTE(review): the third argument to find() (false) presumably selects
// case-insensitive matching -- confirm against String::find.
57 String HTMLMetaCharsetParser::extractCharset(const String& value) | |
58 { | |
59 size_t pos = 0; | |
60 unsigned length = value.length(); | |
61 | |
62 while (pos < length) { | |
63 pos = value.find(charsetString, pos, false); | |
64 if (pos == kNotFound) | |
65 break; | |
66 | |
// Step over the matched key itself.
67 pos += charsetLength; | |
68 | |
69 // Skip whitespace. | |
70 while (pos < length && value[pos] <= ' ') | |
71 ++pos; | |
72 | |
// Not followed by an equals sign: resume the search from the current position.
// NOTE(review): pos may equal length here, so this read relies on
// String::operator[] tolerating an out-of-range index -- confirm.
73 if (value[pos] != '=') | |
74 continue; | |
75 | |
76 ++pos; | |
77 | |
// Skip whitespace (any character at or below the space character) after the equals sign.
78 while (pos < length && value[pos] <= ' ') | |
79 ++pos; | |
80 | |
// Remember an opening single or double quote, if present, and step past it.
81 char quoteMark = 0; | |
82 if (pos < length && (value[pos] == '"' || value[pos] == '\'')) { | |
83 quoteMark = static_cast<char>(value[pos++]); | |
84 ASSERT(!(quoteMark & 0x80)); | |
85 } | |
86 | |
// Nothing left after the equals sign (and optional quote): give up.
87 if (pos == length) | |
88 break; | |
89 | |
// Find the end of the token. Quoted: scan to the matching quote mark.
// Unquoted: stop at whitespace, either quote character, or a semicolon.
90 unsigned end = pos; | |
91 while (end < length && ((quoteMark && value[end] != quoteMark) || (!quot
eMark && value[end] > ' ' && value[end] != '"' && value[end] != '\'' && value[en
d] != ';'))) | |
92 ++end; | |
93 | |
94 if (quoteMark && (end == length)) | |
95 break; // Close quote not found. | |
96 | |
97 return value.substring(pos, end - pos); | |
98 } | |
99 | |
100 return ""; | |
101 } | |
102 | |
// Processes the attributes of the current token (m_token).
//
// Copies each attribute of m_token into a list of (name, value) String pairs,
// passes that list to encodingFromMetaAttributes(), stores the result in
// m_encoding and returns whether that encoding is valid.
//
// Differences between the columns: the list type is AttributeList (OLD) versus
// HTMLAttributeList (NEW), and the attribute name is built with
// StringImpl::create8BitIfPossible (OLD) versus
// attemptStaticStringCreation(..., Likely8Bit) (NEW). The attribute value is
// built with StringImpl::create8BitIfPossible in both columns.
103 bool HTMLMetaCharsetParser::processMeta() | 54 bool HTMLMetaCharsetParser::processMeta() |
104 { | 55 { |
105 const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); | 56 const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); |
106 AttributeList attributes; | 57 HTMLAttributeList attributes; |
107 for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin()
; iter != tokenAttributes.end(); ++iter) { | 58 for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin()
; iter != tokenAttributes.end(); ++iter) { |
108 String attributeName = StringImpl::create8BitIfPossible(iter->name); | 59 String attributeName = attemptStaticStringCreation(iter->name, Likely8Bi
t); |
109 String attributeValue = StringImpl::create8BitIfPossible(iter->value); | 60 String attributeValue = StringImpl::create8BitIfPossible(iter->value); |
110 attributes.append(std::make_pair(attributeName, attributeValue)); | 61 attributes.append(std::make_pair(attributeName, attributeValue)); |
111 } | 62 } |
112 | 63 |
113 m_encoding = encodingFromMetaAttributes(attributes); | 64 m_encoding = encodingFromMetaAttributes(attributes); |
114 return m_encoding.isValid(); | 65 return m_encoding.isValid(); |
115 } | 66 } |
116 | 67 |
// Derives a text encoding from a list of (name, value) attribute pairs
// (OLD column only; the NEW column has no counterpart for these lines in this view).
//
// - An http-equiv attribute whose value equals content-type (compared with
//   equalIgnoringCase) sets gotPragma.
// - While no charset has been captured yet, a charset attribute supplies it
//   directly (mode Charset), and a content attribute is run through
//   extractCharset(); a non-empty result sets mode Pragma.
// - Once a non-empty charset has been captured, later charset and content
//   attributes are ignored. http-equiv is still examined for every attribute,
//   so it may appear before or after the content attribute.
//
// Returns a TextEncoding built from the captured charset, with leading and
// trailing HTML spaces stripped, when the mode is Charset, or when it is Pragma
// and gotPragma is set. Otherwise returns a default-constructed TextEncoding.
117 WTF::TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const Attrib
uteList& attributes) | |
118 { | |
119 bool gotPragma = false; | |
120 Mode mode = None; | |
121 String charset; | |
122 | |
123 for (AttributeList::const_iterator iter = attributes.begin(); iter != attrib
utes.end(); ++iter) { | |
124 const AtomicString& attributeName = AtomicString(iter->first); | |
125 const String& attributeValue = iter->second; | |
126 | |
127 if (attributeName == http_equivAttr) { | |
128 if (equalIgnoringCase(attributeValue, "content-type")) | |
129 gotPragma = true; | |
130 } else if (charset.isEmpty()) { | |
131 if (attributeName == charsetAttr) { | |
132 charset = attributeValue; | |
133 mode = Charset; | |
134 } else if (attributeName == contentAttr) { | |
135 charset = extractCharset(attributeValue); | |
136 if (charset.length()) | |
137 mode = Pragma; | |
138 } | |
139 } | |
140 } | |
141 | |
// A charset taken from a content attribute only counts when the http-equiv pragma was also seen.
142 if (mode == Charset || (mode == Pragma && gotPragma)) | |
143 return WTF::TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset)); | |
144 | |
145 return WTF::TextEncoding(); | |
146 } | |
147 | |
// Threshold compared against m_input.numberOfCharactersConsumed() in
// checkForMetaCharset(): scanning stops only once the head section is
// considered over AND at least this many characters have been consumed.
// The definition is identical in the OLD and NEW columns.
148 static const int bytesToCheckUnconditionally = 1024; // That many input bytes wi
ll be checked for meta charset even if <head> section is over. | 68 static const int bytesToCheckUnconditionally = 1024; // That many input bytes wi
ll be checked for meta charset even if <head> section is over. |
149 | 69 |
150 bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) | 70 bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) |
151 { | 71 { |
152 if (m_doneChecking) | 72 if (m_doneChecking) |
153 return true; | 73 return true; |
154 | 74 |
155 ASSERT(!m_encoding.isValid()); | 75 ASSERT(!m_encoding.isValid()); |
156 | 76 |
157 // We still don't have an encoding, and are in the head. | 77 // We still don't have an encoding, and are in the head. |
(...skipping 12 matching lines...) Expand all Loading... |
170 | 90 |
171 // Since many sites have charset declarations after <body> or other tags | 91 // Since many sites have charset declarations after <body> or other tags |
172 // that are disallowed in <head>, we don't bail out until we've checked at | 92 // that are disallowed in <head>, we don't bail out until we've checked at |
173 // least bytesToCheckUnconditionally bytes of input. | 93 // least bytesToCheckUnconditionally bytes of input. |
174 | 94 |
175 m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); | 95 m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); |
176 | 96 |
177 while (m_tokenizer->nextToken(m_input, m_token)) { | 97 while (m_tokenizer->nextToken(m_input, m_token)) { |
178 bool end = m_token.type() == HTMLToken::EndTag; | 98 bool end = m_token.type() == HTMLToken::EndTag; |
179 if (end || m_token.type() == HTMLToken::StartTag) { | 99 if (end || m_token.type() == HTMLToken::StartTag) { |
180 AtomicString tagName(m_token.name()); | 100 String tagName = attemptStaticStringCreation(m_token.name(), Likely8
Bit); |
181 if (!end) { | 101 if (!end) { |
182 m_tokenizer->updateStateFor(tagName); | 102 m_tokenizer->updateStateFor(tagName); |
183 if (tagName == metaTag && processMeta()) { | 103 if (threadSafeMatch(tagName, metaTag) && processMeta()) { |
184 m_doneChecking = true; | 104 m_doneChecking = true; |
185 return true; | 105 return true; |
186 } | 106 } |
187 } | 107 } |
188 | 108 |
189 if (tagName != scriptTag && tagName != noscriptTag | 109 if (!threadSafeMatch(tagName, scriptTag) && !threadSafeMatch(tagName
, noscriptTag) |
190 && tagName != styleTag && tagName != linkTag | 110 && !threadSafeMatch(tagName, styleTag) && !threadSafeMatch(tagNa
me, linkTag) |
191 && tagName != metaTag && tagName != objectTag | 111 && !threadSafeMatch(tagName, metaTag) && !threadSafeMatch(tagNam
e, objectTag) |
192 && tagName != titleTag && tagName != baseTag | 112 && !threadSafeMatch(tagName, titleTag) && !threadSafeMatch(tagNa
me, baseTag) |
193 && (end || tagName != htmlTag) && (end || tagName != headTag)) { | 113 && (end || !threadSafeMatch(tagName, htmlTag)) && (end || !threa
dSafeMatch(tagName, headTag))) { |
194 m_inHeadSection = false; | 114 m_inHeadSection = false; |
195 } | 115 } |
196 } | 116 } |
197 | 117 |
198 if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToC
heckUnconditionally) { | 118 if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToC
heckUnconditionally) { |
199 m_doneChecking = true; | 119 m_doneChecking = true; |
200 return true; | 120 return true; |
201 } | 121 } |
202 | 122 |
203 m_token.clear(); | 123 m_token.clear(); |
204 } | 124 } |
205 | 125 |
206 return false; | 126 return false; |
207 } | 127 } |
208 | 128 |
209 } | 129 } |
OLD | NEW |