OLD | NEW |
1 /* | 1 /* |
2 * Copyright (C) 2010 Google Inc. All Rights Reserved. | 2 * Copyright (C) 2010 Google Inc. All Rights Reserved. |
3 * | 3 * |
4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
6 * are met: | 6 * are met: |
7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
(...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
44 , m_assumedCodec(newTextCodec(Latin1Encoding())) | 44 , m_assumedCodec(newTextCodec(Latin1Encoding())) |
45 , m_inHeadSection(true) | 45 , m_inHeadSection(true) |
46 , m_doneChecking(false) | 46 , m_doneChecking(false) |
47 { | 47 { |
48 } | 48 } |
49 | 49 |
// Out-of-line destructor, unchanged between the OLD and NEW columns. The body
// is empty: no explicit cleanup is performed here.
50 HTMLMetaCharsetParser::~HTMLMetaCharsetParser() | 50 HTMLMetaCharsetParser::~HTMLMetaCharsetParser() |
51 { | 51 { |
52 } | 52 } |
53 | 53 |
// Keyword searched for inside a <meta> "content" attribute value.
static const char charsetString[] = "charset";
// Length of charsetString without its terminating NUL. Derived from the array
// itself so the two can never get out of sync.
static const size_t charsetLength = sizeof(charsetString) - 1;
| 56 |
| 57 String HTMLMetaCharsetParser::extractCharset(const String& value) |
| 58 { |
| 59 size_t pos = 0; |
| 60 unsigned length = value.length(); |
| 61 |
| 62 while (pos < length) { |
| 63 pos = value.find(charsetString, pos, false); |
| 64 if (pos == kNotFound) |
| 65 break; |
| 66 |
| 67 pos += charsetLength; |
| 68 |
| 69 // Skip whitespace. |
| 70 while (pos < length && value[pos] <= ' ') |
| 71 ++pos; |
| 72 |
| 73 if (value[pos] != '=') |
| 74 continue; |
| 75 |
| 76 ++pos; |
| 77 |
| 78 while (pos < length && value[pos] <= ' ') |
| 79 ++pos; |
| 80 |
| 81 char quoteMark = 0; |
| 82 if (pos < length && (value[pos] == '"' || value[pos] == '\'')) { |
| 83 quoteMark = static_cast<char>(value[pos++]); |
| 84 ASSERT(!(quoteMark & 0x80)); |
| 85 } |
| 86 |
| 87 if (pos == length) |
| 88 break; |
| 89 |
| 90 unsigned end = pos; |
| 91 while (end < length && ((quoteMark && value[end] != quoteMark) || (!quot
eMark && value[end] > ' ' && value[end] != '"' && value[end] != '\'' && value[en
d] != ';'))) |
| 92 ++end; |
| 93 |
| 94 if (quoteMark && (end == length)) |
| 95 break; // Close quote not found. |
| 96 |
| 97 return value.substring(pos, end - pos); |
| 98 } |
| 99 |
| 100 return ""; |
| 101 } |
| 102 |
// Derives the document encoding from the attributes of the current token
// (m_token).
//
// Each attribute name and value is converted to a String and appended as a
// (name, value) pair to a local list. In the NEW column both go through
// StringImpl::create8BitIfPossible and the list type is AttributeList; the OLD
// column used attemptStaticStringCreation(..., Likely8Bit) for the name and
// HTMLAttributeList for the list. The list is passed to
// encodingFromMetaAttributes() and the result is stored in m_encoding.
//
// Returns true when the stored encoding reports isValid().
54 bool HTMLMetaCharsetParser::processMeta() | 103 bool HTMLMetaCharsetParser::processMeta() |
55 { | 104 { |
56 const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); | 105 const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); |
57 HTMLAttributeList attributes; | 106 AttributeList attributes; |
58 for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin()
; iter != tokenAttributes.end(); ++iter) { | 107 for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin()
; iter != tokenAttributes.end(); ++iter) { |
59 String attributeName = attemptStaticStringCreation(iter->name, Likely8Bi
t); | 108 String attributeName = StringImpl::create8BitIfPossible(iter->name); |
60 String attributeValue = StringImpl::create8BitIfPossible(iter->value); | 109 String attributeValue = StringImpl::create8BitIfPossible(iter->value); |
61 attributes.append(std::make_pair(attributeName, attributeValue)); | 110 attributes.append(std::make_pair(attributeName, attributeValue)); |
62 } | 111 } |
63 | 112 |
64 m_encoding = encodingFromMetaAttributes(attributes); | 113 m_encoding = encodingFromMetaAttributes(attributes); |
65 return m_encoding.isValid(); | 114 return m_encoding.isValid(); |
66 } | 115 } |
67 | 116 |
| 117 WTF::TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const Attrib
uteList& attributes) |
| 118 { |
| 119 bool gotPragma = false; |
| 120 Mode mode = None; |
| 121 String charset; |
| 122 |
| 123 for (AttributeList::const_iterator iter = attributes.begin(); iter != attrib
utes.end(); ++iter) { |
| 124 const AtomicString& attributeName = AtomicString(iter->first); |
| 125 const String& attributeValue = iter->second; |
| 126 |
| 127 if (attributeName == http_equivAttr) { |
| 128 if (equalIgnoringCase(attributeValue, "content-type")) |
| 129 gotPragma = true; |
| 130 } else if (charset.isEmpty()) { |
| 131 if (attributeName == charsetAttr) { |
| 132 charset = attributeValue; |
| 133 mode = Charset; |
| 134 } else if (attributeName == contentAttr) { |
| 135 charset = extractCharset(attributeValue); |
| 136 if (charset.length()) |
| 137 mode = Pragma; |
| 138 } |
| 139 } |
| 140 } |
| 141 |
| 142 if (mode == Charset || (mode == Pragma && gotPragma)) |
| 143 return WTF::TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset)); |
| 144 |
| 145 return WTF::TextEncoding(); |
| 146 } |
| 147 |
68 static const int bytesToCheckUnconditionally = 1024; // That many input bytes wi
ll be checked for meta charset even if <head> section is over. | 148 static const int bytesToCheckUnconditionally = 1024; // That many input bytes wi
ll be checked for meta charset even if <head> section is over. |
69 | 149 |
70 bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) | 150 bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) |
71 { | 151 { |
72 if (m_doneChecking) | 152 if (m_doneChecking) |
73 return true; | 153 return true; |
74 | 154 |
75 ASSERT(!m_encoding.isValid()); | 155 ASSERT(!m_encoding.isValid()); |
76 | 156 |
77 // We still don't have an encoding, and are in the head. | 157 // We still don't have an encoding, and are in the head. |
(...skipping 12 matching lines...) Expand all Loading... |
90 | 170 |
91 // Since many sites have charset declarations after <body> or other tags | 171 // Since many sites have charset declarations after <body> or other tags |
92 // that are disallowed in <head>, we don't bail out until we've checked at | 172 // that are disallowed in <head>, we don't bail out until we've checked at |
93 // least bytesToCheckUnconditionally bytes of input. | 173 // least bytesToCheckUnconditionally bytes of input. |
94 | 174 |
95 m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); | 175 m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); |
96 | 176 |
97 while (m_tokenizer->nextToken(m_input, m_token)) { | 177 while (m_tokenizer->nextToken(m_input, m_token)) { |
98 bool end = m_token.type() == HTMLToken::EndTag; | 178 bool end = m_token.type() == HTMLToken::EndTag; |
99 if (end || m_token.type() == HTMLToken::StartTag) { | 179 if (end || m_token.type() == HTMLToken::StartTag) { |
100 String tagName = attemptStaticStringCreation(m_token.name(), Likely8
Bit); | 180 AtomicString tagName(m_token.name()); |
101 if (!end) { | 181 if (!end) { |
102 m_tokenizer->updateStateFor(tagName); | 182 m_tokenizer->updateStateFor(tagName); |
103 if (threadSafeMatch(tagName, metaTag) && processMeta()) { | 183 if (tagName == metaTag && processMeta()) { |
104 m_doneChecking = true; | 184 m_doneChecking = true; |
105 return true; | 185 return true; |
106 } | 186 } |
107 } | 187 } |
108 | 188 |
109 if (!threadSafeMatch(tagName, scriptTag) && !threadSafeMatch(tagName
, noscriptTag) | 189 if (tagName != scriptTag && tagName != noscriptTag |
110 && !threadSafeMatch(tagName, styleTag) && !threadSafeMatch(tagNa
me, linkTag) | 190 && tagName != styleTag && tagName != linkTag |
111 && !threadSafeMatch(tagName, metaTag) && !threadSafeMatch(tagNam
e, objectTag) | 191 && tagName != metaTag && tagName != objectTag |
112 && !threadSafeMatch(tagName, titleTag) && !threadSafeMatch(tagNa
me, baseTag) | 192 && tagName != titleTag && tagName != baseTag |
113 && (end || !threadSafeMatch(tagName, htmlTag)) && (end || !threa
dSafeMatch(tagName, headTag))) { | 193 && (end || tagName != htmlTag) && (end || tagName != headTag)) { |
114 m_inHeadSection = false; | 194 m_inHeadSection = false; |
115 } | 195 } |
116 } | 196 } |
117 | 197 |
118 if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToC
heckUnconditionally) { | 198 if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToC
heckUnconditionally) { |
119 m_doneChecking = true; | 199 m_doneChecking = true; |
120 return true; | 200 return true; |
121 } | 201 } |
122 | 202 |
123 m_token.clear(); | 203 m_token.clear(); |
124 } | 204 } |
125 | 205 |
126 return false; | 206 return false; |
127 } | 207 } |
128 | 208 |
129 } | 209 } |
OLD | NEW |