| OLD | NEW |
| 1 /* | 1 /* |
| 2 * Copyright (C) 2010 Google Inc. All Rights Reserved. | 2 * Copyright (C) 2010 Google Inc. All Rights Reserved. |
| 3 * | 3 * |
| 4 * Redistribution and use in source and binary forms, with or without | 4 * Redistribution and use in source and binary forms, with or without |
| 5 * modification, are permitted provided that the following conditions | 5 * modification, are permitted provided that the following conditions |
| 6 * are met: | 6 * are met: |
| 7 * 1. Redistributions of source code must retain the above copyright | 7 * 1. Redistributions of source code must retain the above copyright |
| 8 * notice, this list of conditions and the following disclaimer. | 8 * notice, this list of conditions and the following disclaimer. |
| 9 * 2. Redistributions in binary form must reproduce the above copyright | 9 * 2. Redistributions in binary form must reproduce the above copyright |
| 10 * notice, this list of conditions and the following disclaimer in the | 10 * notice, this list of conditions and the following disclaimer in the |
| (...skipping 33 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 44 , m_assumedCodec(newTextCodec(Latin1Encoding())) | 44 , m_assumedCodec(newTextCodec(Latin1Encoding())) |
| 45 , m_inHeadSection(true) | 45 , m_inHeadSection(true) |
| 46 , m_doneChecking(false) | 46 , m_doneChecking(false) |
| 47 { | 47 { |
| 48 } | 48 } |
| 49 | 49 |
| 50 HTMLMetaCharsetParser::~HTMLMetaCharsetParser() | 50 HTMLMetaCharsetParser::~HTMLMetaCharsetParser() |
| 51 { | 51 { |
| 52 } | 52 } |
| 53 | 53 |
| 54 static const char charsetString[] = "charset"; | |
| 55 static const size_t charsetLength = sizeof("charset") - 1; | |
| 56 | |
| 57 String HTMLMetaCharsetParser::extractCharset(const String& value) | |
| 58 { | |
| 59 size_t pos = 0; | |
| 60 unsigned length = value.length(); | |
| 61 | |
| 62 while (pos < length) { | |
| 63 pos = value.find(charsetString, pos, false); | |
| 64 if (pos == kNotFound) | |
| 65 break; | |
| 66 | |
| 67 pos += charsetLength; | |
| 68 | |
| 69 // Skip whitespace. | |
| 70 while (pos < length && value[pos] <= ' ') | |
| 71 ++pos; | |
| 72 | |
| 73 if (value[pos] != '=') | |
| 74 continue; | |
| 75 | |
| 76 ++pos; | |
| 77 | |
| 78 while (pos < length && value[pos] <= ' ') | |
| 79 ++pos; | |
| 80 | |
| 81 char quoteMark = 0; | |
| 82 if (pos < length && (value[pos] == '"' || value[pos] == '\'')) { | |
| 83 quoteMark = static_cast<char>(value[pos++]); | |
| 84 ASSERT(!(quoteMark & 0x80)); | |
| 85 } | |
| 86 | |
| 87 if (pos == length) | |
| 88 break; | |
| 89 | |
| 90 unsigned end = pos; | |
| 91 while (end < length && ((quoteMark && value[end] != quoteMark) || (!quoteMark && value[end] > ' ' && value[end] != '"' && value[end] != '\'' && value[end] != ';'))) | |
| 92 ++end; | |
| 93 | |
| 94 if (quoteMark && (end == length)) | |
| 95 break; // Close quote not found. | |
| 96 | |
| 97 return value.substring(pos, end - pos); | |
| 98 } | |
| 99 | |
| 100 return ""; | |
| 101 } | |
| 102 | |
| 103 bool HTMLMetaCharsetParser::processMeta() | 54 bool HTMLMetaCharsetParser::processMeta() |
| 104 { | 55 { |
| 105 const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); | 56 const HTMLToken::AttributeList& tokenAttributes = m_token.attributes(); |
| 106 AttributeList attributes; | 57 HTMLAttributeList attributes; |
| 107 for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin(); iter != tokenAttributes.end(); ++iter) { | 58 for (HTMLToken::AttributeList::const_iterator iter = tokenAttributes.begin(); iter != tokenAttributes.end(); ++iter) { |
| 108 String attributeName = StringImpl::create8BitIfPossible(iter->name); | 59 String attributeName = attemptStaticStringCreation(iter->name, Likely8Bit); |
| 109 String attributeValue = StringImpl::create8BitIfPossible(iter->value); | 60 String attributeValue = StringImpl::create8BitIfPossible(iter->value); |
| 110 attributes.append(std::make_pair(attributeName, attributeValue)); | 61 attributes.append(std::make_pair(attributeName, attributeValue)); |
| 111 } | 62 } |
| 112 | 63 |
| 113 m_encoding = encodingFromMetaAttributes(attributes); | 64 m_encoding = encodingFromMetaAttributes(attributes); |
| 114 return m_encoding.isValid(); | 65 return m_encoding.isValid(); |
| 115 } | 66 } |
| 116 | 67 |
| 117 WTF::TextEncoding HTMLMetaCharsetParser::encodingFromMetaAttributes(const AttributeList& attributes) | |
| 118 { | |
| 119 bool gotPragma = false; | |
| 120 Mode mode = None; | |
| 121 String charset; | |
| 122 | |
| 123 for (AttributeList::const_iterator iter = attributes.begin(); iter != attributes.end(); ++iter) { | |
| 124 const AtomicString& attributeName = iter->first; | |
| 125 const String& attributeValue = iter->second; | |
| 126 | |
| 127 if (attributeName == http_equivAttr) { | |
| 128 if (equalIgnoringCase(attributeValue, "content-type")) | |
| 129 gotPragma = true; | |
| 130 } else if (charset.isEmpty()) { | |
| 131 if (attributeName == charsetAttr) { | |
| 132 charset = attributeValue; | |
| 133 mode = Charset; | |
| 134 } else if (attributeName == contentAttr) { | |
| 135 charset = extractCharset(attributeValue); | |
| 136 if (charset.length()) | |
| 137 mode = Pragma; | |
| 138 } | |
| 139 } | |
| 140 } | |
| 141 | |
| 142 if (mode == Charset || (mode == Pragma && gotPragma)) | |
| 143 return WTF::TextEncoding(stripLeadingAndTrailingHTMLSpaces(charset)); | |
| 144 | |
| 145 return WTF::TextEncoding(); | |
| 146 } | |
| 147 | |
| 148 static const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over. | 68 static const int bytesToCheckUnconditionally = 1024; // That many input bytes will be checked for meta charset even if <head> section is over. |
| 149 | 69 |
| 150 bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) | 70 bool HTMLMetaCharsetParser::checkForMetaCharset(const char* data, size_t length) |
| 151 { | 71 { |
| 152 if (m_doneChecking) | 72 if (m_doneChecking) |
| 153 return true; | 73 return true; |
| 154 | 74 |
| 155 ASSERT(!m_encoding.isValid()); | 75 ASSERT(!m_encoding.isValid()); |
| 156 | 76 |
| 157 // We still don't have an encoding, and are in the head. | 77 // We still don't have an encoding, and are in the head. |
| (...skipping 12 matching lines...) Expand all Loading... |
| 170 | 90 |
| 171 // Since many sites have charset declarations after <body> or other tags | 91 // Since many sites have charset declarations after <body> or other tags |
| 172 // that are disallowed in <head>, we don't bail out until we've checked at | 92 // that are disallowed in <head>, we don't bail out until we've checked at |
| 173 // least bytesToCheckUnconditionally bytes of input. | 93 // least bytesToCheckUnconditionally bytes of input. |
| 174 | 94 |
| 175 m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); | 95 m_input.append(SegmentedString(m_assumedCodec->decode(data, length))); |
| 176 | 96 |
| 177 while (m_tokenizer->nextToken(m_input, m_token)) { | 97 while (m_tokenizer->nextToken(m_input, m_token)) { |
| 178 bool end = m_token.type() == HTMLToken::EndTag; | 98 bool end = m_token.type() == HTMLToken::EndTag; |
| 179 if (end || m_token.type() == HTMLToken::StartTag) { | 99 if (end || m_token.type() == HTMLToken::StartTag) { |
| 180 AtomicString tagName(m_token.name()); | 100 String tagName = attemptStaticStringCreation(m_token.name(), Likely8Bit); |
| 181 if (!end) { | 101 if (!end) { |
| 182 m_tokenizer->updateStateFor(tagName); | 102 m_tokenizer->updateStateFor(tagName); |
| 183 if (tagName == metaTag && processMeta()) { | 103 if (threadSafeMatch(tagName, metaTag) && processMeta()) { |
| 184 m_doneChecking = true; | 104 m_doneChecking = true; |
| 185 return true; | 105 return true; |
| 186 } | 106 } |
| 187 } | 107 } |
| 188 | 108 |
| 189 if (tagName != scriptTag && tagName != noscriptTag | 109 if (!threadSafeMatch(tagName, scriptTag) && !threadSafeMatch(tagName, noscriptTag) |
| 190 && tagName != styleTag && tagName != linkTag | 110 && !threadSafeMatch(tagName, styleTag) && !threadSafeMatch(tagName, linkTag) |
| 191 && tagName != metaTag && tagName != objectTag | 111 && !threadSafeMatch(tagName, metaTag) && !threadSafeMatch(tagName, objectTag) |
| 192 && tagName != titleTag && tagName != baseTag | 112 && !threadSafeMatch(tagName, titleTag) && !threadSafeMatch(tagName, baseTag) |
| 193 && (end || tagName != htmlTag) && (end || tagName != headTag)) { | 113 && (end || !threadSafeMatch(tagName, htmlTag)) && (end || !threadSafeMatch(tagName, headTag))) { |
| 194 m_inHeadSection = false; | 114 m_inHeadSection = false; |
| 195 } | 115 } |
| 196 } | 116 } |
| 197 | 117 |
| 198 if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToCheckUnconditionally) { | 118 if (!m_inHeadSection && m_input.numberOfCharactersConsumed() >= bytesToCheckUnconditionally) { |
| 199 m_doneChecking = true; | 119 m_doneChecking = true; |
| 200 return true; | 120 return true; |
| 201 } | 121 } |
| 202 | 122 |
| 203 m_token.clear(); | 123 m_token.clear(); |
| 204 } | 124 } |
| 205 | 125 |
| 206 return false; | 126 return false; |
| 207 } | 127 } |
| 208 | 128 |
| 209 } | 129 } |
| OLD | NEW |