| OLD | NEW |
| (Empty) |
| 1 /* | |
| 2 * Copyright (C) 2009 Google Inc. All rights reserved. | |
| 3 * | |
| 4 * Redistribution and use in source and binary forms, with or without | |
| 5 * modification, are permitted provided that the following conditions are | |
| 6 * met: | |
| 7 * | |
| 8 * * Redistributions of source code must retain the above copyright | |
| 9 * notice, this list of conditions and the following disclaimer. | |
| 10 * * Redistributions in binary form must reproduce the above | |
| 11 * copyright notice, this list of conditions and the following disclaimer | |
| 12 * in the documentation and/or other materials provided with the | |
| 13 * distribution. | |
| 14 * * Neither the name of Google Inc. nor the names of its | |
| 15 * contributors may be used to endorse or promote products derived from | |
| 16 * this software without specific prior written permission. | |
| 17 * | |
| 18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
| 19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
| 20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
| 21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
| 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
| 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
| 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
| 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
| 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
| 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| 29 */ | |
| 30 | |
| 31 // How we handle the base tag better. | |
| 32 // Current status: | |
| 33 // Currently, the normal way we handle the base tag is: | |
| 34 // a) For those links which have corresponding local saved files, such as | |
| 35 // savable CSS, JavaScript files, they will be written to relative URLs which | |
| 36 // point to local saved file. Why those links can not be resolved as absolute | |
| 37 // file URLs, because if they are resolved as absolute URLs, after moving the | |
| 38 // file location from one directory to another directory, the file URLs will | |
| 39 // be dead links. | |
| 40 // b) For those links which have not corresponding local saved files, such as | |
| 41 // links in A, AREA tags, they will be resolved as absolute URLs. | |
| 42 // c) We comment all base tags when serializing DOM for the page. | |
| 43 // FireFox also uses above way to handle base tag. | |
| 44 // | |
| 45 // Problem: | |
| 46 // This way can not handle the following situation: | |
| 47 // the base tag is written by JavaScript. | |
| 48 // For example. The page "www.yahoo.com" use | |
| 49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL | |
| 50 // of page when loading page. So when saving page as completed-HTML, we assume | |
| 51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved | |
| 52 // completed-HTML page, then the JavaScript will insert a base tag | |
| 53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to | |
| 54 // local saved resource files will be resolved as | |
| 55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource | |
| 56 // files can not be loaded correctly. Also the page will be rendered ugly since | |
| 57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame | |
| 58 // files can not be fetched. | |
| 59 // Now FireFox, IE and WebKit based Browser all have this problem. | |
| 60 // | |
| 61 // Solution: | |
| 62 // My solution is that we comment old base tag and write new base tag: | |
| 63 // <base href="." ...> after the previous commented base tag. In WebKit, it | |
| 64 // always uses the latest "href" attribute of base tag to set document's base | |
| 65 // URL. Based on this behavior, when we encounter a base tag, we comment it and | |
| 66 // write a new base tag <base href="."> after the previous commented base tag. | |
| 67 // The new added base tag can help engine to locate correct base URL for | |
| 68 // correctly loading local saved resource files. Also I think we need to inherit | |
| 69 // the base target value from document object when appending new base tag. | |
| 70 // If there are multiple base tags in original document, we will comment all old | |
| 71 // base tags and append new base tag after each old base tag because we do not | |
| 72 // know those old base tags are original content or added by JavaScript. If | |
| 73 // they are added by JavaScript, it means when loading saved page, the script(s) | |
| 74 // will still insert base tag(s) to DOM, so the new added base tag(s) can | |
| 75 // override the incorrect base URL and make sure we always load correct local | |
| 76 // saved resource files. | |
| 77 | |
| 78 #include "web/WebPageSerializerImpl.h" | |
| 79 | |
| 80 #include "core/HTMLNames.h" | |
| 81 #include "core/dom/Document.h" | |
| 82 #include "core/dom/DocumentType.h" | |
| 83 #include "core/dom/Element.h" | |
| 84 #include "core/editing/serializers/Serialization.h" | |
| 85 #include "core/html/HTMLAllCollection.h" | |
| 86 #include "core/html/HTMLElement.h" | |
| 87 #include "core/html/HTMLFormElement.h" | |
| 88 #include "core/html/HTMLHtmlElement.h" | |
| 89 #include "core/html/HTMLMetaElement.h" | |
| 90 #include "core/loader/DocumentLoader.h" | |
| 91 #include "core/loader/FrameLoader.h" | |
| 92 #include "core/page/PageSerializer.h" | |
| 93 #include "public/platform/WebVector.h" | |
| 94 #include "web/WebLocalFrameImpl.h" | |
| 95 #include "wtf/text/TextEncoding.h" | |
| 96 | |
| 97 namespace blink { | |
| 98 | |
// Soft upper bound on the length of the buffer that temporarily holds
// generated HTML content before it is flushed. The buffer may grow past this
// limit when a single very large contiguous string is found in the page.
static const unsigned dataBufferCapacity = 64 * 1024;
| 103 | |
| 104 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url, | |
| 105 const WTF::TextEncod
ing& textEncoding, | |
| 106 Document* document) | |
| 107 : url(url) | |
| 108 , textEncoding(textEncoding) | |
| 109 , document(document) | |
| 110 , isHTMLDocument(document->isHTMLDocument()) | |
| 111 , haveSeenDocType(false) | |
| 112 , haveAddedCharsetDeclaration(false) | |
| 113 , skipMetaElement(nullptr) | |
| 114 , isInScriptOrStyleTag(false) | |
| 115 , haveAddedXMLProcessingDirective(false) | |
| 116 , haveAddedContentsBeforeEnd(false) | |
| 117 { | |
| 118 } | |
| 119 | |
| 120 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( | |
| 121 const Element* element, SerializeDomParam* param, bool* needSkip) | |
| 122 { | |
| 123 StringBuilder result; | |
| 124 | |
| 125 *needSkip = false; | |
| 126 if (param->isHTMLDocument) { | |
| 127 // Skip the open tag of original META tag which declare charset since we | |
| 128 // have overrided the META which have correct charset declaration after | |
| 129 // serializing open tag of HEAD element. | |
| 130 ASSERT(element); | |
| 131 if (isHTMLMetaElement(element) && toHTMLMetaElement(element)->computeEnc
oding().isValid()) { | |
| 132 // Found META tag declared charset, we need to skip it when | |
| 133 // serializing DOM. | |
| 134 param->skipMetaElement = element; | |
| 135 *needSkip = true; | |
| 136 } else if (isHTMLHtmlElement(*element)) { | |
| 137 // Check something before processing the open tag of HEAD element. | |
| 138 // First we add doc type declaration if original document has it. | |
| 139 if (!param->haveSeenDocType) { | |
| 140 param->haveSeenDocType = true; | |
| 141 result.append(createMarkup(param->document->doctype())); | |
| 142 } | |
| 143 | |
| 144 // Add MOTW declaration before html tag. | |
| 145 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx
. | |
| 146 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(par
am->url)); | |
| 147 } else if (isHTMLBaseElement(*element)) { | |
| 148 // Comment the BASE tag when serializing dom. | |
| 149 result.appendLiteral("<!--"); | |
| 150 } | |
| 151 } else { | |
| 152 // Write XML declaration. | |
| 153 if (!param->haveAddedXMLProcessingDirective) { | |
| 154 param->haveAddedXMLProcessingDirective = true; | |
| 155 // Get encoding info. | |
| 156 String xmlEncoding = param->document->xmlEncoding(); | |
| 157 if (xmlEncoding.isEmpty()) | |
| 158 xmlEncoding = param->document->encodingName(); | |
| 159 if (xmlEncoding.isEmpty()) | |
| 160 xmlEncoding = UTF8Encoding().name(); | |
| 161 result.appendLiteral("<?xml version=\""); | |
| 162 result.append(param->document->xmlVersion()); | |
| 163 result.appendLiteral("\" encoding=\""); | |
| 164 result.append(xmlEncoding); | |
| 165 if (param->document->xmlStandalone()) | |
| 166 result.appendLiteral("\" standalone=\"yes"); | |
| 167 result.appendLiteral("\"?>\n"); | |
| 168 } | |
| 169 // Add doc type declaration if original document has it. | |
| 170 if (!param->haveSeenDocType) { | |
| 171 param->haveSeenDocType = true; | |
| 172 result.append(createMarkup(param->document->doctype())); | |
| 173 } | |
| 174 } | |
| 175 return result.toString(); | |
| 176 } | |
| 177 | |
| 178 String WebPageSerializerImpl::postActionAfterSerializeOpenTag( | |
| 179 const Element* element, SerializeDomParam* param) | |
| 180 { | |
| 181 StringBuilder result; | |
| 182 | |
| 183 param->haveAddedContentsBeforeEnd = false; | |
| 184 if (!param->isHTMLDocument) | |
| 185 return result.toString(); | |
| 186 // Check after processing the open tag of HEAD element | |
| 187 if (!param->haveAddedCharsetDeclaration | |
| 188 && isHTMLHeadElement(*element)) { | |
| 189 param->haveAddedCharsetDeclaration = true; | |
| 190 // Check meta element. WebKit only pre-parse the first 512 bytes | |
| 191 // of the document. If the whole <HEAD> is larger and meta is the | |
| 192 // end of head part, then this kind of pages aren't decoded correctly | |
| 193 // because of this issue. So when we serialize the DOM, we need to | |
| 194 // make sure the meta will in first child of head tag. | |
| 195 // See http://bugs.webkit.org/show_bug.cgi?id=16621. | |
| 196 // First we generate new content for writing correct META element. | |
| 197 result.append(WebPageSerializer::generateMetaCharsetDeclaration( | |
| 198 String(param->textEncoding.name()))); | |
| 199 | |
| 200 param->haveAddedContentsBeforeEnd = true; | |
| 201 // Will search each META which has charset declaration, and skip them al
l | |
| 202 // in PreActionBeforeSerializeOpenTag. | |
| 203 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) { | |
| 204 param->isInScriptOrStyleTag = true; | |
| 205 } | |
| 206 | |
| 207 return result.toString(); | |
| 208 } | |
| 209 | |
| 210 String WebPageSerializerImpl::preActionBeforeSerializeEndTag( | |
| 211 const Element* element, SerializeDomParam* param, bool* needSkip) | |
| 212 { | |
| 213 String result; | |
| 214 | |
| 215 *needSkip = false; | |
| 216 if (!param->isHTMLDocument) | |
| 217 return result; | |
| 218 // Skip the end tag of original META tag which declare charset. | |
| 219 // Need not to check whether it's META tag since we guarantee | |
| 220 // skipMetaElement is definitely META tag if it's not 0. | |
| 221 if (param->skipMetaElement == element) { | |
| 222 *needSkip = true; | |
| 223 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) { | |
| 224 ASSERT(param->isInScriptOrStyleTag); | |
| 225 param->isInScriptOrStyleTag = false; | |
| 226 } | |
| 227 | |
| 228 return result; | |
| 229 } | |
| 230 | |
| 231 // After we finish serializing end tag of a element, we give the target | |
| 232 // element a chance to do some post work to add some additional data. | |
| 233 String WebPageSerializerImpl::postActionAfterSerializeEndTag( | |
| 234 const Element* element, SerializeDomParam* param) | |
| 235 { | |
| 236 StringBuilder result; | |
| 237 | |
| 238 if (!param->isHTMLDocument) | |
| 239 return result.toString(); | |
| 240 // Comment the BASE tag when serializing DOM. | |
| 241 if (isHTMLBaseElement(*element)) { | |
| 242 result.appendLiteral("-->"); | |
| 243 // Append a new base tag declaration. | |
| 244 result.append(WebPageSerializer::generateBaseTagDeclaration( | |
| 245 param->document->baseTarget())); | |
| 246 } | |
| 247 | |
| 248 return result.toString(); | |
| 249 } | |
| 250 | |
| 251 void WebPageSerializerImpl::saveHTMLContentToBuffer( | |
| 252 const String& result, SerializeDomParam* param) | |
| 253 { | |
| 254 m_dataBuffer.append(result); | |
| 255 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, | |
| 256 param, | |
| 257 DoNotForceFlush); | |
| 258 } | |
| 259 | |
| 260 void WebPageSerializerImpl::encodeAndFlushBuffer( | |
| 261 WebPageSerializerClient::PageSerializationStatus status, | |
| 262 SerializeDomParam* param, | |
| 263 FlushOption flushOption) | |
| 264 { | |
| 265 // Data buffer is not full nor do we want to force flush. | |
| 266 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity
) | |
| 267 return; | |
| 268 | |
| 269 String content = m_dataBuffer.toString(); | |
| 270 m_dataBuffer.clear(); | |
| 271 | |
| 272 CString encodedContent = param->textEncoding.encode(content, WTF::EntitiesFo
rUnencodables); | |
| 273 | |
| 274 // Send result to the client. | |
| 275 m_client->didSerializeDataForFrame(WebCString(encodedContent), status); | |
| 276 } | |
| 277 | |
| 278 // TODO(yosin): We should utilize |MarkupFormatter| here to share code, | |
| 279 // especially escaping attribute values, done by |WebEntities| |m_htmlEntities| | |
| 280 // and |m_xmlEntities|. | |
| 281 void WebPageSerializerImpl::openTagToString(Element* element, | |
| 282 SerializeDomParam* param) | |
| 283 { | |
| 284 bool needSkip; | |
| 285 StringBuilder result; | |
| 286 // Do pre action for open tag. | |
| 287 result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip)); | |
| 288 if (needSkip) | |
| 289 return; | |
| 290 // Add open tag | |
| 291 result.append('<'); | |
| 292 result.append(element->nodeName().lower()); | |
| 293 // Go through all attributes and serialize them. | |
| 294 AttributeCollection attributes = element->attributes(); | |
| 295 AttributeCollection::iterator end = attributes.end(); | |
| 296 for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it)
{ | |
| 297 result.append(' '); | |
| 298 // Add attribute pair | |
| 299 result.append(it->name().toString()); | |
| 300 result.appendLiteral("=\""); | |
| 301 if (!it->value().isEmpty()) { | |
| 302 const String& attrValue = it->value(); | |
| 303 | |
| 304 // Check whether we need to replace some resource links | |
| 305 // with local resource paths. | |
| 306 const QualifiedName& attrName = it->name(); | |
| 307 if (element->hasLegalLinkAttribute(attrName)) { | |
| 308 // For links start with "javascript:", we do not change it. | |
| 309 if (attrValue.startsWith("javascript:", TextCaseInsensitive)) { | |
| 310 result.append(m_htmlEntities.convertEntitiesInString(attrVal
ue)); | |
| 311 } else { | |
| 312 // Get the absolute link | |
| 313 String completeURL = param->document->completeURL(attrValue)
; | |
| 314 // Check whether we have local files for those link. | |
| 315 if (m_localLinks.contains(completeURL)) { | |
| 316 result.append(m_htmlEntities.convertEntitiesInString(m_l
ocalLinks.get(completeURL))); | |
| 317 } else { | |
| 318 result.append(m_htmlEntities.convertEntitiesInString(com
pleteURL)); | |
| 319 } | |
| 320 } | |
| 321 } else { | |
| 322 if (param->isHTMLDocument) | |
| 323 result.append(m_htmlEntities.convertEntitiesInString(attrVal
ue)); | |
| 324 else | |
| 325 result.append(m_xmlEntities.convertEntitiesInString(attrValu
e)); | |
| 326 } | |
| 327 } | |
| 328 result.append('\"'); | |
| 329 } | |
| 330 | |
| 331 // Do post action for open tag. | |
| 332 String addedContents = postActionAfterSerializeOpenTag(element, param); | |
| 333 // Complete the open tag for element when it has child/children. | |
| 334 if (element->hasChildren() || param->haveAddedContentsBeforeEnd) | |
| 335 result.append('>'); | |
| 336 // Append the added contents generate in post action of open tag. | |
| 337 result.append(addedContents); | |
| 338 // Save the result to data buffer. | |
| 339 saveHTMLContentToBuffer(result.toString(), param); | |
| 340 } | |
| 341 | |
| 342 // Serialize end tag of an specified element. | |
| 343 void WebPageSerializerImpl::endTagToString(Element* element, | |
| 344 SerializeDomParam* param) | |
| 345 { | |
| 346 bool needSkip; | |
| 347 StringBuilder result; | |
| 348 // Do pre action for end tag. | |
| 349 result.append(preActionBeforeSerializeEndTag(element, param, &needSkip)); | |
| 350 if (needSkip) | |
| 351 return; | |
| 352 // Write end tag when element has child/children. | |
| 353 if (element->hasChildren() || param->haveAddedContentsBeforeEnd) { | |
| 354 result.appendLiteral("</"); | |
| 355 result.append(element->nodeName().lower()); | |
| 356 result.append('>'); | |
| 357 } else { | |
| 358 // Check whether we have to write end tag for empty element. | |
| 359 if (param->isHTMLDocument) { | |
| 360 result.append('>'); | |
| 361 // FIXME: This code is horribly wrong. WebPageSerializerImpl must d
ie. | |
| 362 if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsI
nsertHTML()) { | |
| 363 // We need to write end tag when it is required. | |
| 364 result.appendLiteral("</"); | |
| 365 result.append(element->nodeName().lower()); | |
| 366 result.append('>'); | |
| 367 } | |
| 368 } else { | |
| 369 // For xml base document. | |
| 370 result.appendLiteral(" />"); | |
| 371 } | |
| 372 } | |
| 373 // Do post action for end tag. | |
| 374 result.append(postActionAfterSerializeEndTag(element, param)); | |
| 375 // Save the result to data buffer. | |
| 376 saveHTMLContentToBuffer(result.toString(), param); | |
| 377 } | |
| 378 | |
| 379 void WebPageSerializerImpl::buildContentForNode(Node* node, | |
| 380 SerializeDomParam* param) | |
| 381 { | |
| 382 switch (node->nodeType()) { | |
| 383 case Node::ELEMENT_NODE: | |
| 384 // Process open tag of element. | |
| 385 openTagToString(toElement(node), param); | |
| 386 // Walk through the children nodes and process it. | |
| 387 for (Node *child = node->firstChild(); child; child = child->nextSibling
()) | |
| 388 buildContentForNode(child, param); | |
| 389 // Process end tag of element. | |
| 390 endTagToString(toElement(node), param); | |
| 391 break; | |
| 392 case Node::TEXT_NODE: | |
| 393 saveHTMLContentToBuffer(createMarkup(node), param); | |
| 394 break; | |
| 395 case Node::ATTRIBUTE_NODE: | |
| 396 case Node::DOCUMENT_NODE: | |
| 397 case Node::DOCUMENT_FRAGMENT_NODE: | |
| 398 // Should not exist. | |
| 399 ASSERT_NOT_REACHED(); | |
| 400 break; | |
| 401 // Document type node can be in DOM? | |
| 402 case Node::DOCUMENT_TYPE_NODE: | |
| 403 param->haveSeenDocType = true; | |
| 404 default: | |
| 405 // For other type node, call default action. | |
| 406 saveHTMLContentToBuffer(createMarkup(node), param); | |
| 407 break; | |
| 408 } | |
| 409 } | |
| 410 | |
| 411 WebPageSerializerImpl::WebPageSerializerImpl( | |
| 412 WebLocalFrame* frame, | |
| 413 WebPageSerializerClient* client, | |
| 414 const WebVector<std::pair<WebURL, WebString>>& urlsToLocalPaths) | |
| 415 : m_client(client) | |
| 416 , m_htmlEntities(false) | |
| 417 , m_xmlEntities(true) | |
| 418 { | |
| 419 // Must specify available webframe. | |
| 420 ASSERT(frame); | |
| 421 m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame); | |
| 422 // Make sure we have non 0 client. | |
| 423 ASSERT(client); | |
| 424 // Build local resources map. | |
| 425 for (const auto& it : urlsToLocalPaths) { | |
| 426 KURL url = it.first; | |
| 427 ASSERT(!m_localLinks.contains(url.string())); | |
| 428 m_localLinks.set(url.string(), it.second); | |
| 429 } | |
| 430 | |
| 431 ASSERT(m_dataBuffer.isEmpty()); | |
| 432 } | |
| 433 | |
| 434 bool WebPageSerializerImpl::serialize() | |
| 435 { | |
| 436 bool didSerialization = false; | |
| 437 | |
| 438 Document* document = m_specifiedWebLocalFrameImpl->frame()->document(); | |
| 439 const KURL& url = document->url(); | |
| 440 | |
| 441 if (url.isValid()) { | |
| 442 didSerialization = true; | |
| 443 | |
| 444 const WTF::TextEncoding& textEncoding = document->encoding().isValid() ?
document->encoding() : UTF8Encoding(); | |
| 445 if (textEncoding.isNonByteBasedEncoding()) { | |
| 446 const UChar byteOrderMark = 0xFEFF; | |
| 447 m_dataBuffer.append(byteOrderMark); | |
| 448 } | |
| 449 | |
| 450 SerializeDomParam param(url, textEncoding, document); | |
| 451 | |
| 452 Element* documentElement = document->documentElement(); | |
| 453 if (documentElement) | |
| 454 buildContentForNode(documentElement, ¶m); | |
| 455 | |
| 456 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &p
aram, ForceFlush); | |
| 457 } else { | |
| 458 // Report empty contents for invalid URLs. | |
| 459 m_client->didSerializeDataForFrame( | |
| 460 WebCString(), WebPageSerializerClient::CurrentFrameIsFinished); | |
| 461 } | |
| 462 | |
| 463 ASSERT(m_dataBuffer.isEmpty()); | |
| 464 return didSerialization; | |
| 465 } | |
| 466 | |
| 467 } // namespace blink | |
| OLD | NEW |