Index: Source/web/WebPageSerializerImpl.cpp |
diff --git a/Source/web/WebPageSerializerImpl.cpp b/Source/web/WebPageSerializerImpl.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..53a421db9cd5b5cfaeac2cdf96615fc3d9ae73b8 |
--- /dev/null |
+++ b/Source/web/WebPageSerializerImpl.cpp |
@@ -0,0 +1,520 @@ |
+/* |
+ * Copyright (C) 2009 Google Inc. All rights reserved. |
+ * |
+ * Redistribution and use in source and binary forms, with or without |
+ * modification, are permitted provided that the following conditions are |
+ * met: |
+ * |
+ * * Redistributions of source code must retain the above copyright |
+ * notice, this list of conditions and the following disclaimer. |
+ * * Redistributions in binary form must reproduce the above |
+ * copyright notice, this list of conditions and the following disclaimer |
+ * in the documentation and/or other materials provided with the |
+ * distribution. |
+ * * Neither the name of Google Inc. nor the names of its |
+ * contributors may be used to endorse or promote products derived from |
+ * this software without specific prior written permission. |
+ * |
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
+ */ |
+ |
+// How we handle the base tag better. |
+// Current status: |
+// Currently, the normal way we handle the base tag is: |
+// a) Links which have corresponding locally saved files, such as savable |
+// CSS and JavaScript files, are written as relative URLs which point to |
+// the locally saved files. These links cannot be resolved as absolute |
+// file URLs because, if they were, moving the saved files from one |
+// directory to another directory would turn the file URLs into dead |
+// links. |
+// b) Links which do not have corresponding locally saved files, such as |
+// links in A, AREA tags, are resolved as absolute URLs. |
+// c) We comment out all base tags when serializing the DOM for the page. |
+// FireFox also uses above way to handle base tag. |
+// |
+// Problem: |
+// This way can not handle the following situation: |
+// the base tag is written by JavaScript. |
+// For example. The page "www.yahoo.com" use |
+// "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL |
+// of page when loading page. So when saving page as completed-HTML, we assume |
+// that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved |
+// completed-HTML page, then the JavaScript will insert a base tag |
+// <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to |
+// local saved resource files will be resolved as |
+// "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource |
+// files can not be loaded correctly. Also the page will be rendered ugly since |
+// all saved sub-resource files (such as CSS, JavaScript files) and sub-frame |
+// files can not be fetched. |
+// Now FireFox, IE and WebKit based Browser all have this problem. |
+// |
+// Solution: |
+// My solution is that we comment old base tag and write new base tag: |
+// <base href="." ...> after the previous commented base tag. In WebKit, it |
+// always uses the latest "href" attribute of base tag to set document's base |
+// URL. Based on this behavior, when we encounter a base tag, we comment it and |
+// write a new base tag <base href="."> after the previous commented base tag. |
+// The new added base tag can help engine to locate correct base URL for |
+// correctly loading local saved resource files. Also I think we need to inherit |
+// the base target value from document object when appending new base tag. |
+// If there are multiple base tags in original document, we will comment all old |
+// base tags and append new base tag after each old base tag because we do not |
+// know those old base tags are original content or added by JavaScript. If |
+// they are added by JavaScript, it means when loading saved page, the script(s) |
+// will still insert base tag(s) to DOM, so the new added base tag(s) can |
+// override the incorrect base URL and make sure we always load correct local |
+// saved resource files. |
+ |
+#include "config.h" |
+#include "web/WebPageSerializerImpl.h" |
+ |
+#include "core/HTMLNames.h" |
+#include "core/dom/Document.h" |
+#include "core/dom/DocumentType.h" |
+#include "core/dom/Element.h" |
+#include "core/editing/Serialization.h" |
+#include "core/html/HTMLAllCollection.h" |
+#include "core/html/HTMLElement.h" |
+#include "core/html/HTMLFormElement.h" |
+#include "core/html/HTMLHtmlElement.h" |
+#include "core/html/HTMLMetaElement.h" |
+#include "core/loader/DocumentLoader.h" |
+#include "core/loader/FrameLoader.h" |
+#include "public/platform/WebVector.h" |
+#include "web/WebLocalFrameImpl.h" |
+#include "wtf/text/TextEncoding.h" |
+ |
+namespace blink { |
+ |
+// Maximum length of the data buffer which is used to temporarily hold generated |
+// html content data. This is a soft limit which might be exceeded if a very large |
+// contiguous string is found in the page. |
+static const unsigned dataBufferCapacity = 65536; |
+ |
+// Bundles the inputs of one frame's serialization pass (URL, output encoding, |
+// document, resource directory name) and starts with every progress flag unset. |
+WebPageSerializerImpl::SerializeDomParam::SerializeDomParam( |
+    const KURL& url, const WTF::TextEncoding& textEncoding, Document* document, const String& directoryName) |
+    : url(url) |
+    , textEncoding(textEncoding) |
+    , document(document) |
+    , directoryName(directoryName) |
+    , isHTMLDocument(document->isHTMLDocument()) |
+    , haveSeenDocType(false) |
+    , haveAddedCharsetDeclaration(false) |
+    , skipMetaElement(nullptr) |
+    , isInScriptOrStyleTag(false) |
+    , haveAddedXMLProcessingDirective(false) |
+    , haveAddedContentsBeforeEnd(false) |
+{ |
+} |
+ |
+// Hook run before an element's open tag is written. Returns the markup that |
+// must precede the tag and sets |*needSkip| when the tag has to be omitted |
+// (the original charset-declaring META, which a generated META replaces). |
+String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( |
+    const Element* element, SerializeDomParam* param, bool* needSkip) |
+{ |
+    *needSkip = false; |
+    StringBuilder prefix; |
+ |
+    if (!param->isHTMLDocument) { |
+        // XML document: the XML declaration is written exactly once. |
+        if (!param->haveAddedXMLProcessingDirective) { |
+            param->haveAddedXMLProcessingDirective = true; |
+            // Encoding name: the XML encoding, else the document encoding, |
+            // else UTF-8. |
+            String encodingName = param->document->xmlEncoding(); |
+            if (encodingName.isEmpty()) |
+                encodingName = param->document->encodingName(); |
+            if (encodingName.isEmpty()) |
+                encodingName = UTF8Encoding().name(); |
+            prefix.appendLiteral("<?xml version=\""); |
+            prefix.append(param->document->xmlVersion()); |
+            prefix.appendLiteral("\" encoding=\""); |
+            prefix.append(encodingName); |
+            if (param->document->xmlStandalone()) |
+                prefix.appendLiteral("\" standalone=\"yes"); |
+            prefix.appendLiteral("\"?>\n"); |
+        } |
+        // The doctype markup is likewise written only the first time through. |
+        if (!param->haveSeenDocType) { |
+            param->haveSeenDocType = true; |
+            prefix.append(createMarkup(param->document->doctype())); |
+        } |
+        return prefix.toString(); |
+    } |
+ |
+    ASSERT(element); |
+    if (isHTMLMetaElement(*element)) { |
+        // A META whose http-equiv is "content-type" and whose content mentions |
+        // "charset" is skipped; it is remembered so that its end tag can be |
+        // skipped as well. |
+        const HTMLMetaElement& metaElement = toHTMLMetaElement(*element); |
+        String httpEquivValue = metaElement.httpEquiv(); |
+        if (!equalIgnoringCase(httpEquivValue, "content-type")) |
+            return prefix.toString(); |
+        String metaContent = metaElement.content(); |
+        if (metaContent.length() && metaContent.contains("charset", TextCaseInsensitive)) { |
+            param->skipMetaElement = element; |
+            *needSkip = true; |
+        } |
+    } else if (isHTMLHtmlElement(*element)) { |
+        // Before <html>: the doctype (first time only), then the Mark of the |
+        // Web declaration for the page URL. |
+        // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. |
+        if (!param->haveSeenDocType) { |
+            param->haveSeenDocType = true; |
+            prefix.append(createMarkup(param->document->doctype())); |
+        } |
+        prefix.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url)); |
+    } else if (isHTMLBaseElement(*element)) { |
+        // Open a comment so that the original BASE tag ends up commented out. |
+        prefix.appendLiteral("<!--"); |
+    } |
+    return prefix.toString(); |
+} |
+ |
+// Hook run after an element's open tag has been built. Returns markup to place |
+// right after the tag and records in |param| whether any was generated. |
+String WebPageSerializerImpl::postActionAfterSerializeOpenTag( |
+    const Element* element, SerializeDomParam* param) |
+{ |
+    StringBuilder result; |
+ |
+    param->haveAddedContentsBeforeEnd = false; |
+    if (!param->isHTMLDocument) |
+        return result.toString(); |
+    if (!param->haveAddedCharsetDeclaration && isHTMLHeadElement(*element)) { |
+        param->haveAddedCharsetDeclaration = true; |
+        // WebKit only pre-parses the first 512 bytes of the document, so a |
+        // charset META at the end of a large <HEAD> is seen too late and the |
+        // page is decoded incorrectly. Emit the generated META right after |
+        // the HEAD open tag; the original charset METAs are skipped in |
+        // preActionBeforeSerializeOpenTag. |
+        // See http://bugs.webkit.org/show_bug.cgi?id=16621. |
+        result.append(WebPageSerializer::generateMetaCharsetDeclaration( |
+            String(param->textEncoding.name()))); |
+        param->haveAddedContentsBeforeEnd = true; |
+    } else if (isHTMLScriptElement(*element) || isHTMLStyleElement(*element)) { |
+        // Entering SCRIPT or STYLE content; the matching end tag clears this. |
+        param->isInScriptOrStyleTag = true; |
+    } |
+ |
+    return result.toString(); |
+} |
+ |
+// Hook run before an element's end tag is written. Sets |*needSkip| when the |
+// end tag must be omitted; the returned string is always empty. |
+String WebPageSerializerImpl::preActionBeforeSerializeEndTag( |
+    const Element* element, SerializeDomParam* param, bool* needSkip) |
+{ |
+    String result; |
+ |
+    *needSkip = false; |
+    if (!param->isHTMLDocument) |
+        return result; |
+    // Skip the end tag of the original META tag which declares the charset. |
+    // No need to check the tag type: skipMetaElement is only ever set to a |
+    // META element. |
+    if (param->skipMetaElement == element) { |
+        *needSkip = true; |
+    } else if (isHTMLScriptElement(*element) || isHTMLStyleElement(*element)) { |
+        // Must mirror the condition used when the open tag set the flag. |
+        ASSERT(param->isInScriptOrStyleTag); |
+        param->isInScriptOrStyleTag = false; |
+    } |
+ |
+    return result; |
+} |
+ |
+// Hook run once an element's end tag has been written. Returns the markup to |
+// append after it; only BASE elements of HTML documents produce any. |
+String WebPageSerializerImpl::postActionAfterSerializeEndTag( |
+    const Element* element, SerializeDomParam* param) |
+{ |
+    StringBuilder suffix; |
+ |
+    const bool isHTMLBase = param->isHTMLDocument && isHTMLBaseElement(*element); |
+    if (isHTMLBase) { |
+        // Terminate the comment that was opened in front of the original BASE |
+        // tag, so the tag stays commented out in the saved page. |
+        suffix.appendLiteral("-->"); |
+        // Follow it with a newly generated BASE tag declaration built from |
+        // the document's base target. |
+        suffix.append(WebPageSerializer::generateBaseTagDeclaration(param->document->baseTarget())); |
+    } |
+ |
+    return suffix.toString(); |
+} |
+ |
+// Queues |result| in the data buffer; the buffer is flushed only once it is over capacity. |
+void WebPageSerializerImpl::saveHTMLContentToBuffer(const String& result, SerializeDomParam* param) |
+{ |
+    const WebPageSerializerClient::PageSerializationStatus frameStillOpen = |
+        WebPageSerializerClient::CurrentFrameIsNotFinished; |
+    m_dataBuffer.append(result); |
+    encodeAndFlushBuffer(frameStillOpen, param, DoNotForceFlush); |
+} |
+ |
+// Encodes the buffered markup with the frame's text encoding and delivers it |
+// to the client together with |status|. Unless |flushOption| is ForceFlush, |
+// nothing happens while the buffer is within dataBufferCapacity. |
+void WebPageSerializerImpl::encodeAndFlushBuffer( |
+    WebPageSerializerClient::PageSerializationStatus status, |
+    SerializeDomParam* param, |
+    FlushOption flushOption) |
+{ |
+    const bool bufferOverCapacity = m_dataBuffer.length() > dataBufferCapacity; |
+    if (flushOption != ForceFlush && !bufferOverCapacity) |
+        return; |
+ |
+    // Take the pending text out of the buffer before encoding it. |
+    String pendingText = m_dataBuffer.toString(); |
+    m_dataBuffer.clear(); |
+ |
+    CString encodedBytes = param->textEncoding.normalizeAndEncode(pendingText, WTF::EntitiesForUnencodables); |
+    m_client->didSerializeDataForFrame(param->url, WebCString(encodedBytes.data(), encodedBytes.length()), status); |
+} |
+ |
+// Serializes the open tag of |element| with all of its attributes into the data |
+// buffer. Link attributes are rewritten to the locally saved copy when one is |
+// known and made absolute otherwise. Every non-empty attribute value, rewritten |
+// or not, goes through entity conversion before it is written between quotes. |
+void WebPageSerializerImpl::openTagToString(Element* element, |
+    SerializeDomParam* param) |
+{ |
+    bool needSkip; |
+    StringBuilder result; |
+    // Do pre action for open tag. |
+    result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip)); |
+    if (needSkip) |
+        return; |
+    result.append('<'); |
+    result.append(element->nodeName().lower()); |
+    // Go through all attributes and serialize them. |
+    AttributeCollection attributes = element->attributes(); |
+    for (const auto& attribute : attributes) { |
+        result.append(' '); |
+        result.append(attribute.name().toString()); |
+        result.appendLiteral("=\""); |
+        if (!attribute.value().isEmpty()) { |
+            // |value| is the text that goes between the quotes, before escaping. |
+            String value = attribute.value(); |
+            // Links starting with "javascript:" are not rewritten; any other |
+            // link becomes a local resource path or an absolute URL. |
+            if (element->hasLegalLinkAttribute(attribute.name()) |
+                && !value.startsWith("javascript:", TextCaseInsensitive)) { |
+                // For a frame owner element, the URL of the sub-frame's document |
+                // is used rather than the attribute value. |
+                WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOwnerElement(element); |
+                String completeURL = subFrame ? subFrame->frame()->document()->url() : |
+                    param->document->completeURL(value); |
+                if (m_localLinks.contains(completeURL)) { |
+                    // Relative path to the saved file, inside the resource |
+                    // directory when a directory name is set. |
+                    StringBuilder localPath; |
+                    if (!param->directoryName.isEmpty()) { |
+                        localPath.appendLiteral("./"); |
+                        localPath.append(param->directoryName); |
+                        localPath.append('/'); |
+                    } |
+                    localPath.append(m_localLinks.get(completeURL)); |
+                    value = localPath.toString(); |
+                } else { |
+                    value = completeURL; |
+                } |
+            } |
+            // Never write the value raw: it may contain '"', '<' or '&', which |
+            // would corrupt the markup or let page content inject attributes. |
+            if (param->isHTMLDocument) |
+                result.append(m_htmlEntities.convertEntitiesInString(value)); |
+            else |
+                result.append(m_xmlEntities.convertEntitiesInString(value)); |
+        } |
+        result.append('\"'); |
+    } |
+ |
+    // Do post action for open tag. |
+    String addedContents = postActionAfterSerializeOpenTag(element, param); |
+    // The '>' is written here only when the element has children or the post |
+    // action added contents; otherwise the end tag serialization closes the tag. |
+    if (element->hasChildren() || param->haveAddedContentsBeforeEnd) |
+        result.append('>'); |
+    result.append(addedContents); |
+    saveHTMLContentToBuffer(result.toString(), param); |
+} |
+ |
+// Serializes the end of |element| into the data buffer. When the open tag was |
+// left without its '>' (no children and nothing added after the open tag), the |
+// tag is completed here as well. |
+void WebPageSerializerImpl::endTagToString(Element* element, SerializeDomParam* param) |
+{ |
+    bool needSkip; |
+    StringBuilder tagText; |
+    // Pre action for the end tag; it may request that the tag be dropped. |
+    tagText.append(preActionBeforeSerializeEndTag(element, param, &needSkip)); |
+    if (needSkip) |
+        return; |
+    const bool openTagWasCompleted = element->hasChildren() || param->haveAddedContentsBeforeEnd; |
+    bool writeEndTag = true; |
+    if (!openTagWasCompleted) { |
+        if (param->isHTMLDocument) { |
+            // Complete the open tag first. |
+            tagText.append('>'); |
+            // FIXME: This code is horribly wrong. WebPageSerializerImpl must die. |
+            // The end tag is omitted only for HTML elements that report |
+            // ieForbidsInsertHTML(). |
+            writeEndTag = !element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML(); |
+        } else { |
+            // XML document: an empty element is self-closed. |
+            tagText.appendLiteral(" />"); |
+            writeEndTag = false; |
+        } |
+    } |
+    if (writeEndTag) { |
+        tagText.appendLiteral("</"); |
+        tagText.append(element->nodeName().lower()); |
+        tagText.append('>'); |
+    } |
+    // Post action for the end tag, then store everything in the data buffer. |
+    tagText.append(postActionAfterSerializeEndTag(element, param)); |
+    saveHTMLContentToBuffer(tagText.toString(), param); |
+} |
+ |
+// Serializes |node| and, for elements, its whole subtree in document order. |
+void WebPageSerializerImpl::buildContentForNode(Node* node, SerializeDomParam* param) |
+{ |
+    switch (node->nodeType()) { |
+    case Node::ELEMENT_NODE: { |
+        Element* element = toElement(node); |
+        // Open tag, then every child recursively, then the end tag. |
+        openTagToString(element, param); |
+        for (Node* child = node->firstChild(); child; child = child->nextSibling()) |
+            buildContentForNode(child, param); |
+        endTagToString(element, param); |
+        return; |
+    } |
+    case Node::ATTRIBUTE_NODE: |
+    case Node::DOCUMENT_NODE: |
+    case Node::DOCUMENT_FRAGMENT_NODE: |
+        // These node types are not expected while walking the tree. |
+        ASSERT_NOT_REACHED(); |
+        return; |
+    case Node::DOCUMENT_TYPE_NODE: |
+        // Remember that a doctype was met, then serialize it like any other |
+        // non-element node below. |
+        param->haveSeenDocType = true; |
+        break; |
+    default: |
+        // Text nodes and all remaining node types need no extra handling. |
+        break; |
+    } |
+    // Default action: write the node's own markup. |
+    saveHTMLContentToBuffer(createMarkup(node), param); |
+} |
+ |
+// Sets up a serializer for |frame|. |links| and |localPaths| are parallel |
+// arrays: links[i] is recorded as being saved locally at localPaths[i]. |
+WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame, |
+    bool recursiveSerialization, WebPageSerializerClient* client, |
+    const WebVector<WebURL>& links, const WebVector<WebString>& localPaths, |
+    const WebString& localDirectoryName) |
+    : m_client(client) |
+    , m_recursiveSerialization(recursiveSerialization) |
+    , m_framesCollected(false) |
+    , m_localDirectoryName(localDirectoryName) |
+    , m_htmlEntities(false) |
+    , m_xmlEntities(true) |
+{ |
+    // A frame and a client are both mandatory. |
+    ASSERT(frame); |
+    m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame); |
+    ASSERT(client); |
+    // Fill the map from each link's URL string to its local path. |
+    ASSERT(links.size() == localPaths.size()); |
+    const size_t linkCount = links.size(); |
+    for (size_t index = 0; index != linkCount; ++index) { |
+        KURL linkURL = links[index]; |
+        ASSERT(!m_localLinks.contains(linkURL.string())); |
+        m_localLinks.set(linkURL.string(), localPaths[index]); |
+    } |
+ |
+    ASSERT(m_dataBuffer.isEmpty()); |
+} |
+ |
+// Builds the list of frames to serialize: the specified frame first and, when |
+// recursive serialization was requested, every frame found below it. |
+void WebPageSerializerImpl::collectTargetFrames() |
+{ |
+    ASSERT(!m_framesCollected); |
+    m_framesCollected = true; |
+ |
+    // First, process main frame. |
+    m_frames.append(m_specifiedWebLocalFrameImpl); |
+    // Return now if user only needs to serialize specified frame, not including |
+    // all sub-frames. |
+    if (!m_recursiveSerialization) |
+        return; |
+    // m_frames grows while it is being walked, so it must be indexed and its |
+    // size re-read on every pass: a range-based for would keep stale iterators |
+    // across append() and would never visit the frames added here. |
+    for (size_t i = 0; i < m_frames.size(); ++i) { |
+        Document* currentDoc = m_frames[i]->frame()->document(); |
+        // Go through sub-frames. |
+        RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all(); |
+        for (unsigned j = 0; Element* element = all->item(j); ++j) { |
+            if (!element->isHTMLElement()) |
+                continue; |
+            if (WebLocalFrameImpl* webFrame = WebLocalFrameImpl::fromFrameOwnerElement(element)) |
+                m_frames.append(webFrame); |
+        } |
+    } |
+} |
+ |
+// Serializes every collected frame whose document URL is valid and has an |
+// entry in the local link map, handing the encoded output to the client. |
+// Returns true if at least one frame was serialized. |
+bool WebPageSerializerImpl::serialize() |
+{ |
+    if (!m_framesCollected) |
+        collectTargetFrames(); |
+ |
+    bool didSerialization = false; |
+    KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url(); |
+ |
+    for (unsigned i = 0; i < m_frames.size(); ++i) { |
+        WebLocalFrameImpl* webFrame = m_frames[i]; |
+        Document* document = webFrame->frame()->document(); |
+        const KURL& url = document->url(); |
+        // Frames with an invalid URL or without a local link entry are skipped. |
+        if (!url.isValid() || !m_localLinks.contains(url.string())) |
+            continue; |
+        didSerialization = true; |
+        const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding(); |
+        // Only a frame whose URL equals the main URL uses the local directory name. |
+        String directoryName = url == mainURL ? m_localDirectoryName : ""; |
+        SerializeDomParam param(url, textEncoding, document, directoryName); |
+        Element* documentElement = document->documentElement(); |
+        if (documentElement) |
+            buildContentForNode(documentElement, &param); |
+ |
+        // Force out whatever is still buffered and report this frame as finished. |
+        encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush); |
+    } |
+ |
+    ASSERT(m_dataBuffer.isEmpty()); |
+    m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished); |
+    return didSerialization; |
+} |
+ |
+} // namespace blink |