Index: third_party/WebKit/Source/web/WebPageSerializerImpl.cpp |
diff --git a/third_party/WebKit/Source/web/WebPageSerializerImpl.cpp b/third_party/WebKit/Source/web/WebPageSerializerImpl.cpp |
deleted file mode 100644 |
index 545f3a00256b4cd7bfba84bc182dfab911344680..0000000000000000000000000000000000000000 |
--- a/third_party/WebKit/Source/web/WebPageSerializerImpl.cpp |
+++ /dev/null |
@@ -1,467 +0,0 @@ |
-/* |
- * Copyright (C) 2009 Google Inc. All rights reserved. |
- * |
- * Redistribution and use in source and binary forms, with or without |
- * modification, are permitted provided that the following conditions are |
- * met: |
- * |
- * * Redistributions of source code must retain the above copyright |
- * notice, this list of conditions and the following disclaimer. |
- * * Redistributions in binary form must reproduce the above |
- * copyright notice, this list of conditions and the following disclaimer |
- * in the documentation and/or other materials provided with the |
- * distribution. |
- * * Neither the name of Google Inc. nor the names of its |
- * contributors may be used to endorse or promote products derived from |
- * this software without specific prior written permission. |
- * |
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
- * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT |
- * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
- * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
- * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
- */ |
- |
-// How we handle the base tag better. |
-// Current status: |
-// At present, the normal way we handle the base tag is: |
-// a) Links which have corresponding locally saved files, such as savable |
-// CSS and JavaScript files, are rewritten as relative URLs which point to |
-// the locally saved file. These links are not resolved to absolute file |
-// URLs because, if they were, moving the saved files from one directory |
-// to another would turn the file URLs into dead links. |
-// b) Links which have no corresponding locally saved files, such as |
-// links in A and AREA tags, are resolved to absolute URLs. |
-// c) We comment out all base tags when serializing the DOM for the page. |
-// Firefox also handles the base tag this way. |
-// |
-// Problem: |
-// This approach cannot handle the following situation: |
-// the base tag is written by JavaScript. |
-// For example, the page "www.yahoo.com" uses |
-// "document.write('<base href="http://www.yahoo.com/"...');" to set up the base |
-// URL of the page while it is loading. Suppose we save "www.yahoo.com" as |
-// completed-HTML to "c:\yahoo.htm". When we later load the saved |
-// completed-HTML page, the JavaScript inserts a base tag |
-// <base href="http://www.yahoo.com/"...> into the DOM, so all URLs which point |
-// to locally saved resource files are resolved as |
-// "http://www.yahoo.com/yahoo_files/...", which prevents all saved resource |
-// files from being loaded correctly. The page also renders badly because |
-// none of the saved sub-resource files (such as CSS and JavaScript files) or |
-// sub-frame files can be fetched. |
-// Currently Firefox, IE and WebKit-based browsers all have this problem. |
-// |
-// Solution: |
-// The solution is to comment out the old base tag and write a new base tag, |
-// <base href="." ...>, after the commented-out one. WebKit always uses the |
-// latest "href" attribute of a base tag to set the document's base URL. Based |
-// on this behavior, when we encounter a base tag, we comment it out and write |
-// a new base tag <base href="."> after it. The newly added base tag helps the |
-// engine locate the correct base URL for loading locally saved resource files. |
-// We also need to inherit the base target value from the document object when |
-// appending the new base tag. |
-// If there are multiple base tags in the original document, we comment out all |
-// of the old base tags and append a new base tag after each of them, because |
-// we do not know whether those old base tags are original content or were |
-// added by JavaScript. If they were added by JavaScript, the script(s) will |
-// insert the base tag(s) into the DOM again when the saved page is loaded, so |
-// the newly added base tag(s) override the incorrect base URL and make sure we |
-// always load the correct locally saved resource files. |
- |
-#include "web/WebPageSerializerImpl.h" |
- |
-#include "core/HTMLNames.h" |
-#include "core/dom/Document.h" |
-#include "core/dom/DocumentType.h" |
-#include "core/dom/Element.h" |
-#include "core/editing/serializers/Serialization.h" |
-#include "core/html/HTMLAllCollection.h" |
-#include "core/html/HTMLElement.h" |
-#include "core/html/HTMLFormElement.h" |
-#include "core/html/HTMLHtmlElement.h" |
-#include "core/html/HTMLMetaElement.h" |
-#include "core/loader/DocumentLoader.h" |
-#include "core/loader/FrameLoader.h" |
-#include "core/page/PageSerializer.h" |
-#include "public/platform/WebVector.h" |
-#include "web/WebLocalFrameImpl.h" |
-#include "wtf/text/TextEncoding.h" |
- |
-namespace blink { |
- |
-// Maximum length of the data buffer which is used to temporarily hold generated |
-// html content data. This is a soft limit which might be exceeded if a very |
-// large contiguous string is found in the page. |
-static const unsigned dataBufferCapacity = 65536; |
- |
-// Bundles the state shared by the serialization helpers for one document: |
-// the URL, the output text encoding and the document itself, plus progress |
-// flags which all start out cleared. |isHTMLDocument| is read from |
-// |document| once here, so |document| must not be null. |
-WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url, |
- const WTF::TextEncoding& textEncoding, |
- Document* document) |
-    : url(url) |
-    , textEncoding(textEncoding) |
-    , document(document) |
-    , isHTMLDocument(document->isHTMLDocument()) |
-    , haveSeenDocType(false) |
-    , haveAddedCharsetDeclaration(false) |
-    , skipMetaElement(nullptr) |
-    , isInScriptOrStyleTag(false) |
-    , haveAddedXMLProcessingDirective(false) |
-    , haveAddedContentsBeforeEnd(false) |
-{ |
-} |
- |
-// Returns the markup which has to be emitted in front of the open tag of |
-// |element|. |*needSkip| is set to true when the element itself must not be |
-// serialized (a META element which declares a valid charset) and to false |
-// otherwise. |
-String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( |
-    const Element* element, SerializeDomParam* param, bool* needSkip) |
-{ |
-    *needSkip = false; |
-    StringBuilder prefix; |
- |
-    if (!param->isHTMLDocument) { |
-        // XML document: the XML declaration is written exactly once. |
-        if (!param->haveAddedXMLProcessingDirective) { |
-            param->haveAddedXMLProcessingDirective = true; |
-            // Pick the encoding name: the declared XML encoding first, then |
-            // the document encoding, then UTF-8 as the last resort. |
-            String encodingName = param->document->xmlEncoding(); |
-            if (encodingName.isEmpty()) |
-                encodingName = param->document->encodingName(); |
-            if (encodingName.isEmpty()) |
-                encodingName = UTF8Encoding().name(); |
-            prefix.appendLiteral("<?xml version=\""); |
-            prefix.append(param->document->xmlVersion()); |
-            prefix.appendLiteral("\" encoding=\""); |
-            prefix.append(encodingName); |
-            if (param->document->xmlStandalone()) |
-                prefix.appendLiteral("\" standalone=\"yes"); |
-            prefix.appendLiteral("\"?>\n"); |
-        } |
-        // The doc type declaration is written once, if the document has it. |
-        if (!param->haveSeenDocType) { |
-            param->haveSeenDocType = true; |
-            prefix.append(createMarkup(param->document->doctype())); |
-        } |
-        return prefix.toString(); |
-    } |
- |
-    ASSERT(element); |
-    if (isHTMLMetaElement(element) && toHTMLMetaElement(element)->computeEncoding().isValid()) { |
-        // A META tag which declares a charset is dropped, because a META |
-        // with the correct charset is written right after the open tag of |
-        // the HEAD element. Remember it so that its end tag is dropped too. |
-        param->skipMetaElement = element; |
-        *needSkip = true; |
-        return prefix.toString(); |
-    } |
- |
-    if (isHTMLHtmlElement(*element)) { |
-        // The doc type declaration goes first, if it was not written yet. |
-        if (!param->haveSeenDocType) { |
-            param->haveSeenDocType = true; |
-            prefix.append(createMarkup(param->document->doctype())); |
-        } |
- |
-        // The MOTW declaration is placed before the html tag. |
-        // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. |
-        prefix.append(WebPageSerializer::generateMarkOfTheWebDeclaration(param->url)); |
-        return prefix.toString(); |
-    } |
- |
-    // Open the comment which wraps the original BASE tag. |
-    if (isHTMLBaseElement(*element)) |
-        prefix.appendLiteral("<!--"); |
- |
-    return prefix.toString(); |
-} |
- |
-// Runs after the attributes of |element|'s open tag have been serialized and |
-// returns extra markup to emit right after the open tag. For HTML documents |
-// a META charset declaration is injected after the open tag of the first |
-// HEAD element, and entering a SCRIPT or STYLE element is recorded in |
-// |param->isInScriptOrStyleTag|. |param->haveAddedContentsBeforeEnd| is |
-// always reset and is set again only when markup was injected. |
-String WebPageSerializerImpl::postActionAfterSerializeOpenTag( |
-    const Element* element, SerializeDomParam* param) |
-{ |
-    StringBuilder result; |
- |
-    param->haveAddedContentsBeforeEnd = false; |
-    if (!param->isHTMLDocument) |
-        return result.toString(); |
-    // Check after processing the open tag of HEAD element |
-    if (!param->haveAddedCharsetDeclaration |
-        && isHTMLHeadElement(*element)) { |
-        param->haveAddedCharsetDeclaration = true; |
-        // Check meta element. WebKit only pre-parses the first 512 bytes |
-        // of the document. If the whole <HEAD> is larger and the meta is at |
-        // the end of the head part, then such pages aren't decoded correctly |
-        // because of this issue. So when we serialize the DOM, we need to |
-        // make sure the meta is the first child of the head tag. |
-        // See http://bugs.webkit.org/show_bug.cgi?id=16621. |
-        // First we generate new content for writing correct META element. |
-        result.append(WebPageSerializer::generateMetaCharsetDeclaration( |
-            String(param->textEncoding.name()))); |
- |
-        param->haveAddedContentsBeforeEnd = true; |
-        // Will search each META which has charset declaration, and skip them all |
-        // in PreActionBeforeSerializeOpenTag. |
-    } else if (isHTMLScriptElement(*element) || isHTMLStyleElement(*element)) { |
-        // The second test used to repeat isHTMLScriptElement(), so STYLE |
-        // elements were never tracked despite the flag's name. |
-        param->isInScriptOrStyleTag = true; |
-    } |
- |
-    return result.toString(); |
-} |
- |
-// Runs before the end tag of |element| is serialized. The returned string is |
-// always empty. |*needSkip| is set to true when the end tag must be omitted |
-// (the element is the META element recorded in |param->skipMetaElement|) and |
-// to false otherwise. Leaving a SCRIPT or STYLE element clears |
-// |param->isInScriptOrStyleTag|. |
-String WebPageSerializerImpl::preActionBeforeSerializeEndTag( |
-    const Element* element, SerializeDomParam* param, bool* needSkip) |
-{ |
-    String result; |
- |
-    *needSkip = false; |
-    if (!param->isHTMLDocument) |
-        return result; |
-    // Skip the end tag of original META tag which declare charset. |
-    // Need not to check whether it's META tag since we guarantee |
-    // skipMetaElement is definitely META tag if it's not 0. |
-    if (param->skipMetaElement == element) { |
-        *needSkip = true; |
-    } else if (isHTMLScriptElement(*element) || isHTMLStyleElement(*element)) { |
-        // Must mirror the condition used when the open tag was serialized, |
-        // otherwise the assertion below fires for STYLE elements. |
-        ASSERT(param->isInScriptOrStyleTag); |
-        param->isInScriptOrStyleTag = false; |
-    } |
- |
-    return result; |
-} |
- |
-// Gives |element| a chance to emit extra markup once its end tag has been |
-// serialized. Only BASE elements of HTML documents produce output: the |
-// comment which wraps the original tag is closed and a new base tag |
-// declaration carrying the document's base target is appended. |
-String WebPageSerializerImpl::postActionAfterSerializeEndTag( |
-    const Element* element, SerializeDomParam* param) |
-{ |
-    StringBuilder trailer; |
- |
-    if (param->isHTMLDocument && isHTMLBaseElement(*element)) { |
-        // Terminate the comment which wraps the original BASE tag. |
-        trailer.appendLiteral("-->"); |
-        // Follow it with the new base tag declaration. |
-        trailer.append(WebPageSerializer::generateBaseTagDeclaration( |
-            param->document->baseTarget())); |
-    } |
- |
-    return trailer.toString(); |
-} |
- |
-// Appends |result| to the pending data buffer and then calls |
-// encodeAndFlushBuffer() without forcing a flush, reporting the frame as not |
-// finished. |
-void WebPageSerializerImpl::saveHTMLContentToBuffer( |
-    const String& result, SerializeDomParam* param) |
-{ |
-    m_dataBuffer.append(result); |
- |
-    const WebPageSerializerClient::PageSerializationStatus status = |
-        WebPageSerializerClient::CurrentFrameIsNotFinished; |
-    encodeAndFlushBuffer(status, param, DoNotForceFlush); |
-} |
- |
-// Encodes the pending data buffer with |param->textEncoding| and passes the |
-// bytes to the client together with |status|. Nothing happens unless the |
-// buffer has grown past |dataBufferCapacity| or |flushOption| is ForceFlush. |
-void WebPageSerializerImpl::encodeAndFlushBuffer( |
-    WebPageSerializerClient::PageSerializationStatus status, |
-    SerializeDomParam* param, |
-    FlushOption flushOption) |
-{ |
-    const bool overCapacity = m_dataBuffer.length() > dataBufferCapacity; |
-    const bool mustFlush = flushOption == ForceFlush || overCapacity; |
-    if (!mustFlush) |
-        return; |
- |
-    // Take the pending text out of the buffer before encoding it. |
-    String pending = m_dataBuffer.toString(); |
-    m_dataBuffer.clear(); |
- |
-    CString encoded = param->textEncoding.encode(pending, WTF::EntitiesForUnencodables); |
- |
-    // Hand the encoded bytes to the client. |
-    m_client->didSerializeDataForFrame(WebCString(encoded), status); |
-} |
- |
-// Serializes the open tag of |element| together with all of its attributes |
-// and saves the markup to the data buffer. Values of link attributes are |
-// replaced by the saved local path when one is known and by the absolute URL |
-// otherwise; links which start with "javascript:" are kept as they are. |
-// TODO(yosin): We should utilize |MarkupFormatter| here to share code, |
-// especially escaping attribute values, done by |WebEntities| |m_htmlEntities| |
-// and |m_xmlEntities|. |
-void WebPageSerializerImpl::openTagToString(Element* element, |
-    SerializeDomParam* param) |
-{ |
-    bool skipTag; |
-    StringBuilder markup; |
-    // Emit whatever has to precede the tag; a skipped element emits nothing. |
-    markup.append(preActionBeforeSerializeOpenTag(element, param, &skipTag)); |
-    if (skipTag) |
-        return; |
- |
-    // Tag name. |
-    markup.append('<'); |
-    markup.append(element->nodeName().lower()); |
- |
-    // One name="value" pair per attribute. |
-    for (const auto& attribute : element->attributes()) { |
-        markup.append(' '); |
-        markup.append(attribute.name().toString()); |
-        markup.appendLiteral("=\""); |
-        if (!attribute.value().isEmpty()) { |
-            const String& value = attribute.value(); |
-            const QualifiedName& name = attribute.name(); |
- |
-            if (!element->hasLegalLinkAttribute(name)) { |
-                // Ordinary attribute: only escape the value. |
-                if (param->isHTMLDocument) |
-                    markup.append(m_htmlEntities.convertEntitiesInString(value)); |
-                else |
-                    markup.append(m_xmlEntities.convertEntitiesInString(value)); |
-            } else if (value.startsWith("javascript:", TextCaseInsensitive)) { |
-                // Links which start with "javascript:" are not changed. |
-                markup.append(m_htmlEntities.convertEntitiesInString(value)); |
-            } else { |
-                // Resolve the link and prefer the local file saved for it. |
-                String completeURL = param->document->completeURL(value); |
-                if (m_localLinks.contains(completeURL)) |
-                    markup.append(m_htmlEntities.convertEntitiesInString(m_localLinks.get(completeURL))); |
-                else |
-                    markup.append(m_htmlEntities.convertEntitiesInString(completeURL)); |
-            } |
-        } |
-        markup.append('\"'); |
-    } |
- |
-    // Let the post action produce content which follows the open tag. |
-    const String injected = postActionAfterSerializeOpenTag(element, param); |
-    // The open tag is only completed here when something follows it: either |
-    // children of the element or the injected content. |
-    if (element->hasChildren() || param->haveAddedContentsBeforeEnd) |
-        markup.append('>'); |
-    markup.append(injected); |
- |
-    saveHTMLContentToBuffer(markup.toString(), param); |
-} |
- |
-// Serializes the end tag of |element| and saves the markup to the data |
-// buffer. For an element without children and without injected content this |
-// also completes the open tag, which was left unterminated. |
-void WebPageSerializerImpl::endTagToString(Element* element, |
-    SerializeDomParam* param) |
-{ |
-    bool skipTag; |
-    StringBuilder markup; |
-    // Emit whatever has to precede the end tag; a skipped element emits nothing. |
-    markup.append(preActionBeforeSerializeEndTag(element, param, &skipTag)); |
-    if (skipTag) |
-        return; |
- |
-    // The open tag was already completed if anything followed it. |
-    const bool openTagWasCompleted = element->hasChildren() || param->haveAddedContentsBeforeEnd; |
-    bool needsEndTag = openTagWasCompleted; |
-    if (!openTagWasCompleted) { |
-        if (param->isHTMLDocument) { |
-            markup.append('>'); |
-            // FIXME: This code is horribly wrong. WebPageSerializerImpl must die. |
-            // The end tag is written only when it is required. |
-            needsEndTag = !element->isHTMLElement() || !toHTMLElement(element)->ieForbidsInsertHTML(); |
-        } else { |
-            // XML based document: self-close the empty element. |
-            markup.appendLiteral(" />"); |
-        } |
-    } |
- |
-    if (needsEndTag) { |
-        markup.appendLiteral("</"); |
-        markup.append(element->nodeName().lower()); |
-        markup.append('>'); |
-    } |
- |
-    // Let the post action add content after the end tag. |
-    markup.append(postActionAfterSerializeEndTag(element, param)); |
-    saveHTMLContentToBuffer(markup.toString(), param); |
-} |
- |
-// Serializes |node| and, for elements, its whole subtree (recursively) into |
-// the data buffer. Every node which is not an element is written with |
-// createMarkup(). |
-void WebPageSerializerImpl::buildContentForNode(Node* node, |
-    SerializeDomParam* param) |
-{ |
-    switch (node->nodeType()) { |
-    case Node::ELEMENT_NODE: { |
-        Element* element = toElement(node); |
-        // Open tag, then every child in document order, then the end tag. |
-        openTagToString(element, param); |
-        for (Node* child = node->firstChild(); child; child = child->nextSibling()) |
-            buildContentForNode(child, param); |
-        endTagToString(element, param); |
-        return; |
-    } |
-    case Node::ATTRIBUTE_NODE: |
-    case Node::DOCUMENT_NODE: |
-    case Node::DOCUMENT_FRAGMENT_NODE: |
-        // Should not exist. |
-        ASSERT_NOT_REACHED(); |
-        return; |
-    case Node::DOCUMENT_TYPE_NODE: |
-        // Document type node can be in DOM? |
-        // Record it, then serialize it like any other node below. |
-        param->haveSeenDocType = true; |
-        break; |
-    default: |
-        break; |
-    } |
- |
-    // Text nodes and all remaining node types use the default markup. |
-    saveHTMLContentToBuffer(createMarkup(node), param); |
-} |
- |
-// Creates a serializer for |frame| which reports its output to |client|. |
-// |urlsToLocalPaths| pairs resource URLs with the local paths they were saved |
-// to; it is copied into |m_localLinks|. Both |frame| and |client| must be |
-// non-null. |
-WebPageSerializerImpl::WebPageSerializerImpl( |
-    WebLocalFrame* frame, |
-    WebPageSerializerClient* client, |
-    const WebVector<std::pair<WebURL, WebString>>& urlsToLocalPaths) |
-    : m_client(client) |
-    , m_htmlEntities(false) |
-    , m_xmlEntities(true) |
-{ |
-    // An available webframe and a non 0 client are required. |
-    ASSERT(frame); |
-    m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame); |
-    ASSERT(client); |
- |
-    // Fill the map of local resources; every URL may appear only once. |
-    for (const auto& entry : urlsToLocalPaths) { |
-        KURL url = entry.first; |
-        const String& key = url.string(); |
-        ASSERT(!m_localLinks.contains(key)); |
-        m_localLinks.set(key, entry.second); |
-    } |
- |
-    ASSERT(m_dataBuffer.isEmpty()); |
-} |
- |
-// Serializes the document of the specified frame and reports the encoded |
-// output to the client. Returns true when the document URL is valid and the |
-// document was serialized; otherwise empty contents are reported and false |
-// is returned. In both cases the client is told that the frame is finished. |
-bool WebPageSerializerImpl::serialize() |
-{ |
-    Document* document = m_specifiedWebLocalFrameImpl->frame()->document(); |
-    const KURL& url = document->url(); |
- |
-    if (!url.isValid()) { |
-        // Report empty contents for invalid URLs. |
-        m_client->didSerializeDataForFrame( |
-            WebCString(), WebPageSerializerClient::CurrentFrameIsFinished); |
-        ASSERT(m_dataBuffer.isEmpty()); |
-        return false; |
-    } |
- |
-    // Fall back to UTF-8 when the document has no valid encoding. |
-    const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding(); |
-    if (textEncoding.isNonByteBasedEncoding()) { |
-        // Non byte based encodings get a leading byte order mark. |
-        const UChar byteOrderMark = 0xFEFF; |
-        m_dataBuffer.append(byteOrderMark); |
-    } |
- |
-    SerializeDomParam param(url, textEncoding, document); |
- |
-    // The address-of operator had been mangled into a pilcrow here, which |
-    // does not compile; |param| is passed by pointer. |
-    Element* documentElement = document->documentElement(); |
-    if (documentElement) |
-        buildContentForNode(documentElement, &param); |
- |
-    // Push out everything that is still buffered and mark the frame finished. |
-    encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &param, ForceFlush); |
- |
-    ASSERT(m_dataBuffer.isEmpty()); |
-    return true; |
-} |
- |
-} // namespace blink |