| Index: trunk/Source/web/WebPageSerializer.cpp
|
| ===================================================================
|
| --- trunk/Source/web/WebPageSerializer.cpp (revision 162155)
|
| +++ trunk/Source/web/WebPageSerializer.cpp (working copy)
|
| @@ -35,6 +35,7 @@
|
| #include "WebFrame.h"
|
| #include "WebFrameImpl.h"
|
| #include "WebPageSerializerClient.h"
|
| +#include "WebPageSerializerImpl.h"
|
| #include "WebView.h"
|
| #include "WebViewImpl.h"
|
| #include "core/dom/Document.h"
|
| @@ -58,6 +59,126 @@
|
|
|
| using namespace WebCore;
|
|
|
| +namespace {
|
| +
|
| +KURL getSubResourceURLFromElement(Element* element) // Returns the completed URL held in the element's sub-resource attribute, or an empty KURL if there is none.
|
| +{
|
| + ASSERT(element);
|
| + const QualifiedName* attributeName = 0; // Stays null when the element carries no sub-resource attribute.
|
| + if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
|
| + attributeName = &HTMLNames::srcAttr;
|
| + else if (element->hasTagName(HTMLNames::inputTag)) {
|
| + if (toHTMLInputElement(element)->isImageButton())
|
| + attributeName = &HTMLNames::srcAttr;
|
| + } else if (element->hasTagName(HTMLNames::bodyTag)
|
| + || isHTMLTableElement(element)
|
| + || element->hasTagName(HTMLNames::trTag)
|
| + || element->hasTagName(HTMLNames::tdTag))
|
| + attributeName = &HTMLNames::backgroundAttr;
|
| + else if (element->hasTagName(HTMLNames::blockquoteTag)
|
| + || element->hasTagName(HTMLNames::qTag)
|
| + || element->hasTagName(HTMLNames::delTag)
|
| + || element->hasTagName(HTMLNames::insTag))
|
| + attributeName = &HTMLNames::citeAttr;
|
| + else if (element->hasTagName(HTMLNames::linkTag)) {
|
| + // Only link elements whose type attribute is "text/css" are considered; others are ignored.
|
| + if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
|
| + // FIXME: Add support for extracting links of sub-resources that are
|
| + // inside style sheets, such as @import, @font-face, url(), etc.
|
| + attributeName = &HTMLNames::hrefAttr;
|
| + }
|
| + } else if (element->hasTagName(HTMLNames::objectTag))
|
| + attributeName = &HTMLNames::dataAttr;
|
| + else if (element->hasTagName(HTMLNames::embedTag))
|
| + attributeName = &HTMLNames::srcAttr;
|
| +
|
| + if (!attributeName)
|
| + return KURL();
|
| +
|
| + String value = element->getAttribute(*attributeName);
|
| + // Ignore empty values and javascript: URLs (the prefix is tested after stripping whitespace).
|
| + if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
|
| + return KURL();
|
| +
|
| + return element->document().completeURL(value);
|
| +}
|
| +
|
| +void retrieveResourcesForElement(Element* element, // Element to inspect.
|
| + Vector<Frame*>* visitedFrames, // Frames already processed; only read here.
|
| + Vector<Frame*>* framesToVisit, // Receives the element's content frame when that frame is not in visitedFrames.
|
| + Vector<KURL>* frameURLs, // Not referenced in this function.
|
| + Vector<KURL>* resourceURLs) // Receives the element's sub-resource URL if not already present.
|
| +{
|
| + // If the element owns a content frame, queue that frame; it is processed later by retrieveResourcesForFrame.
|
| + if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag)
|
| + || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag))
|
| + && element->isFrameOwnerElement()) {
|
| + if (Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame()) {
|
| + if (!visitedFrames->contains(frame))
|
| + framesToVisit->append(frame);
|
| + return;
|
| + }
|
| + }
|
| +
|
| + KURL url = getSubResourceURLFromElement(element);
|
| + if (url.isEmpty() || !url.isValid())
|
| + return; // No sub-resource for this element.
|
| +
|
| + // Ignore URLs that have a non-standard protocol. Since the FTP protocol
|
| + // does not have a cache mechanism, we skip it as well.
|
| + if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
|
| + return;
|
| +
|
| + if (!resourceURLs->contains(url))
|
| + resourceURLs->append(url);
|
| +}
|
| +
|
| +void retrieveResourcesForFrame(Frame* frame, // Frame whose document is scanned.
|
| + const blink::WebVector<blink::WebCString>& supportedSchemes, // Schemes the frame URL must match for the frame to be scanned.
|
| + Vector<Frame*>* visitedFrames, // Frames already scanned; frame is appended once it is accepted.
|
| + Vector<Frame*>* framesToVisit, // Forwarded to retrieveResourcesForElement.
|
| + Vector<KURL>* frameURLs, // Receives the frame's URL if not already present.
|
| + Vector<KURL>* resourceURLs) // Forwarded to retrieveResourcesForElement.
|
| +{
|
| + KURL frameURL = frame->loader().documentLoader()->request().url(); // NOTE(review): documentLoader() is dereferenced unchecked - confirm it cannot be null here.
|
| +
|
| + // If the frame's URL is invalid, ignore it; it is not retrievable.
|
| + if (!frameURL.isValid())
|
| + return;
|
| +
|
| + // Ignore frames whose URL scheme is not listed in supportedSchemes.
|
| + bool isValidScheme = false;
|
| + for (size_t i = 0; i < supportedSchemes.size(); ++i) {
|
| + if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
|
| + isValidScheme = true;
|
| + break;
|
| + }
|
| + }
|
| + if (!isValidScheme)
|
| + return;
|
| +
|
| + // If we have already seen this frame, ignore it.
|
| + if (visitedFrames->contains(frame))
|
| + return;
|
| + visitedFrames->append(frame);
|
| + if (!frameURLs->contains(frameURL))
|
| + frameURLs->append(frameURL);
|
| +
|
| + // Now collect the resources referenced by each node of the document.
|
| + RefPtr<HTMLCollection> allNodes = frame->document()->all();
|
| + for (unsigned i = 0; i < allNodes->length(); ++i) {
|
| + Node* node = allNodes->item(i);
|
| + // Only element nodes are inspected; skip anything else.
|
| + if (!node->isElementNode())
|
| + continue;
|
| + retrieveResourcesForElement(toElement(node),
|
| + visitedFrames, framesToVisit,
|
| + frameURLs, resourceURLs);
|
| + }
|
| +}
|
| +
|
| +} // namespace
|
| +
|
| namespace blink {
|
|
|
| void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
|
| @@ -109,27 +230,72 @@
|
| const WebVector<WebString>& localPaths,
|
| const WebString& localDirectoryName)
|
| {
|
| - ASSERT(frame);
|
| - ASSERT(client);
|
| - ASSERT(links.size() == localPaths.size());
|
| + WebPageSerializerImpl serializerImpl(
|
| + frame, recursive, client, links, localPaths, localDirectoryName);
|
| + return serializerImpl.serialize();
|
| +}
|
|
|
| - LinkLocalPathMap m_localLinks;
|
| +bool WebPageSerializer::retrieveAllResources(WebView* view, // Collects frame and sub-resource URLs starting from the view's main frame; returns false if there is no main frame.
|
| + const WebVector<WebCString>& supportedSchemes, // Frames whose URL scheme is not listed are skipped.
|
| + WebVector<WebURL>* resourceURLs, // Out: sub-resource URLs found.
|
| + WebVector<WebURL>* frameURLs) { // Out: frame URLs that are not also in resourceURLs.
|
| + WebFrameImpl* mainFrame = toWebFrameImpl(view->mainFrame()); // NOTE(review): view is dereferenced without an ASSERT or null check - confirm callers never pass null.
|
| + if (!mainFrame)
|
| + return false;
|
|
|
| - for (size_t i = 0; i < links.size(); i++) {
|
| - KURL url = links[i];
|
| - ASSERT(!m_localLinks.contains(url.string()));
|
| - m_localLinks.set(url.string(), localPaths[i]);
|
| + Vector<Frame*> framesToVisit;
|
| + Vector<Frame*> visitedFrames;
|
| + Vector<KURL> frameKURLs;
|
| + Vector<KURL> resourceKURLs;
|
| +
|
| + // Process frames in FIFO order, starting with the main frame; scanning a frame may queue further frames.
|
| + framesToVisit.append(mainFrame->frame());
|
| + while (!framesToVisit.isEmpty()) {
|
| + Frame* frame = framesToVisit[0];
|
| + framesToVisit.remove(0);
|
| + retrieveResourcesForFrame(frame, supportedSchemes,
|
| + &visitedFrames, &framesToVisit,
|
| + &frameKURLs, &resourceKURLs);
|
| }
|
|
|
| - Vector<SerializedResource> resources;
|
| - PageSerializer serializer(&resources, &m_localLinks, localDirectoryName);
|
| - serializer.serialize(toWebViewImpl(frame->view())->page());
|
| + // Convert the results to WebURLs.
|
| + WebVector<WebURL> resultResourceURLs(resourceKURLs.size());
|
| + for (size_t i = 0; i < resourceKURLs.size(); ++i) {
|
| + resultResourceURLs[i] = resourceKURLs[i];
|
| + // A frame's src can point to the same URL as another resource; in that
|
| + // case keep only the resource URL and drop it from the frame URLs.
|
| + size_t index = frameKURLs.find(resourceKURLs[i]);
|
| + if (index != kNotFound)
|
| + frameKURLs.remove(index);
|
| + }
|
| + *resourceURLs = resultResourceURLs;
|
| + WebVector<WebURL> resultFrameURLs(frameKURLs.size());
|
| + for (size_t i = 0; i < frameKURLs.size(); ++i)
|
| + resultFrameURLs[i] = frameKURLs[i];
|
| + *frameURLs = resultFrameURLs;
|
|
|
| - for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
|
| - client->didSerializeDataForFrame(iter->url, WebCString(iter->data->data(), iter->data->size()), WebPageSerializerClient::CurrentFrameIsFinished);
|
| - }
|
| - client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
|
| return true;
|
| }
|
|
|
| +WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) // Returns a meta http-equiv Content-Type tag declaring text/html with the given charset.
|
| +{
|
| + String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">"; // charset is concatenated as-is; no escaping is applied here.
|
| + return charsetString;
|
| +}
|
| +
|
| +WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) // Returns a newline-delimited "saved from url" HTML comment carrying the URL spec and its length.
|
| +{
|
| + return String::format("\n<!-- saved from url=(%04d)%s -->\n",
|
| + static_cast<int>(url.spec().length()), // Length of the spec; %04d zero-pads it to at least four digits.
|
| + url.spec().data());
|
| +}
|
| +
|
| +WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget) // Returns a base tag with href="."; a target attribute is added only when baseTarget is non-empty.
|
| +{
|
| + if (baseTarget.isEmpty())
|
| + return String("<base href=\".\">");
|
| + String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">"; // baseTarget is concatenated as-is; no escaping is applied here.
|
| + return baseString;
|
| +}
|
| +
|
| } // namespace blink
|
|
|