Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(378)

Side by Side Diff: Source/web/WebPageSerializer.cpp

Issue 68613003: Merges the two different page serializers (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Remove newline after XML decl Created 7 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « Source/core/page/PageSerializer.cpp ('k') | Source/web/WebPageSerializerImpl.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2009 Google Inc. All rights reserved. 2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are 5 * modification, are permitted provided that the following conditions are
6 * met: 6 * met:
7 * 7 *
8 * * Redistributions of source code must retain the above copyright 8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above 10 * * Redistributions in binary form must reproduce the above
(...skipping 17 matching lines...) Expand all
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */ 29 */
30 30
31 #include "config.h" 31 #include "config.h"
32 #include "WebPageSerializer.h" 32 #include "WebPageSerializer.h"
33 33
34 #include "HTMLNames.h" 34 #include "HTMLNames.h"
35 #include "WebFrame.h" 35 #include "WebFrame.h"
36 #include "WebFrameImpl.h" 36 #include "WebFrameImpl.h"
37 #include "WebPageSerializerClient.h" 37 #include "WebPageSerializerClient.h"
38 #include "WebPageSerializerImpl.h"
39 #include "WebView.h" 38 #include "WebView.h"
40 #include "WebViewImpl.h" 39 #include "WebViewImpl.h"
41 #include "core/dom/Document.h" 40 #include "core/dom/Document.h"
42 #include "core/dom/Element.h" 41 #include "core/dom/Element.h"
43 #include "core/html/HTMLAllCollection.h" 42 #include "core/html/HTMLAllCollection.h"
44 #include "core/html/HTMLFrameOwnerElement.h" 43 #include "core/html/HTMLFrameOwnerElement.h"
45 #include "core/html/HTMLInputElement.h" 44 #include "core/html/HTMLInputElement.h"
46 #include "core/html/HTMLTableElement.h" 45 #include "core/html/HTMLTableElement.h"
47 #include "core/loader/DocumentLoader.h" 46 #include "core/loader/DocumentLoader.h"
48 #include "core/frame/Frame.h" 47 #include "core/frame/Frame.h"
49 #include "core/page/PageSerializer.h" 48 #include "core/page/PageSerializer.h"
50 #include "platform/SerializedResource.h" 49 #include "platform/SerializedResource.h"
51 #include "platform/mhtml/MHTMLArchive.h" 50 #include "platform/mhtml/MHTMLArchive.h"
52 #include "platform/weborigin/KURL.h" 51 #include "platform/weborigin/KURL.h"
53 #include "public/platform/WebCString.h" 52 #include "public/platform/WebCString.h"
54 #include "public/platform/WebString.h" 53 #include "public/platform/WebString.h"
55 #include "public/platform/WebURL.h" 54 #include "public/platform/WebURL.h"
56 #include "public/platform/WebVector.h" 55 #include "public/platform/WebVector.h"
57 #include "wtf/Vector.h" 56 #include "wtf/Vector.h"
58 #include "wtf/text/StringConcatenate.h" 57 #include "wtf/text/StringConcatenate.h"
59 58
60 using namespace WebCore; 59 using namespace WebCore;
61 60
62 namespace {
63
64 KURL getSubResourceURLFromElement(Element* element)
65 {
66 ASSERT(element);
67 const QualifiedName* attributeName = 0;
68 if (element->hasTagName(HTMLNames::imgTag) || element->hasTagName(HTMLNames::scriptTag))
69 attributeName = &HTMLNames::srcAttr;
70 else if (element->hasTagName(HTMLNames::inputTag)) {
71 if (toHTMLInputElement(element)->isImageButton())
72 attributeName = &HTMLNames::srcAttr;
73 } else if (element->hasTagName(HTMLNames::bodyTag)
74 || isHTMLTableElement(element)
75 || element->hasTagName(HTMLNames::trTag)
76 || element->hasTagName(HTMLNames::tdTag))
77 attributeName = &HTMLNames::backgroundAttr;
78 else if (element->hasTagName(HTMLNames::blockquoteTag)
79 || element->hasTagName(HTMLNames::qTag)
80 || element->hasTagName(HTMLNames::delTag)
81 || element->hasTagName(HTMLNames::insTag))
82 attributeName = &HTMLNames::citeAttr;
83 else if (element->hasTagName(HTMLNames::linkTag)) {
84 // If the link element is not css, ignore it.
85 if (equalIgnoringCase(element->getAttribute(HTMLNames::typeAttr), "text/css")) {
86 // FIXME: Add support for extracting links of sub-resources which
87 // are inside style-sheet such as @import, @font-face, url(), etc.
88 attributeName = &HTMLNames::hrefAttr;
89 }
90 } else if (element->hasTagName(HTMLNames::objectTag))
91 attributeName = &HTMLNames::dataAttr;
92 else if (element->hasTagName(HTMLNames::embedTag))
93 attributeName = &HTMLNames::srcAttr;
94
95 if (!attributeName)
96 return KURL();
97
98 String value = element->getAttribute(*attributeName);
99 // Ignore javascript content.
100 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", false))
101 return KURL();
102
103 return element->document().completeURL(value);
104 }
105
106 void retrieveResourcesForElement(Element* element,
107 Vector<Frame*>* visitedFrames,
108 Vector<Frame*>* framesToVisit,
109 Vector<KURL>* frameURLs,
110 Vector<KURL>* resourceURLs)
111 {
112 // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
113 if ((element->hasTagName(HTMLNames::iframeTag) || element->hasTagName(HTMLNames::frameTag)
114 || element->hasTagName(HTMLNames::objectTag) || element->hasTagName(HTMLNames::embedTag))
115 && element->isFrameOwnerElement()) {
116 if (Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame()) {
117 if (!visitedFrames->contains(frame))
118 framesToVisit->append(frame);
119 return;
120 }
121 }
122
123 KURL url = getSubResourceURLFromElement(element);
124 if (url.isEmpty() || !url.isValid())
125 return; // No subresource for this node.
126
127 // Ignore URLs that have non-standard protocols. Since the FTP protocol
128 // does not have a cache mechanism, we skip it as well.
129 if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
130 return;
131
132 if (!resourceURLs->contains(url))
133 resourceURLs->append(url);
134 }
135
136 void retrieveResourcesForFrame(Frame* frame,
137 const blink::WebVector<blink::WebCString>& supportedSchemes,
138 Vector<Frame*>* visitedFrames,
139 Vector<Frame*>* framesToVisit,
140 Vector<KURL>* frameURLs,
141 Vector<KURL>* resourceURLs)
142 {
143 KURL frameURL = frame->loader().documentLoader()->request().url();
144
145 // If the frame's URL is invalid, ignore it, it is not retrievable.
146 if (!frameURL.isValid())
147 return;
148
149 // Ignore frames from unsupported schemes.
150 bool isValidScheme = false;
151 for (size_t i = 0; i < supportedSchemes.size(); ++i) {
152 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data())) {
153 isValidScheme = true;
154 break;
155 }
156 }
157 if (!isValidScheme)
158 return;
159
160 // If we have already seen that frame, ignore it.
161 if (visitedFrames->contains(frame))
162 return;
163 visitedFrames->append(frame);
164 if (!frameURLs->contains(frameURL))
165 frameURLs->append(frameURL);
166
167 // Now get the resources associated with each node of the document.
168 RefPtr<HTMLCollection> allNodes = frame->document()->all();
169 for (unsigned i = 0; i < allNodes->length(); ++i) {
170 Node* node = allNodes->item(i);
171 // We are only interested in HTML resources.
172 if (!node->isElementNode())
173 continue;
174 retrieveResourcesForElement(toElement(node),
175 visitedFrames, framesToVisit,
176 frameURLs, resourceURLs);
177 }
178 }
179
180 } // namespace
181
182 namespace blink { 61 namespace blink {
183 62
184 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam) 63 void WebPageSerializer::serialize(WebView* view, WebVector<WebPageSerializer::Resource>* resourcesParam)
185 { 64 {
186 Vector<SerializedResource> resources; 65 Vector<SerializedResource> resources;
187 PageSerializer serializer(&resources); 66 PageSerializer serializer(&resources);
188 serializer.serialize(toWebViewImpl(view)->page()); 67 serializer.serialize(toWebViewImpl(view)->page());
189 68
190 Vector<Resource> result; 69 Vector<Resource> result;
191 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) { 70 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
223 return WebCString(mhtml->data(), mhtml->size()); 102 return WebCString(mhtml->data(), mhtml->size());
224 } 103 }
225 104
226 bool WebPageSerializer::serialize(WebFrame* frame, 105 bool WebPageSerializer::serialize(WebFrame* frame,
227 bool recursive, 106 bool recursive,
228 WebPageSerializerClient* client, 107 WebPageSerializerClient* client,
229 const WebVector<WebURL>& links, 108 const WebVector<WebURL>& links,
230 const WebVector<WebString>& localPaths, 109 const WebVector<WebString>& localPaths,
231 const WebString& localDirectoryName) 110 const WebString& localDirectoryName)
232 { 111 {
233 WebPageSerializerImpl serializerImpl( 112 ASSERT(frame);
234 frame, recursive, client, links, localPaths, localDirectoryName); 113 ASSERT(client);
235 return serializerImpl.serialize(); 114 ASSERT(links.size() == localPaths.size());
236 }
237 115
238 bool WebPageSerializer::retrieveAllResources(WebView* view, 116 LinkLocalPathMap m_localLinks;
239 const WebVector<WebCString>& supportedSchemes,
240 WebVector<WebURL>* resourceURLs,
241 WebVector<WebURL>* frameURLs) {
242 WebFrameImpl* mainFrame = toWebFrameImpl(view->mainFrame());
243 if (!mainFrame)
244 return false;
245 117
246 Vector<Frame*> framesToVisit; 118 for (size_t i = 0; i < links.size(); i++) {
247 Vector<Frame*> visitedFrames; 119 KURL url = links[i];
248 Vector<KURL> frameKURLs; 120 ASSERT(!m_localLinks.contains(url.string()));
249 Vector<KURL> resourceKURLs; 121 m_localLinks.set(url.string(), localPaths[i]);
250
251 // Let's retrieve the resources from every frame in this page.
252 framesToVisit.append(mainFrame->frame());
253 while (!framesToVisit.isEmpty()) {
254 Frame* frame = framesToVisit[0];
255 framesToVisit.remove(0);
256 retrieveResourcesForFrame(frame, supportedSchemes,
257 &visitedFrames, &framesToVisit,
258 &frameKURLs, &resourceKURLs);
259 } 122 }
260 123
261 // Converts the results to WebURLs. 124 Vector<SerializedResource> resources;
262 WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); 125 PageSerializer serializer(&resources, &m_localLinks, localDirectoryName);
263 for (size_t i = 0; i < resourceKURLs.size(); ++i) { 126 serializer.serialize(toWebViewImpl(frame->view())->page());
264 resultResourceURLs[i] = resourceKURLs[i]; 127
265 // A frame's src can point to the same URL as another resource, keep the 128 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); iter != resources.end(); ++iter) {
266 // resource URL only in such cases. 129 client->didSerializeDataForFrame(iter->url, WebCString(iter->data->data(), iter->data->size()), WebPageSerializerClient::CurrentFrameIsFinished);
267 size_t index = frameKURLs.find(resourceKURLs[i]);
268 if (index != kNotFound)
269 frameKURLs.remove(index);
270 } 130 }
271 *resourceURLs = resultResourceURLs; 131 client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerializerClient::AllFramesAreFinished);
272 WebVector<WebURL> resultFrameURLs(frameKURLs.size());
273 for (size_t i = 0; i < frameKURLs.size(); ++i)
274 resultFrameURLs[i] = frameKURLs[i];
275 *frameURLs = resultFrameURLs;
276
277 return true; 132 return true;
278 } 133 }
279 134
280 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset) 135 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& charset)
281 { 136 {
282 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">"; 137 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=" + static_cast<const String&>(charset) + "\">";
283 return charsetString; 138 return charsetString;
284 } 139 }
285 140
286 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) 141 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
287 { 142 {
288 return String::format("\n<!-- saved from url=(%04d)%s -->\n", 143 return String::format("\n<!-- saved from url=(%04d)%s -->\n",
289 static_cast<int>(url.spec().length()), 144 static_cast<int>(url.spec().length()),
290 url.spec().data()); 145 url.spec().data());
291 } 146 }
292 147
293 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget) 148 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTarget)
294 { 149 {
295 if (baseTarget.isEmpty()) 150 if (baseTarget.isEmpty())
296 return String("<base href=\".\">"); 151 return String("<base href=\".\">");
297 String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">"; 152 String baseString = "<base href=\".\" target=\"" + static_cast<const String&>(baseTarget) + "\">";
298 return baseString; 153 return baseString;
299 } 154 }
300 155
301 } // namespace blink 156 } // namespace blink
OLDNEW
« no previous file with comments | « Source/core/page/PageSerializer.cpp ('k') | Source/web/WebPageSerializerImpl.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698