Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(472)

Side by Side Diff: Source/web/WebPageSerializer.cpp

Issue 1177733003: Merge page serializers [12/12] (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | Source/web/WebPageSerializerImpl.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 /* 1 /*
2 * Copyright (C) 2009 Google Inc. All rights reserved. 2 * Copyright (C) 2009 Google Inc. All rights reserved.
3 * 3 *
4 * Redistribution and use in source and binary forms, with or without 4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are 5 * modification, are permitted provided that the following conditions are
6 * met: 6 * met:
7 * 7 *
8 * * Redistributions of source code must retain the above copyright 8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer. 9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above 10 * * Redistributions in binary form must reproduce the above
(...skipping 11 matching lines...) Expand all
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */ 29 */
30 30
31 #include "config.h" 31 #include "config.h"
32
32 #include "public/web/WebPageSerializer.h" 33 #include "public/web/WebPageSerializer.h"
33 34
34 #include "core/HTMLNames.h"
35 #include "core/dom/Document.h" 35 #include "core/dom/Document.h"
36 #include "core/dom/Element.h" 36 #include "core/dom/Element.h"
37 #include "core/frame/LocalFrame.h" 37 #include "core/frame/Frame.h"
38 #include "core/html/HTMLAllCollection.h" 38 #include "core/html/HTMLAllCollection.h"
39 #include "core/html/HTMLFrameElementBase.h"
40 #include "core/html/HTMLFrameOwnerElement.h" 39 #include "core/html/HTMLFrameOwnerElement.h"
41 #include "core/html/HTMLInputElement.h" 40 #include "core/html/HTMLInputElement.h"
42 #include "core/html/HTMLTableElement.h" 41 #include "core/html/HTMLTableElement.h"
43 #include "core/loader/DocumentLoader.h" 42 #include "core/loader/DocumentLoader.h"
44 #include "core/page/Page.h" 43 #include "core/page/Page.h"
45 #include "core/page/PageSerializer.h" 44 #include "core/page/PageSerializer.h"
46 #include "platform/SerializedResource.h" 45 #include "platform/SerializedResource.h"
47 #include "platform/mhtml/MHTMLArchive.h" 46 #include "platform/mhtml/MHTMLArchive.h"
48 #include "platform/weborigin/KURL.h" 47 #include "platform/weborigin/KURL.h"
49 #include "public/platform/WebCString.h" 48 #include "public/platform/WebCString.h"
50 #include "public/platform/WebString.h" 49 #include "public/platform/WebString.h"
51 #include "public/platform/WebURL.h" 50 #include "public/platform/WebURL.h"
52 #include "public/platform/WebVector.h" 51 #include "public/platform/WebVector.h"
53 #include "public/web/WebFrame.h" 52 #include "public/web/WebLocalFrame.h"
54 #include "public/web/WebPageSerializerClient.h" 53 #include "public/web/WebPageSerializerClient.h"
55 #include "public/web/WebView.h"
56 #include "web/WebLocalFrameImpl.h"
57 #include "web/WebPageSerializerImpl.h"
58 #include "web/WebViewImpl.h" 54 #include "web/WebViewImpl.h"
59 #include "wtf/Vector.h" 55 #include "wtf/Vector.h"
60 #include "wtf/text/StringConcatenate.h" 56 #include "wtf/text/StringConcatenate.h"
61 57
62 namespace blink { 58 namespace blink {
63 59
64 namespace { 60 namespace {
65 61
66 KURL getSubResourceURLFromElement(Element* element)
67 {
68 ASSERT(element);
69 const QualifiedName& attributeName = element->subResourceAttributeName();
70 if (attributeName == QualifiedName::null())
71 return KURL();
72
73 String value = element->getAttribute(attributeName);
74 // Ignore javascript content.
75 if (value.isEmpty() || value.stripWhiteSpace().startsWith("javascript:", Tex tCaseInsensitive))
76 return KURL();
77
78 return element->document().completeURL(value);
79 }
80
81 void retrieveResourcesForElement(Element* element,
82 Vector<LocalFrame*>* visitedFrames,
83 Vector<LocalFrame*>* framesToVisit,
84 Vector<KURL>* frameURLs,
85 Vector<KURL>* resourceURLs)
86 {
87 ASSERT(element);
88 // If the node is a frame, we'll process it later in retrieveResourcesForFrame.
89 if (isHTMLFrameElementBase(*element) || isHTMLObjectElement(*element) || isH TMLEmbedElement(*element)) {
90 Frame* frame = toHTMLFrameOwnerElement(element)->contentFrame();
91 if (frame && frame->isLocalFrame()) {
92 if (!visitedFrames->contains(toLocalFrame(frame)))
93 framesToVisit->append(toLocalFrame(frame));
94 return;
95 }
96 }
97
98 KURL url = getSubResourceURLFromElement(element);
99 if (url.isEmpty() || !url.isValid())
100 return; // No subresource for this node.
101
102 // Ignore URLs that have non-standard protocols. Since the FTP protocol
103 // does not have a cache mechanism, we skip it as well.
104 if (!url.protocolIsInHTTPFamily() && !url.isLocalFile())
105 return;
106
107 if (!resourceURLs->contains(url))
108 resourceURLs->append(url);
109 }
110
111 void retrieveResourcesForFrame(LocalFrame* frame,
112 const WebVector<WebCString>& supportedSchemes,
113 Vector<LocalFrame*>* visitedFrames,
114 Vector<LocalFrame*>* framesToVisit,
115 Vector<KURL>* frameURLs,
116 Vector<KURL>* resourceURLs)
117 {
118 KURL frameURL = frame->loader().documentLoader()->request().url();
119
120 // If the frame's URL is invalid, ignore it, it is not retrievable.
121 if (!frameURL.isValid())
122 return;
123
124 // Ignore frames from unsupported schemes.
125 bool isValidScheme = false;
126 for (size_t i = 0; i < supportedSchemes.size(); ++i) {
127 if (frameURL.protocolIs(static_cast<CString>(supportedSchemes[i]).data() )) {
128 isValidScheme = true;
129 break;
130 }
131 }
132 if (!isValidScheme)
133 return;
134
135 // If we have already seen that frame, ignore it.
136 if (visitedFrames->contains(frame))
137 return;
138 visitedFrames->append(frame);
139 if (!frameURLs->contains(frameURL))
140 frameURLs->append(frameURL);
141
142 // Now get the resources associated with each node of the document.
143 RefPtrWillBeRawPtr<HTMLAllCollection> allElements = frame->document()->all() ;
144 for (unsigned i = 0; i < allElements->length(); ++i) {
145 Element* element = allElements->item(i);
146 retrieveResourcesForElement(element,
147 visitedFrames, framesToVisit,
148 frameURLs, resourceURLs);
149 }
150 }
151
152 class MHTMLPageSerializerDelegate final : public PageSerializer::Delegate { 62 class MHTMLPageSerializerDelegate final : public PageSerializer::Delegate {
153 public: 63 public:
154 ~MHTMLPageSerializerDelegate() override; 64 ~MHTMLPageSerializerDelegate() override;
155 bool shouldIgnoreAttribute(const Attribute&) override; 65 bool shouldIgnoreAttribute(const Attribute&) override;
156 }; 66 };
157 67
158
159 MHTMLPageSerializerDelegate::~MHTMLPageSerializerDelegate() 68 MHTMLPageSerializerDelegate::~MHTMLPageSerializerDelegate()
160 { 69 {
161 } 70 }
162 71
163 bool MHTMLPageSerializerDelegate::shouldIgnoreAttribute(const Attribute& attribu te) 72 bool MHTMLPageSerializerDelegate::shouldIgnoreAttribute(const Attribute& attribu te)
164 { 73 {
165 // TODO(fgorski): Presence of srcset attribute causes MHTML to not display images, as only the value of src 74 // TODO(fgorski): Presence of srcset attribute causes MHTML to not display images, as only the value of src
166 // is pulled into the archive. Discarding srcset prevents the problem. Long term we should make sure MHTML 75 // is pulled into the archive. Discarding srcset prevents the problem. Long term we should make sure MHTML
167 // plays nicely with srcset. 76 // plays nicely with srcset.
168 return attribute.localName() == HTMLNames::srcsetAttr; 77 return attribute.localName() == HTMLNames::srcsetAttr;
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
205 return WebCString(mhtml->data(), mhtml->size()); 114 return WebCString(mhtml->data(), mhtml->size());
206 } 115 }
207 116
208 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view) 117 WebCString WebPageSerializer::serializeToMHTMLUsingBinaryEncoding(WebView* view)
209 { 118 {
210 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page( ), MHTMLArchive::UseBinaryEncoding); 119 RefPtr<SharedBuffer> mhtml = serializePageToMHTML(toWebViewImpl(view)->page( ), MHTMLArchive::UseBinaryEncoding);
211 // FIXME: we are copying all the data here. Ideally we would have a WebSharedData(). 120 // FIXME: we are copying all the data here. Ideally we would have a WebSharedData().
212 return WebCString(mhtml->data(), mhtml->size()); 121 return WebCString(mhtml->data(), mhtml->size());
213 } 122 }
214 123
215 bool WebPageSerializer::serialize(WebLocalFrame* frame, 124 bool WebPageSerializer::serialize(WebLocalFrame* frame, bool recursive, WebPageS erializerClient* client,
216 bool recursive, 125 const WebVector<WebURL>& links, const WebVector<WebString>& localPaths, cons t WebString& localDirectoryName)
217 WebPageSerializerClient* client,
218 const WebVector<WebURL>& links,
219 const WebVector<WebString>& localPaths,
220 const WebString& localDirectoryName)
221 { 126 {
222 WebPageSerializerImpl serializerImpl( 127 ASSERT(frame);
223 frame, recursive, client, links, localPaths, localDirectoryName); 128 ASSERT(client);
224 return serializerImpl.serialize(); 129 ASSERT(links.size() == localPaths.size());
225 }
226 130
227 bool WebPageSerializer::retrieveAllResources(WebView* view, 131 LinkLocalPathMap m_localLinks;
228 const WebVector<WebCString>& suppor tedSchemes,
229 WebVector<WebURL>* resourceURLs,
230 WebVector<WebURL>* frameURLs) {
231 WebLocalFrameImpl* mainFrame = toWebLocalFrameImpl(view->mainFrame());
232 if (!mainFrame)
233 return false;
234 132
235 Vector<LocalFrame*> framesToVisit; 133 for (size_t i = 0; i < links.size(); i++) {
yosin_UTC9 2015/06/15 09:38:38 We can use range-for on |WebVector|.
Tiger (Sony Mobile) 2015/06/15 12:00:25 The index is used both for links and localPaths to
236 Vector<LocalFrame*> visitedFrames; 134 KURL url = links[i];
237 Vector<KURL> frameKURLs; 135 ASSERT(!m_localLinks.contains(url.string()));
238 Vector<KURL> resourceKURLs; 136 m_localLinks.set(url.string(), localPaths[i]);
239
240 // Let's retrieve the resources from every frame in this page.
241 framesToVisit.append(mainFrame->frame());
242 while (!framesToVisit.isEmpty()) {
243 LocalFrame* frame = framesToVisit[0];
244 framesToVisit.remove(0);
245 retrieveResourcesForFrame(frame, supportedSchemes,
246 &visitedFrames, &framesToVisit,
247 &frameKURLs, &resourceKURLs);
248 } 137 }
249 138
250 // Converts the results to WebURLs. 139 Vector<SerializedResource> resources;
251 WebVector<WebURL> resultResourceURLs(resourceKURLs.size()); 140 PageSerializer serializer(&resources, nullptr, &m_localLinks, localDirectory Name);
252 for (size_t i = 0; i < resourceKURLs.size(); ++i) { 141 serializer.serialize(toWebViewImpl(frame->view())->page());
253 resultResourceURLs[i] = resourceKURLs[i]; 142
254 // A frame's src can point to the same URL as another resource, keep the 143 for (Vector<SerializedResource>::const_iterator iter = resources.begin(); it er != resources.end(); ++iter) {
yosin_UTC9 2015/06/15 09:38:38 We can use range-for on |Vector|.
Tiger (Sony Mobile) 2015/06/15 12:00:25 Done.
255 // resource URL only in such cases. 144 client->didSerializeDataForFrame(iter->url, WebCString(iter->data->data( ), iter->data->size()), WebPageSerializerClient::CurrentFrameIsFinished);
256 size_t index = frameKURLs.find(resourceKURLs[i]);
257 if (index != kNotFound)
258 frameKURLs.remove(index);
259 } 145 }
260 *resourceURLs = resultResourceURLs; 146 client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSerialize rClient::AllFramesAreFinished);
261 WebVector<WebURL> resultFrameURLs(frameKURLs.size());
262 for (size_t i = 0; i < frameKURLs.size(); ++i)
263 resultFrameURLs[i] = frameKURLs[i];
264 *frameURLs = resultFrameURLs;
265
266 return true; 147 return true;
267 } 148 }
268 149
269 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& cha rset) 150 WebString WebPageSerializer::generateMetaCharsetDeclaration(const WebString& cha rset)
270 { 151 {
271 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/htm l; charset=" + static_cast<const String&>(charset) + "\">"; 152 String charsetString = "<meta http-equiv=\"Content-Type\" content=\"text/htm l; charset=" + static_cast<const String&>(charset) + "\">";
272 return charsetString; 153 return charsetString;
273 } 154 }
274 155
275 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url) 156 WebString WebPageSerializer::generateMarkOfTheWebDeclaration(const WebURL& url)
276 { 157 {
277 return String::format("\n<!-- saved from url=(%04d)%s -->\n", 158 return String::format("\n<!-- saved from url=(%04d)%s -->\n",
278 static_cast<int>(url.spec().length()), 159 static_cast<int>(url.spec().length()), url.spec().data());
279 url.spec().data());
280 } 160 }
281 161
282 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTar get) 162 WebString WebPageSerializer::generateBaseTagDeclaration(const WebString& baseTar get)
283 { 163 {
284 if (baseTarget.isEmpty()) 164 if (baseTarget.isEmpty())
285 return String("<base href=\".\">"); 165 return String("<base href=\".\">");
286 String baseString = "<base href=\".\" target=\"" + static_cast<const String& >(baseTarget) + "\">"; 166 String baseString = "<base href=\".\" target=\"" + static_cast<const String& >(baseTarget) + "\">";
287 return baseString; 167 return baseString;
288 } 168 }
289 169
290 } // namespace blink 170 } // namespace blink
OLDNEW
« no previous file with comments | « no previous file | Source/web/WebPageSerializerImpl.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698