Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(53)

Side by Side Diff: third_party/WebKit/Source/core/page/PageSerializer.cpp

Issue 1541463002: Rename [Web]PageSerializer[Test|Client|Impl] to ...FrameSerializer... (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@mhtml-deduplication-of-resources
Patch Set: Rebasing... Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (C) 2011 Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "core/page/PageSerializer.h"
32
33 #include "core/HTMLNames.h"
34 #include "core/InputTypeNames.h"
35 #include "core/css/CSSFontFaceRule.h"
36 #include "core/css/CSSFontFaceSrcValue.h"
37 #include "core/css/CSSImageValue.h"
38 #include "core/css/CSSImportRule.h"
39 #include "core/css/CSSRuleList.h"
40 #include "core/css/CSSStyleDeclaration.h"
41 #include "core/css/CSSStyleRule.h"
42 #include "core/css/CSSValueList.h"
43 #include "core/css/StylePropertySet.h"
44 #include "core/css/StyleRule.h"
45 #include "core/css/StyleSheetContents.h"
46 #include "core/dom/Document.h"
47 #include "core/dom/Element.h"
48 #include "core/dom/Text.h"
49 #include "core/editing/serializers/MarkupAccumulator.h"
50 #include "core/fetch/FontResource.h"
51 #include "core/fetch/ImageResource.h"
52 #include "core/frame/LocalFrame.h"
53 #include "core/html/HTMLFrameElementBase.h"
54 #include "core/html/HTMLImageElement.h"
55 #include "core/html/HTMLInputElement.h"
56 #include "core/html/HTMLLinkElement.h"
57 #include "core/html/HTMLMetaElement.h"
58 #include "core/html/HTMLStyleElement.h"
59 #include "core/html/ImageDocument.h"
60 #include "core/page/Page.h"
61 #include "core/style/StyleFetchedImage.h"
62 #include "core/style/StyleImage.h"
63 #include "platform/SerializedResource.h"
64 #include "platform/graphics/Image.h"
65 #include "platform/heap/Handle.h"
66 #include "wtf/HashSet.h"
67 #include "wtf/OwnPtr.h"
68 #include "wtf/text/CString.h"
69 #include "wtf/text/StringBuilder.h"
70 #include "wtf/text/TextEncoding.h"
71 #include "wtf/text/WTFString.h"
72
73 namespace blink {
74
75 static bool shouldIgnoreElement(const Element& element)
76 {
77 if (isHTMLScriptElement(element))
78 return true;
79 if (isHTMLNoScriptElement(element))
80 return true;
81 return isHTMLMetaElement(element) && toHTMLMetaElement(element).computeEncod ing().isValid();
82 }
83
84 class SerializerMarkupAccumulator : public MarkupAccumulator {
85 STACK_ALLOCATED();
86 public:
87 SerializerMarkupAccumulator(PageSerializer::Delegate&, const Document&, Will BeHeapVector<RawPtrWillBeMember<Node>>&);
88 ~SerializerMarkupAccumulator() override;
89
90 protected:
91 void appendText(StringBuilder& out, Text&) override;
92 bool shouldIgnoreAttribute(const Attribute&) override;
93 void appendElement(StringBuilder& out, Element&, Namespaces*) override;
94 void appendAttribute(StringBuilder& out, const Element&, const Attribute&, N amespaces*) override;
95 void appendStartTag(Node&, Namespaces* = nullptr) override;
96 void appendEndTag(const Element&) override;
97
98 private:
99 void appendAttributeValue(StringBuilder& out, const String& attributeValue);
100 void appendRewrittenAttribute(
101 StringBuilder& out,
102 const Element&,
103 const String& attributeName,
104 const String& attributeValue);
105
106 PageSerializer::Delegate& m_delegate;
107 RawPtrWillBeMember<const Document> m_document;
108
109 // FIXME: |PageSerializer| uses |m_nodes| for collecting nodes in document
110 // included into serialized text then extracts image, object, etc. The size
111 // of this vector isn't small for large document. It is better to use
112 // callback like functionality.
113 WillBeHeapVector<RawPtrWillBeMember<Node>>& m_nodes;
114
115 // Elements with links rewritten via appendAttribute method.
116 WillBeHeapHashSet<RawPtrWillBeMember<const Element>> m_elementsWithRewritten Links;
117 };
118
119 SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer::Delegat e& delegate, const Document& document, WillBeHeapVector<RawPtrWillBeMember<Node> >& nodes)
120 : MarkupAccumulator(ResolveAllURLs)
121 , m_delegate(delegate)
122 , m_document(&document)
123 , m_nodes(nodes)
124 {
125 }
126
127 SerializerMarkupAccumulator::~SerializerMarkupAccumulator()
128 {
129 }
130
131 void SerializerMarkupAccumulator::appendText(StringBuilder& result, Text& text)
132 {
133 Element* parent = text.parentElement();
134 if (parent && !shouldIgnoreElement(*parent))
135 MarkupAccumulator::appendText(result, text);
136 }
137
138 bool SerializerMarkupAccumulator::shouldIgnoreAttribute(const Attribute& attribu te)
139 {
140 return m_delegate.shouldIgnoreAttribute(attribute);
141 }
142
143 void SerializerMarkupAccumulator::appendElement(StringBuilder& result, Element& element, Namespaces* namespaces)
144 {
145 if (!shouldIgnoreElement(element))
146 MarkupAccumulator::appendElement(result, element, namespaces);
147
148 // TODO(tiger): Refactor MarkupAccumulator so it is easier to append an elem ent like this, without special cases for XHTML
149 if (isHTMLHeadElement(element)) {
150 result.appendLiteral("<meta http-equiv=\"Content-Type\" content=\"");
151 appendAttributeValue(result, m_document->suggestedMIMEType());
152 result.appendLiteral("; charset=");
153 appendAttributeValue(result, m_document->characterSet());
154 if (m_document->isXHTMLDocument())
155 result.appendLiteral("\" />");
156 else
157 result.appendLiteral("\">");
158 }
159
160 // FIXME: For object (plugins) tags and video tag we could replace them by a n image of their current contents.
161 }
162
163 void SerializerMarkupAccumulator::appendAttribute(
164 StringBuilder& out,
165 const Element& element,
166 const Attribute& attribute,
167 Namespaces* namespaces)
168 {
169 // Check if link rewriting can affect the attribute.
170 bool isLinkAttribute = element.hasLegalLinkAttribute(attribute.name());
171 bool isSrcDocAttribute = isHTMLFrameElementBase(element)
172 && attribute.name() == HTMLNames::srcdocAttr;
173 if (isLinkAttribute || isSrcDocAttribute) {
174 // Check if the delegate wants to do link rewriting for the element.
175 String newLinkForTheElement;
176 if (m_delegate.rewriteLink(element, newLinkForTheElement)) {
177 if (isLinkAttribute) {
178 // Rewrite element links.
179 appendRewrittenAttribute(
180 out, element, attribute.name().toString(), newLinkForTheElem ent);
181 } else {
182 ASSERT(isSrcDocAttribute);
183 // Emit src instead of srcdoc attribute for frame elements - we want the
184 // serialized subframe to use html contents from the link provid ed by
185 // Delegate::rewriteLink rather than html contents from srcdoc
186 // attribute.
187 appendRewrittenAttribute(
188 out, element, HTMLNames::srcAttr.localName(), newLinkForTheE lement);
189 }
190 return;
191 }
192 }
193
194 // Fallback to appending the original attribute.
195 MarkupAccumulator::appendAttribute(out, element, attribute, namespaces);
196 }
197
198 void SerializerMarkupAccumulator::appendStartTag(Node& node, Namespaces* namespa ces)
199 {
200 MarkupAccumulator::appendStartTag(node, namespaces);
201 m_nodes.append(&node);
202 }
203
204 void SerializerMarkupAccumulator::appendEndTag(const Element& element)
205 {
206 if (!shouldIgnoreElement(element))
207 MarkupAccumulator::appendEndTag(element);
208 }
209
210 void SerializerMarkupAccumulator::appendAttributeValue(
211 StringBuilder& out,
212 const String& attributeValue)
213 {
214 MarkupFormatter::appendAttributeValue(out, attributeValue, m_document->isHTM LDocument());
215 }
216
217 void SerializerMarkupAccumulator::appendRewrittenAttribute(
218 StringBuilder& out,
219 const Element& element,
220 const String& attributeName,
221 const String& attributeValue)
222 {
223 if (m_elementsWithRewrittenLinks.contains(&element))
224 return;
225 m_elementsWithRewrittenLinks.add(&element);
226
227 // Append the rewritten attribute.
228 // TODO(tiger): Refactor MarkupAccumulator so it is easier to append an attr ibute like this.
229 out.append(' ');
230 out.append(attributeName);
231 out.appendLiteral("=\"");
232 appendAttributeValue(out, attributeValue);
233 out.appendLiteral("\"");
234 }
235
236 // TODO(tiger): Right now there is no support for rewriting URLs inside CSS
237 // documents which leads to bugs like <https://crbug.com/251898>. Not being
238 // able to rewrite URLs inside CSS documents means that resources imported from
239 // url(...) statements in CSS might not work when rewriting links for the
240 // "Webpage, Complete" method of saving a page. It will take some work but it
241 // needs to be done if we want to continue to support non-MHTML saved pages.
242
243 PageSerializer::PageSerializer(
244 Vector<SerializedResource>& resources,
245 Delegate& delegate)
246 : m_resources(&resources)
247 , m_delegate(delegate)
248 {
249 }
250
251 void PageSerializer::serializeFrame(const LocalFrame& frame)
252 {
253 ASSERT(frame.document());
254 Document& document = *frame.document();
255 KURL url = document.url();
256
257 // If frame is an image document, add the image and don't continue
258 if (document.isImageDocument()) {
259 ImageDocument& imageDocument = toImageDocument(document);
260 addImageToResources(imageDocument.cachedImage(), url);
261 return;
262 }
263
264 WillBeHeapVector<RawPtrWillBeMember<Node>> serializedNodes;
265 SerializerMarkupAccumulator accumulator(m_delegate, document, serializedNode s);
266 String text = serializeNodes<EditingStrategy>(accumulator, document, Include Node);
267
268 CString frameHTML = document.encoding().encode(text, WTF::EntitiesForUnencod ables);
269 m_resources->append(SerializedResource(url, document.suggestedMIMEType(), Sh aredBuffer::create(frameHTML.data(), frameHTML.length())));
270
271 for (Node* node: serializedNodes) {
272 ASSERT(node);
273 if (!node->isElementNode())
274 continue;
275
276 Element& element = toElement(*node);
277 // We have to process in-line style as it might contain some resources ( typically background images).
278 if (element.isStyledElement()) {
279 retrieveResourcesForProperties(element.inlineStyle(), document);
280 retrieveResourcesForProperties(element.presentationAttributeStyle(), document);
281 }
282
283 if (isHTMLImageElement(element)) {
284 HTMLImageElement& imageElement = toHTMLImageElement(element);
285 KURL url = document.completeURL(imageElement.getAttribute(HTMLNames: :srcAttr));
286 ImageResource* cachedImage = imageElement.cachedImage();
287 addImageToResources(cachedImage, url);
288 } else if (isHTMLInputElement(element)) {
289 HTMLInputElement& inputElement = toHTMLInputElement(element);
290 if (inputElement.type() == InputTypeNames::image && inputElement.ima geLoader()) {
291 KURL url = inputElement.src();
292 ImageResource* cachedImage = inputElement.imageLoader()->image() ;
293 addImageToResources(cachedImage, url);
294 }
295 } else if (isHTMLLinkElement(element)) {
296 HTMLLinkElement& linkElement = toHTMLLinkElement(element);
297 if (CSSStyleSheet* sheet = linkElement.sheet()) {
298 KURL url = document.completeURL(linkElement.getAttribute(HTMLNam es::hrefAttr));
299 serializeCSSStyleSheet(*sheet, url);
300 }
301 } else if (isHTMLStyleElement(element)) {
302 HTMLStyleElement& styleElement = toHTMLStyleElement(element);
303 if (CSSStyleSheet* sheet = styleElement.sheet())
304 serializeCSSStyleSheet(*sheet, KURL());
305 }
306 }
307 }
308
309 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet& styleSheet, const KUR L& url)
310 {
311 StringBuilder cssText;
312 cssText.appendLiteral("@charset \"");
313 cssText.append(styleSheet.contents()->charset().lower());
314 cssText.appendLiteral("\";\n\n");
315
316 for (unsigned i = 0; i < styleSheet.length(); ++i) {
317 CSSRule* rule = styleSheet.item(i);
318 String itemText = rule->cssText();
319 if (!itemText.isEmpty()) {
320 cssText.append(itemText);
321 if (i < styleSheet.length() - 1)
322 cssText.appendLiteral("\n\n");
323 }
324
325 // Some rules have resources associated with them that we need to retrie ve.
326 serializeCSSRule(rule);
327 }
328
329 if (shouldAddURL(url)) {
330 WTF::TextEncoding textEncoding(styleSheet.contents()->charset());
331 ASSERT(textEncoding.isValid());
332 String textString = cssText.toString();
333 CString text = textEncoding.encode(textString, WTF::EntitiesForUnencodab les);
334 m_resources->append(SerializedResource(url, String("text/css"), SharedBu ffer::create(text.data(), text.length())));
335 m_resourceURLs.add(url);
336 }
337 }
338
339 void PageSerializer::serializeCSSRule(CSSRule* rule)
340 {
341 ASSERT(rule->parentStyleSheet()->ownerDocument());
342 Document& document = *rule->parentStyleSheet()->ownerDocument();
343
344 switch (rule->type()) {
345 case CSSRule::STYLE_RULE:
346 retrieveResourcesForProperties(&toCSSStyleRule(rule)->styleRule()->prope rties(), document);
347 break;
348
349 case CSSRule::IMPORT_RULE: {
350 CSSImportRule* importRule = toCSSImportRule(rule);
351 KURL sheetBaseURL = rule->parentStyleSheet()->baseURL();
352 ASSERT(sheetBaseURL.isValid());
353 KURL importURL = KURL(sheetBaseURL, importRule->href());
354 if (m_resourceURLs.contains(importURL))
355 break;
356 if (importRule->styleSheet())
357 serializeCSSStyleSheet(*importRule->styleSheet(), importURL);
358 break;
359 }
360
361 // Rules inheriting CSSGroupingRule
362 case CSSRule::MEDIA_RULE:
363 case CSSRule::SUPPORTS_RULE: {
364 CSSRuleList* ruleList = rule->cssRules();
365 for (unsigned i = 0; i < ruleList->length(); ++i)
366 serializeCSSRule(ruleList->item(i));
367 break;
368 }
369
370 case CSSRule::FONT_FACE_RULE:
371 retrieveResourcesForProperties(&toCSSFontFaceRule(rule)->styleRule()->pr operties(), document);
372 break;
373
374 // Rules in which no external resources can be referenced
375 case CSSRule::CHARSET_RULE:
376 case CSSRule::PAGE_RULE:
377 case CSSRule::KEYFRAMES_RULE:
378 case CSSRule::KEYFRAME_RULE:
379 case CSSRule::VIEWPORT_RULE:
380 break;
381
382 default:
383 ASSERT_NOT_REACHED();
384 }
385 }
386
387 bool PageSerializer::shouldAddURL(const KURL& url)
388 {
389 return url.isValid() && !m_resourceURLs.contains(url) && !url.protocolIsData ()
390 && !m_delegate.shouldSkipResource(url);
391 }
392
393 void PageSerializer::addToResources(Resource* resource, PassRefPtr<SharedBuffer> data, const KURL& url)
394 {
395 if (!data) {
396 WTF_LOG_ERROR("No data for resource %s", url.string().utf8().data());
397 return;
398 }
399
400 String mimeType = resource->response().mimeType();
401 m_resources->append(SerializedResource(url, mimeType, data));
402 m_resourceURLs.add(url);
403 }
404
405 void PageSerializer::addImageToResources(ImageResource* image, const KURL& url)
406 {
407 if (!shouldAddURL(url))
408 return;
409
410 if (!image || !image->hasImage() || image->errorOccurred())
411 return;
412
413 RefPtr<SharedBuffer> data = image->image()->data();
414 addToResources(image, data, url);
415 }
416
417 void PageSerializer::addFontToResources(FontResource* font)
418 {
419 if (!font || !shouldAddURL(font->url()) || !font->isLoaded() || !font->resou rceBuffer())
420 return;
421
422 RefPtr<SharedBuffer> data(font->resourceBuffer());
423
424 addToResources(font, data, font->url());
425 }
426
427 void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styl eDeclaration, Document& document)
428 {
429 if (!styleDeclaration)
430 return;
431
432 // The background-image and list-style-image (for ul or ol) are the CSS prop erties
433 // that make use of images. We iterate to make sure we include any other
434 // image properties there might be.
435 unsigned propertyCount = styleDeclaration->propertyCount();
436 for (unsigned i = 0; i < propertyCount; ++i) {
437 RefPtrWillBeRawPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i). value();
438 retrieveResourcesForCSSValue(cssValue.get(), document);
439 }
440 }
441
442 void PageSerializer::retrieveResourcesForCSSValue(CSSValue* cssValue, Document& document)
443 {
444 if (cssValue->isImageValue()) {
445 CSSImageValue* imageValue = toCSSImageValue(cssValue);
446 if (imageValue->isCachePending())
447 return;
448 StyleImage* styleImage = imageValue->cachedImage();
449 if (!styleImage || !styleImage->isImageResource())
450 return;
451
452 addImageToResources(styleImage->cachedImage(), styleImage->cachedImage() ->url());
453 } else if (cssValue->isFontFaceSrcValue()) {
454 CSSFontFaceSrcValue* fontFaceSrcValue = toCSSFontFaceSrcValue(cssValue);
455 if (fontFaceSrcValue->isLocal()) {
456 return;
457 }
458
459 addFontToResources(fontFaceSrcValue->fetch(&document));
460 } else if (cssValue->isValueList()) {
461 CSSValueList* cssValueList = toCSSValueList(cssValue);
462 for (unsigned i = 0; i < cssValueList->length(); i++)
463 retrieveResourcesForCSSValue(cssValueList->item(i), document);
464 }
465 }
466
467 // Returns MOTW (Mark of the Web) declaration before html tag which is in
468 // HTML comment, e.g. "<!-- saved from url=(%04d)%s -->"
469 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx.
470 String PageSerializer::markOfTheWebDeclaration(const KURL& url)
471 {
472 StringBuilder builder;
473 bool emitsMinus = false;
474 CString orignalUrl = url.string().ascii();
475 for (const char* string = orignalUrl.data(); *string; ++string) {
476 const char ch = *string;
477 if (ch == '-' && emitsMinus) {
478 builder.append("%2D");
479 emitsMinus = false;
480 continue;
481 }
482 emitsMinus = ch == '-';
483 builder.append(ch);
484 }
485 CString escapedUrl = builder.toString().ascii();
486 return String::format("saved from url=(%04d)%s", static_cast<int>(escapedUrl .length()), escapedUrl.data());
487 }
488
489 } // namespace blink
OLDNEW
« no previous file with comments | « third_party/WebKit/Source/core/page/PageSerializer.h ('k') | third_party/WebKit/Source/web/WebFrameSerializer.cpp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698