OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (C) 2011 Google Inc. All rights reserved. | |
3 * | |
4 * Redistribution and use in source and binary forms, with or without | |
5 * modification, are permitted provided that the following conditions are | |
6 * met: | |
7 * | |
8 * * Redistributions of source code must retain the above copyright | |
9 * notice, this list of conditions and the following disclaimer. | |
10 * * Redistributions in binary form must reproduce the above | |
11 * copyright notice, this list of conditions and the following disclaimer | |
12 * in the documentation and/or other materials provided with the | |
13 * distribution. | |
14 * * Neither the name of Google Inc. nor the names of its | |
15 * contributors may be used to endorse or promote products derived from | |
16 * this software without specific prior written permission. | |
17 * | |
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 #include "core/page/PageSerializer.h" | |
32 | |
33 #include "core/HTMLNames.h" | |
34 #include "core/InputTypeNames.h" | |
35 #include "core/css/CSSFontFaceRule.h" | |
36 #include "core/css/CSSFontFaceSrcValue.h" | |
37 #include "core/css/CSSImageValue.h" | |
38 #include "core/css/CSSImportRule.h" | |
39 #include "core/css/CSSRuleList.h" | |
40 #include "core/css/CSSStyleDeclaration.h" | |
41 #include "core/css/CSSStyleRule.h" | |
42 #include "core/css/CSSValueList.h" | |
43 #include "core/css/StylePropertySet.h" | |
44 #include "core/css/StyleRule.h" | |
45 #include "core/css/StyleSheetContents.h" | |
46 #include "core/dom/Document.h" | |
47 #include "core/dom/Element.h" | |
48 #include "core/dom/Text.h" | |
49 #include "core/editing/serializers/MarkupAccumulator.h" | |
50 #include "core/fetch/FontResource.h" | |
51 #include "core/fetch/ImageResource.h" | |
52 #include "core/frame/LocalFrame.h" | |
53 #include "core/html/HTMLFrameElementBase.h" | |
54 #include "core/html/HTMLImageElement.h" | |
55 #include "core/html/HTMLInputElement.h" | |
56 #include "core/html/HTMLLinkElement.h" | |
57 #include "core/html/HTMLMetaElement.h" | |
58 #include "core/html/HTMLStyleElement.h" | |
59 #include "core/html/ImageDocument.h" | |
60 #include "core/page/Page.h" | |
61 #include "core/style/StyleFetchedImage.h" | |
62 #include "core/style/StyleImage.h" | |
63 #include "platform/SerializedResource.h" | |
64 #include "platform/graphics/Image.h" | |
65 #include "platform/heap/Handle.h" | |
66 #include "wtf/HashSet.h" | |
67 #include "wtf/OwnPtr.h" | |
68 #include "wtf/text/CString.h" | |
69 #include "wtf/text/StringBuilder.h" | |
70 #include "wtf/text/TextEncoding.h" | |
71 #include "wtf/text/WTFString.h" | |
72 | |
73 namespace blink { | |
74 | |
75 static bool shouldIgnoreElement(const Element& element) | |
76 { | |
77 if (isHTMLScriptElement(element)) | |
78 return true; | |
79 if (isHTMLNoScriptElement(element)) | |
80 return true; | |
81 return isHTMLMetaElement(element) && toHTMLMetaElement(element).computeEncod
ing().isValid(); | |
82 } | |
83 | |
84 class SerializerMarkupAccumulator : public MarkupAccumulator { | |
85 STACK_ALLOCATED(); | |
86 public: | |
87 SerializerMarkupAccumulator(PageSerializer::Delegate&, const Document&, Will
BeHeapVector<RawPtrWillBeMember<Node>>&); | |
88 ~SerializerMarkupAccumulator() override; | |
89 | |
90 protected: | |
91 void appendText(StringBuilder& out, Text&) override; | |
92 bool shouldIgnoreAttribute(const Attribute&) override; | |
93 void appendElement(StringBuilder& out, Element&, Namespaces*) override; | |
94 void appendAttribute(StringBuilder& out, const Element&, const Attribute&, N
amespaces*) override; | |
95 void appendStartTag(Node&, Namespaces* = nullptr) override; | |
96 void appendEndTag(const Element&) override; | |
97 | |
98 private: | |
99 void appendAttributeValue(StringBuilder& out, const String& attributeValue); | |
100 void appendRewrittenAttribute( | |
101 StringBuilder& out, | |
102 const Element&, | |
103 const String& attributeName, | |
104 const String& attributeValue); | |
105 | |
106 PageSerializer::Delegate& m_delegate; | |
107 RawPtrWillBeMember<const Document> m_document; | |
108 | |
109 // FIXME: |PageSerializer| uses |m_nodes| for collecting nodes in document | |
110 // included into serialized text then extracts image, object, etc. The size | |
111 // of this vector isn't small for large document. It is better to use | |
112 // callback like functionality. | |
113 WillBeHeapVector<RawPtrWillBeMember<Node>>& m_nodes; | |
114 | |
115 // Elements with links rewritten via appendAttribute method. | |
116 WillBeHeapHashSet<RawPtrWillBeMember<const Element>> m_elementsWithRewritten
Links; | |
117 }; | |
118 | |
119 SerializerMarkupAccumulator::SerializerMarkupAccumulator(PageSerializer::Delegat
e& delegate, const Document& document, WillBeHeapVector<RawPtrWillBeMember<Node>
>& nodes) | |
120 : MarkupAccumulator(ResolveAllURLs) | |
121 , m_delegate(delegate) | |
122 , m_document(&document) | |
123 , m_nodes(nodes) | |
124 { | |
125 } | |
126 | |
127 SerializerMarkupAccumulator::~SerializerMarkupAccumulator() | |
128 { | |
129 } | |
130 | |
131 void SerializerMarkupAccumulator::appendText(StringBuilder& result, Text& text) | |
132 { | |
133 Element* parent = text.parentElement(); | |
134 if (parent && !shouldIgnoreElement(*parent)) | |
135 MarkupAccumulator::appendText(result, text); | |
136 } | |
137 | |
138 bool SerializerMarkupAccumulator::shouldIgnoreAttribute(const Attribute& attribu
te) | |
139 { | |
140 return m_delegate.shouldIgnoreAttribute(attribute); | |
141 } | |
142 | |
143 void SerializerMarkupAccumulator::appendElement(StringBuilder& result, Element&
element, Namespaces* namespaces) | |
144 { | |
145 if (!shouldIgnoreElement(element)) | |
146 MarkupAccumulator::appendElement(result, element, namespaces); | |
147 | |
148 // TODO(tiger): Refactor MarkupAccumulator so it is easier to append an elem
ent like this, without special cases for XHTML | |
149 if (isHTMLHeadElement(element)) { | |
150 result.appendLiteral("<meta http-equiv=\"Content-Type\" content=\""); | |
151 appendAttributeValue(result, m_document->suggestedMIMEType()); | |
152 result.appendLiteral("; charset="); | |
153 appendAttributeValue(result, m_document->characterSet()); | |
154 if (m_document->isXHTMLDocument()) | |
155 result.appendLiteral("\" />"); | |
156 else | |
157 result.appendLiteral("\">"); | |
158 } | |
159 | |
160 // FIXME: For object (plugins) tags and video tag we could replace them by a
n image of their current contents. | |
161 } | |
162 | |
163 void SerializerMarkupAccumulator::appendAttribute( | |
164 StringBuilder& out, | |
165 const Element& element, | |
166 const Attribute& attribute, | |
167 Namespaces* namespaces) | |
168 { | |
169 // Check if link rewriting can affect the attribute. | |
170 bool isLinkAttribute = element.hasLegalLinkAttribute(attribute.name()); | |
171 bool isSrcDocAttribute = isHTMLFrameElementBase(element) | |
172 && attribute.name() == HTMLNames::srcdocAttr; | |
173 if (isLinkAttribute || isSrcDocAttribute) { | |
174 // Check if the delegate wants to do link rewriting for the element. | |
175 String newLinkForTheElement; | |
176 if (m_delegate.rewriteLink(element, newLinkForTheElement)) { | |
177 if (isLinkAttribute) { | |
178 // Rewrite element links. | |
179 appendRewrittenAttribute( | |
180 out, element, attribute.name().toString(), newLinkForTheElem
ent); | |
181 } else { | |
182 ASSERT(isSrcDocAttribute); | |
183 // Emit src instead of srcdoc attribute for frame elements - we
want the | |
184 // serialized subframe to use html contents from the link provid
ed by | |
185 // Delegate::rewriteLink rather than html contents from srcdoc | |
186 // attribute. | |
187 appendRewrittenAttribute( | |
188 out, element, HTMLNames::srcAttr.localName(), newLinkForTheE
lement); | |
189 } | |
190 return; | |
191 } | |
192 } | |
193 | |
194 // Fallback to appending the original attribute. | |
195 MarkupAccumulator::appendAttribute(out, element, attribute, namespaces); | |
196 } | |
197 | |
198 void SerializerMarkupAccumulator::appendStartTag(Node& node, Namespaces* namespa
ces) | |
199 { | |
200 MarkupAccumulator::appendStartTag(node, namespaces); | |
201 m_nodes.append(&node); | |
202 } | |
203 | |
204 void SerializerMarkupAccumulator::appendEndTag(const Element& element) | |
205 { | |
206 if (!shouldIgnoreElement(element)) | |
207 MarkupAccumulator::appendEndTag(element); | |
208 } | |
209 | |
210 void SerializerMarkupAccumulator::appendAttributeValue( | |
211 StringBuilder& out, | |
212 const String& attributeValue) | |
213 { | |
214 MarkupFormatter::appendAttributeValue(out, attributeValue, m_document->isHTM
LDocument()); | |
215 } | |
216 | |
217 void SerializerMarkupAccumulator::appendRewrittenAttribute( | |
218 StringBuilder& out, | |
219 const Element& element, | |
220 const String& attributeName, | |
221 const String& attributeValue) | |
222 { | |
223 if (m_elementsWithRewrittenLinks.contains(&element)) | |
224 return; | |
225 m_elementsWithRewrittenLinks.add(&element); | |
226 | |
227 // Append the rewritten attribute. | |
228 // TODO(tiger): Refactor MarkupAccumulator so it is easier to append an attr
ibute like this. | |
229 out.append(' '); | |
230 out.append(attributeName); | |
231 out.appendLiteral("=\""); | |
232 appendAttributeValue(out, attributeValue); | |
233 out.appendLiteral("\""); | |
234 } | |
235 | |
236 // TODO(tiger): Right now there is no support for rewriting URLs inside CSS | |
237 // documents which leads to bugs like <https://crbug.com/251898>. Not being | |
238 // able to rewrite URLs inside CSS documents means that resources imported from | |
239 // url(...) statements in CSS might not work when rewriting links for the | |
240 // "Webpage, Complete" method of saving a page. It will take some work but it | |
241 // needs to be done if we want to continue to support non-MHTML saved pages. | |
242 | |
243 PageSerializer::PageSerializer( | |
244 Vector<SerializedResource>& resources, | |
245 Delegate& delegate) | |
246 : m_resources(&resources) | |
247 , m_delegate(delegate) | |
248 { | |
249 } | |
250 | |
251 void PageSerializer::serializeFrame(const LocalFrame& frame) | |
252 { | |
253 ASSERT(frame.document()); | |
254 Document& document = *frame.document(); | |
255 KURL url = document.url(); | |
256 | |
257 // If frame is an image document, add the image and don't continue | |
258 if (document.isImageDocument()) { | |
259 ImageDocument& imageDocument = toImageDocument(document); | |
260 addImageToResources(imageDocument.cachedImage(), url); | |
261 return; | |
262 } | |
263 | |
264 WillBeHeapVector<RawPtrWillBeMember<Node>> serializedNodes; | |
265 SerializerMarkupAccumulator accumulator(m_delegate, document, serializedNode
s); | |
266 String text = serializeNodes<EditingStrategy>(accumulator, document, Include
Node); | |
267 | |
268 CString frameHTML = document.encoding().encode(text, WTF::EntitiesForUnencod
ables); | |
269 m_resources->append(SerializedResource(url, document.suggestedMIMEType(), Sh
aredBuffer::create(frameHTML.data(), frameHTML.length()))); | |
270 | |
271 for (Node* node: serializedNodes) { | |
272 ASSERT(node); | |
273 if (!node->isElementNode()) | |
274 continue; | |
275 | |
276 Element& element = toElement(*node); | |
277 // We have to process in-line style as it might contain some resources (
typically background images). | |
278 if (element.isStyledElement()) { | |
279 retrieveResourcesForProperties(element.inlineStyle(), document); | |
280 retrieveResourcesForProperties(element.presentationAttributeStyle(),
document); | |
281 } | |
282 | |
283 if (isHTMLImageElement(element)) { | |
284 HTMLImageElement& imageElement = toHTMLImageElement(element); | |
285 KURL url = document.completeURL(imageElement.getAttribute(HTMLNames:
:srcAttr)); | |
286 ImageResource* cachedImage = imageElement.cachedImage(); | |
287 addImageToResources(cachedImage, url); | |
288 } else if (isHTMLInputElement(element)) { | |
289 HTMLInputElement& inputElement = toHTMLInputElement(element); | |
290 if (inputElement.type() == InputTypeNames::image && inputElement.ima
geLoader()) { | |
291 KURL url = inputElement.src(); | |
292 ImageResource* cachedImage = inputElement.imageLoader()->image()
; | |
293 addImageToResources(cachedImage, url); | |
294 } | |
295 } else if (isHTMLLinkElement(element)) { | |
296 HTMLLinkElement& linkElement = toHTMLLinkElement(element); | |
297 if (CSSStyleSheet* sheet = linkElement.sheet()) { | |
298 KURL url = document.completeURL(linkElement.getAttribute(HTMLNam
es::hrefAttr)); | |
299 serializeCSSStyleSheet(*sheet, url); | |
300 } | |
301 } else if (isHTMLStyleElement(element)) { | |
302 HTMLStyleElement& styleElement = toHTMLStyleElement(element); | |
303 if (CSSStyleSheet* sheet = styleElement.sheet()) | |
304 serializeCSSStyleSheet(*sheet, KURL()); | |
305 } | |
306 } | |
307 } | |
308 | |
309 void PageSerializer::serializeCSSStyleSheet(CSSStyleSheet& styleSheet, const KUR
L& url) | |
310 { | |
311 StringBuilder cssText; | |
312 cssText.appendLiteral("@charset \""); | |
313 cssText.append(styleSheet.contents()->charset().lower()); | |
314 cssText.appendLiteral("\";\n\n"); | |
315 | |
316 for (unsigned i = 0; i < styleSheet.length(); ++i) { | |
317 CSSRule* rule = styleSheet.item(i); | |
318 String itemText = rule->cssText(); | |
319 if (!itemText.isEmpty()) { | |
320 cssText.append(itemText); | |
321 if (i < styleSheet.length() - 1) | |
322 cssText.appendLiteral("\n\n"); | |
323 } | |
324 | |
325 // Some rules have resources associated with them that we need to retrie
ve. | |
326 serializeCSSRule(rule); | |
327 } | |
328 | |
329 if (shouldAddURL(url)) { | |
330 WTF::TextEncoding textEncoding(styleSheet.contents()->charset()); | |
331 ASSERT(textEncoding.isValid()); | |
332 String textString = cssText.toString(); | |
333 CString text = textEncoding.encode(textString, WTF::EntitiesForUnencodab
les); | |
334 m_resources->append(SerializedResource(url, String("text/css"), SharedBu
ffer::create(text.data(), text.length()))); | |
335 m_resourceURLs.add(url); | |
336 } | |
337 } | |
338 | |
339 void PageSerializer::serializeCSSRule(CSSRule* rule) | |
340 { | |
341 ASSERT(rule->parentStyleSheet()->ownerDocument()); | |
342 Document& document = *rule->parentStyleSheet()->ownerDocument(); | |
343 | |
344 switch (rule->type()) { | |
345 case CSSRule::STYLE_RULE: | |
346 retrieveResourcesForProperties(&toCSSStyleRule(rule)->styleRule()->prope
rties(), document); | |
347 break; | |
348 | |
349 case CSSRule::IMPORT_RULE: { | |
350 CSSImportRule* importRule = toCSSImportRule(rule); | |
351 KURL sheetBaseURL = rule->parentStyleSheet()->baseURL(); | |
352 ASSERT(sheetBaseURL.isValid()); | |
353 KURL importURL = KURL(sheetBaseURL, importRule->href()); | |
354 if (m_resourceURLs.contains(importURL)) | |
355 break; | |
356 if (importRule->styleSheet()) | |
357 serializeCSSStyleSheet(*importRule->styleSheet(), importURL); | |
358 break; | |
359 } | |
360 | |
361 // Rules inheriting CSSGroupingRule | |
362 case CSSRule::MEDIA_RULE: | |
363 case CSSRule::SUPPORTS_RULE: { | |
364 CSSRuleList* ruleList = rule->cssRules(); | |
365 for (unsigned i = 0; i < ruleList->length(); ++i) | |
366 serializeCSSRule(ruleList->item(i)); | |
367 break; | |
368 } | |
369 | |
370 case CSSRule::FONT_FACE_RULE: | |
371 retrieveResourcesForProperties(&toCSSFontFaceRule(rule)->styleRule()->pr
operties(), document); | |
372 break; | |
373 | |
374 // Rules in which no external resources can be referenced | |
375 case CSSRule::CHARSET_RULE: | |
376 case CSSRule::PAGE_RULE: | |
377 case CSSRule::KEYFRAMES_RULE: | |
378 case CSSRule::KEYFRAME_RULE: | |
379 case CSSRule::VIEWPORT_RULE: | |
380 break; | |
381 | |
382 default: | |
383 ASSERT_NOT_REACHED(); | |
384 } | |
385 } | |
386 | |
387 bool PageSerializer::shouldAddURL(const KURL& url) | |
388 { | |
389 return url.isValid() && !m_resourceURLs.contains(url) && !url.protocolIsData
() | |
390 && !m_delegate.shouldSkipResource(url); | |
391 } | |
392 | |
393 void PageSerializer::addToResources(Resource* resource, PassRefPtr<SharedBuffer>
data, const KURL& url) | |
394 { | |
395 if (!data) { | |
396 WTF_LOG_ERROR("No data for resource %s", url.string().utf8().data()); | |
397 return; | |
398 } | |
399 | |
400 String mimeType = resource->response().mimeType(); | |
401 m_resources->append(SerializedResource(url, mimeType, data)); | |
402 m_resourceURLs.add(url); | |
403 } | |
404 | |
405 void PageSerializer::addImageToResources(ImageResource* image, const KURL& url) | |
406 { | |
407 if (!shouldAddURL(url)) | |
408 return; | |
409 | |
410 if (!image || !image->hasImage() || image->errorOccurred()) | |
411 return; | |
412 | |
413 RefPtr<SharedBuffer> data = image->image()->data(); | |
414 addToResources(image, data, url); | |
415 } | |
416 | |
417 void PageSerializer::addFontToResources(FontResource* font) | |
418 { | |
419 if (!font || !shouldAddURL(font->url()) || !font->isLoaded() || !font->resou
rceBuffer()) | |
420 return; | |
421 | |
422 RefPtr<SharedBuffer> data(font->resourceBuffer()); | |
423 | |
424 addToResources(font, data, font->url()); | |
425 } | |
426 | |
427 void PageSerializer::retrieveResourcesForProperties(const StylePropertySet* styl
eDeclaration, Document& document) | |
428 { | |
429 if (!styleDeclaration) | |
430 return; | |
431 | |
432 // The background-image and list-style-image (for ul or ol) are the CSS prop
erties | |
433 // that make use of images. We iterate to make sure we include any other | |
434 // image properties there might be. | |
435 unsigned propertyCount = styleDeclaration->propertyCount(); | |
436 for (unsigned i = 0; i < propertyCount; ++i) { | |
437 RefPtrWillBeRawPtr<CSSValue> cssValue = styleDeclaration->propertyAt(i).
value(); | |
438 retrieveResourcesForCSSValue(cssValue.get(), document); | |
439 } | |
440 } | |
441 | |
442 void PageSerializer::retrieveResourcesForCSSValue(CSSValue* cssValue, Document&
document) | |
443 { | |
444 if (cssValue->isImageValue()) { | |
445 CSSImageValue* imageValue = toCSSImageValue(cssValue); | |
446 if (imageValue->isCachePending()) | |
447 return; | |
448 StyleImage* styleImage = imageValue->cachedImage(); | |
449 if (!styleImage || !styleImage->isImageResource()) | |
450 return; | |
451 | |
452 addImageToResources(styleImage->cachedImage(), styleImage->cachedImage()
->url()); | |
453 } else if (cssValue->isFontFaceSrcValue()) { | |
454 CSSFontFaceSrcValue* fontFaceSrcValue = toCSSFontFaceSrcValue(cssValue); | |
455 if (fontFaceSrcValue->isLocal()) { | |
456 return; | |
457 } | |
458 | |
459 addFontToResources(fontFaceSrcValue->fetch(&document)); | |
460 } else if (cssValue->isValueList()) { | |
461 CSSValueList* cssValueList = toCSSValueList(cssValue); | |
462 for (unsigned i = 0; i < cssValueList->length(); i++) | |
463 retrieveResourcesForCSSValue(cssValueList->item(i), document); | |
464 } | |
465 } | |
466 | |
467 // Returns MOTW (Mark of the Web) declaration before html tag which is in | |
468 // HTML comment, e.g. "<!-- saved from url=(%04d)%s -->" | |
469 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx. | |
470 String PageSerializer::markOfTheWebDeclaration(const KURL& url) | |
471 { | |
472 StringBuilder builder; | |
473 bool emitsMinus = false; | |
474 CString orignalUrl = url.string().ascii(); | |
475 for (const char* string = orignalUrl.data(); *string; ++string) { | |
476 const char ch = *string; | |
477 if (ch == '-' && emitsMinus) { | |
478 builder.append("%2D"); | |
479 emitsMinus = false; | |
480 continue; | |
481 } | |
482 emitsMinus = ch == '-'; | |
483 builder.append(ch); | |
484 } | |
485 CString escapedUrl = builder.toString().ascii(); | |
486 return String::format("saved from url=(%04d)%s", static_cast<int>(escapedUrl
.length()), escapedUrl.data()); | |
487 } | |
488 | |
489 } // namespace blink | |
OLD | NEW |