OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (C) 2009 Google Inc. All rights reserved. | |
3 * | |
4 * Redistribution and use in source and binary forms, with or without | |
5 * modification, are permitted provided that the following conditions are | |
6 * met: | |
7 * | |
8 * * Redistributions of source code must retain the above copyright | |
9 * notice, this list of conditions and the following disclaimer. | |
10 * * Redistributions in binary form must reproduce the above | |
11 * copyright notice, this list of conditions and the following disclaimer | |
12 * in the documentation and/or other materials provided with the | |
13 * distribution. | |
14 * * Neither the name of Google Inc. nor the names of its | |
15 * contributors may be used to endorse or promote products derived from | |
16 * this software without specific prior written permission. | |
17 * | |
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 // How we handle the base tag better. | |
32 // Current status: | |
33 // Currently, the normal way we handle the base tag is: | |
34 // a) For those links which have corresponding local saved files, such as | |
35 // savable CSS, JavaScript files, they will be written to relative URLs which | |
36 // point to local saved file. Why those links can not be resolved as absolute | |
37 // file URLs, because if they are resolved as absolute URLs, after moving the | |
38 // file location from one directory to another directory, the file URLs will | |
39 // be dead links. | |
40 // b) For those links which have not corresponding local saved files, such as | |
41 // links in A, AREA tags, they will be resolved as absolute URLs. | |
42 // c) We comment all base tags when serializing DOM for the page. | |
43 // FireFox also uses above way to handle base tag. | |
44 // | |
45 // Problem: | |
46 // This way can not handle the following situation: | |
47 // the base tag is written by JavaScript. | |
48 // For example. The page "www.yahoo.com" use | |
49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL | |
50 // of page when loading page. So when saving page as completed-HTML, we assume | |
51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved | |
52 // completed-HTML page, then the JavaScript will insert a base tag | |
53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to | |
54 // local saved resource files will be resolved as | |
55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource | |
56 // files can not be loaded correctly. Also the page will be rendered ugly since | |
57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame | |
58 // files can not be fetched. | |
59 // Now FireFox, IE and WebKit based Browser all have this problem. | |
60 // | |
61 // Solution: | |
62 // My solution is that we comment old base tag and write new base tag: | |
63 // <base href="." ...> after the previous commented base tag. In WebKit, it | |
64 // always uses the latest "href" attribute of base tag to set document's base | |
65 // URL. Based on this behavior, when we encounter a base tag, we comment it and | |
66 // write a new base tag <base href="."> after the previous commented base tag. | |
67 // The new added base tag can help engine to locate correct base URL for | |
68 // correctly loading local saved resource files. Also I think we need to inherit | |
69 // the base target value from document object when appending new base tag. | |
70 // If there are multiple base tags in original document, we will comment all old | |
71 // base tags and append new base tag after each old base tag because we do not | |
72 // know those old base tags are original content or added by JavaScript. If | |
73 // they are added by JavaScript, it means when loading saved page, the script(s) | |
74 // will still insert base tag(s) to DOM, so the new added base tag(s) can | |
75 // override the incorrect base URL and make sure we always load correct local | |
76 // saved resource files. | |
77 | |
78 #include "web/WebPageSerializerImpl.h" | |
79 | |
80 #include "core/HTMLNames.h" | |
81 #include "core/dom/Document.h" | |
82 #include "core/dom/DocumentType.h" | |
83 #include "core/dom/Element.h" | |
84 #include "core/editing/serializers/Serialization.h" | |
85 #include "core/html/HTMLAllCollection.h" | |
86 #include "core/html/HTMLElement.h" | |
87 #include "core/html/HTMLFormElement.h" | |
88 #include "core/html/HTMLHtmlElement.h" | |
89 #include "core/html/HTMLMetaElement.h" | |
90 #include "core/loader/DocumentLoader.h" | |
91 #include "core/loader/FrameLoader.h" | |
92 #include "core/page/PageSerializer.h" | |
93 #include "public/platform/WebVector.h" | |
94 #include "web/WebLocalFrameImpl.h" | |
95 #include "wtf/text/TextEncoding.h" | |
96 | |
97 namespace blink { | |
98 | |
// Soft upper bound, in characters, on the buffer that temporarily holds
// generated HTML before it is flushed to the client. The bound may be exceeded
// when a single very large contiguous string is appended in one go.
static const unsigned dataBufferCapacity = 64 * 1024;
103 | |
104 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url, | |
105 const WTF::TextEncod
ing& textEncoding, | |
106 Document* document) | |
107 : url(url) | |
108 , textEncoding(textEncoding) | |
109 , document(document) | |
110 , isHTMLDocument(document->isHTMLDocument()) | |
111 , haveSeenDocType(false) | |
112 , haveAddedCharsetDeclaration(false) | |
113 , skipMetaElement(nullptr) | |
114 , isInScriptOrStyleTag(false) | |
115 , haveAddedXMLProcessingDirective(false) | |
116 , haveAddedContentsBeforeEnd(false) | |
117 { | |
118 } | |
119 | |
120 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( | |
121 const Element* element, SerializeDomParam* param, bool* needSkip) | |
122 { | |
123 StringBuilder result; | |
124 | |
125 *needSkip = false; | |
126 if (param->isHTMLDocument) { | |
127 // Skip the open tag of original META tag which declare charset since we | |
128 // have overrided the META which have correct charset declaration after | |
129 // serializing open tag of HEAD element. | |
130 ASSERT(element); | |
131 if (isHTMLMetaElement(element) && toHTMLMetaElement(element)->computeEnc
oding().isValid()) { | |
132 // Found META tag declared charset, we need to skip it when | |
133 // serializing DOM. | |
134 param->skipMetaElement = element; | |
135 *needSkip = true; | |
136 } else if (isHTMLHtmlElement(*element)) { | |
137 // Check something before processing the open tag of HEAD element. | |
138 // First we add doc type declaration if original document has it. | |
139 if (!param->haveSeenDocType) { | |
140 param->haveSeenDocType = true; | |
141 result.append(createMarkup(param->document->doctype())); | |
142 } | |
143 | |
144 // Add MOTW declaration before html tag. | |
145 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx
. | |
146 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(par
am->url)); | |
147 } else if (isHTMLBaseElement(*element)) { | |
148 // Comment the BASE tag when serializing dom. | |
149 result.appendLiteral("<!--"); | |
150 } | |
151 } else { | |
152 // Write XML declaration. | |
153 if (!param->haveAddedXMLProcessingDirective) { | |
154 param->haveAddedXMLProcessingDirective = true; | |
155 // Get encoding info. | |
156 String xmlEncoding = param->document->xmlEncoding(); | |
157 if (xmlEncoding.isEmpty()) | |
158 xmlEncoding = param->document->encodingName(); | |
159 if (xmlEncoding.isEmpty()) | |
160 xmlEncoding = UTF8Encoding().name(); | |
161 result.appendLiteral("<?xml version=\""); | |
162 result.append(param->document->xmlVersion()); | |
163 result.appendLiteral("\" encoding=\""); | |
164 result.append(xmlEncoding); | |
165 if (param->document->xmlStandalone()) | |
166 result.appendLiteral("\" standalone=\"yes"); | |
167 result.appendLiteral("\"?>\n"); | |
168 } | |
169 // Add doc type declaration if original document has it. | |
170 if (!param->haveSeenDocType) { | |
171 param->haveSeenDocType = true; | |
172 result.append(createMarkup(param->document->doctype())); | |
173 } | |
174 } | |
175 return result.toString(); | |
176 } | |
177 | |
178 String WebPageSerializerImpl::postActionAfterSerializeOpenTag( | |
179 const Element* element, SerializeDomParam* param) | |
180 { | |
181 StringBuilder result; | |
182 | |
183 param->haveAddedContentsBeforeEnd = false; | |
184 if (!param->isHTMLDocument) | |
185 return result.toString(); | |
186 // Check after processing the open tag of HEAD element | |
187 if (!param->haveAddedCharsetDeclaration | |
188 && isHTMLHeadElement(*element)) { | |
189 param->haveAddedCharsetDeclaration = true; | |
190 // Check meta element. WebKit only pre-parse the first 512 bytes | |
191 // of the document. If the whole <HEAD> is larger and meta is the | |
192 // end of head part, then this kind of pages aren't decoded correctly | |
193 // because of this issue. So when we serialize the DOM, we need to | |
194 // make sure the meta will in first child of head tag. | |
195 // See http://bugs.webkit.org/show_bug.cgi?id=16621. | |
196 // First we generate new content for writing correct META element. | |
197 result.append(WebPageSerializer::generateMetaCharsetDeclaration( | |
198 String(param->textEncoding.name()))); | |
199 | |
200 param->haveAddedContentsBeforeEnd = true; | |
201 // Will search each META which has charset declaration, and skip them al
l | |
202 // in PreActionBeforeSerializeOpenTag. | |
203 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) { | |
204 param->isInScriptOrStyleTag = true; | |
205 } | |
206 | |
207 return result.toString(); | |
208 } | |
209 | |
210 String WebPageSerializerImpl::preActionBeforeSerializeEndTag( | |
211 const Element* element, SerializeDomParam* param, bool* needSkip) | |
212 { | |
213 String result; | |
214 | |
215 *needSkip = false; | |
216 if (!param->isHTMLDocument) | |
217 return result; | |
218 // Skip the end tag of original META tag which declare charset. | |
219 // Need not to check whether it's META tag since we guarantee | |
220 // skipMetaElement is definitely META tag if it's not 0. | |
221 if (param->skipMetaElement == element) { | |
222 *needSkip = true; | |
223 } else if (isHTMLScriptElement(*element) || isHTMLScriptElement(*element)) { | |
224 ASSERT(param->isInScriptOrStyleTag); | |
225 param->isInScriptOrStyleTag = false; | |
226 } | |
227 | |
228 return result; | |
229 } | |
230 | |
231 // After we finish serializing end tag of a element, we give the target | |
232 // element a chance to do some post work to add some additional data. | |
233 String WebPageSerializerImpl::postActionAfterSerializeEndTag( | |
234 const Element* element, SerializeDomParam* param) | |
235 { | |
236 StringBuilder result; | |
237 | |
238 if (!param->isHTMLDocument) | |
239 return result.toString(); | |
240 // Comment the BASE tag when serializing DOM. | |
241 if (isHTMLBaseElement(*element)) { | |
242 result.appendLiteral("-->"); | |
243 // Append a new base tag declaration. | |
244 result.append(WebPageSerializer::generateBaseTagDeclaration( | |
245 param->document->baseTarget())); | |
246 } | |
247 | |
248 return result.toString(); | |
249 } | |
250 | |
251 void WebPageSerializerImpl::saveHTMLContentToBuffer( | |
252 const String& result, SerializeDomParam* param) | |
253 { | |
254 m_dataBuffer.append(result); | |
255 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, | |
256 param, | |
257 DoNotForceFlush); | |
258 } | |
259 | |
260 void WebPageSerializerImpl::encodeAndFlushBuffer( | |
261 WebPageSerializerClient::PageSerializationStatus status, | |
262 SerializeDomParam* param, | |
263 FlushOption flushOption) | |
264 { | |
265 // Data buffer is not full nor do we want to force flush. | |
266 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity
) | |
267 return; | |
268 | |
269 String content = m_dataBuffer.toString(); | |
270 m_dataBuffer.clear(); | |
271 | |
272 CString encodedContent = param->textEncoding.encode(content, WTF::EntitiesFo
rUnencodables); | |
273 | |
274 // Send result to the client. | |
275 m_client->didSerializeDataForFrame(WebCString(encodedContent), status); | |
276 } | |
277 | |
278 // TODO(yosin): We should utilize |MarkupFormatter| here to share code, | |
279 // especially escaping attribute values, done by |WebEntities| |m_htmlEntities| | |
280 // and |m_xmlEntities|. | |
281 void WebPageSerializerImpl::openTagToString(Element* element, | |
282 SerializeDomParam* param) | |
283 { | |
284 bool needSkip; | |
285 StringBuilder result; | |
286 // Do pre action for open tag. | |
287 result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip)); | |
288 if (needSkip) | |
289 return; | |
290 // Add open tag | |
291 result.append('<'); | |
292 result.append(element->nodeName().lower()); | |
293 // Go through all attributes and serialize them. | |
294 AttributeCollection attributes = element->attributes(); | |
295 AttributeCollection::iterator end = attributes.end(); | |
296 for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it)
{ | |
297 result.append(' '); | |
298 // Add attribute pair | |
299 result.append(it->name().toString()); | |
300 result.appendLiteral("=\""); | |
301 if (!it->value().isEmpty()) { | |
302 const String& attrValue = it->value(); | |
303 | |
304 // Check whether we need to replace some resource links | |
305 // with local resource paths. | |
306 const QualifiedName& attrName = it->name(); | |
307 if (element->hasLegalLinkAttribute(attrName)) { | |
308 // For links start with "javascript:", we do not change it. | |
309 if (attrValue.startsWith("javascript:", TextCaseInsensitive)) { | |
310 result.append(m_htmlEntities.convertEntitiesInString(attrVal
ue)); | |
311 } else { | |
312 // Get the absolute link | |
313 String completeURL = param->document->completeURL(attrValue)
; | |
314 // Check whether we have local files for those link. | |
315 if (m_localLinks.contains(completeURL)) { | |
316 result.append(m_htmlEntities.convertEntitiesInString(m_l
ocalLinks.get(completeURL))); | |
317 } else { | |
318 result.append(m_htmlEntities.convertEntitiesInString(com
pleteURL)); | |
319 } | |
320 } | |
321 } else { | |
322 if (param->isHTMLDocument) | |
323 result.append(m_htmlEntities.convertEntitiesInString(attrVal
ue)); | |
324 else | |
325 result.append(m_xmlEntities.convertEntitiesInString(attrValu
e)); | |
326 } | |
327 } | |
328 result.append('\"'); | |
329 } | |
330 | |
331 // Do post action for open tag. | |
332 String addedContents = postActionAfterSerializeOpenTag(element, param); | |
333 // Complete the open tag for element when it has child/children. | |
334 if (element->hasChildren() || param->haveAddedContentsBeforeEnd) | |
335 result.append('>'); | |
336 // Append the added contents generate in post action of open tag. | |
337 result.append(addedContents); | |
338 // Save the result to data buffer. | |
339 saveHTMLContentToBuffer(result.toString(), param); | |
340 } | |
341 | |
342 // Serialize end tag of an specified element. | |
343 void WebPageSerializerImpl::endTagToString(Element* element, | |
344 SerializeDomParam* param) | |
345 { | |
346 bool needSkip; | |
347 StringBuilder result; | |
348 // Do pre action for end tag. | |
349 result.append(preActionBeforeSerializeEndTag(element, param, &needSkip)); | |
350 if (needSkip) | |
351 return; | |
352 // Write end tag when element has child/children. | |
353 if (element->hasChildren() || param->haveAddedContentsBeforeEnd) { | |
354 result.appendLiteral("</"); | |
355 result.append(element->nodeName().lower()); | |
356 result.append('>'); | |
357 } else { | |
358 // Check whether we have to write end tag for empty element. | |
359 if (param->isHTMLDocument) { | |
360 result.append('>'); | |
361 // FIXME: This code is horribly wrong. WebPageSerializerImpl must d
ie. | |
362 if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsI
nsertHTML()) { | |
363 // We need to write end tag when it is required. | |
364 result.appendLiteral("</"); | |
365 result.append(element->nodeName().lower()); | |
366 result.append('>'); | |
367 } | |
368 } else { | |
369 // For xml base document. | |
370 result.appendLiteral(" />"); | |
371 } | |
372 } | |
373 // Do post action for end tag. | |
374 result.append(postActionAfterSerializeEndTag(element, param)); | |
375 // Save the result to data buffer. | |
376 saveHTMLContentToBuffer(result.toString(), param); | |
377 } | |
378 | |
379 void WebPageSerializerImpl::buildContentForNode(Node* node, | |
380 SerializeDomParam* param) | |
381 { | |
382 switch (node->nodeType()) { | |
383 case Node::ELEMENT_NODE: | |
384 // Process open tag of element. | |
385 openTagToString(toElement(node), param); | |
386 // Walk through the children nodes and process it. | |
387 for (Node *child = node->firstChild(); child; child = child->nextSibling
()) | |
388 buildContentForNode(child, param); | |
389 // Process end tag of element. | |
390 endTagToString(toElement(node), param); | |
391 break; | |
392 case Node::TEXT_NODE: | |
393 saveHTMLContentToBuffer(createMarkup(node), param); | |
394 break; | |
395 case Node::ATTRIBUTE_NODE: | |
396 case Node::DOCUMENT_NODE: | |
397 case Node::DOCUMENT_FRAGMENT_NODE: | |
398 // Should not exist. | |
399 ASSERT_NOT_REACHED(); | |
400 break; | |
401 // Document type node can be in DOM? | |
402 case Node::DOCUMENT_TYPE_NODE: | |
403 param->haveSeenDocType = true; | |
404 default: | |
405 // For other type node, call default action. | |
406 saveHTMLContentToBuffer(createMarkup(node), param); | |
407 break; | |
408 } | |
409 } | |
410 | |
411 WebPageSerializerImpl::WebPageSerializerImpl( | |
412 WebLocalFrame* frame, | |
413 WebPageSerializerClient* client, | |
414 const WebVector<std::pair<WebURL, WebString>>& urlsToLocalPaths) | |
415 : m_client(client) | |
416 , m_htmlEntities(false) | |
417 , m_xmlEntities(true) | |
418 { | |
419 // Must specify available webframe. | |
420 ASSERT(frame); | |
421 m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame); | |
422 // Make sure we have non 0 client. | |
423 ASSERT(client); | |
424 // Build local resources map. | |
425 for (const auto& it : urlsToLocalPaths) { | |
426 KURL url = it.first; | |
427 ASSERT(!m_localLinks.contains(url.string())); | |
428 m_localLinks.set(url.string(), it.second); | |
429 } | |
430 | |
431 ASSERT(m_dataBuffer.isEmpty()); | |
432 } | |
433 | |
434 bool WebPageSerializerImpl::serialize() | |
435 { | |
436 bool didSerialization = false; | |
437 | |
438 Document* document = m_specifiedWebLocalFrameImpl->frame()->document(); | |
439 const KURL& url = document->url(); | |
440 | |
441 if (url.isValid()) { | |
442 didSerialization = true; | |
443 | |
444 const WTF::TextEncoding& textEncoding = document->encoding().isValid() ?
document->encoding() : UTF8Encoding(); | |
445 if (textEncoding.isNonByteBasedEncoding()) { | |
446 const UChar byteOrderMark = 0xFEFF; | |
447 m_dataBuffer.append(byteOrderMark); | |
448 } | |
449 | |
450 SerializeDomParam param(url, textEncoding, document); | |
451 | |
452 Element* documentElement = document->documentElement(); | |
453 if (documentElement) | |
454 buildContentForNode(documentElement, ¶m); | |
455 | |
456 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &p
aram, ForceFlush); | |
457 } else { | |
458 // Report empty contents for invalid URLs. | |
459 m_client->didSerializeDataForFrame( | |
460 WebCString(), WebPageSerializerClient::CurrentFrameIsFinished); | |
461 } | |
462 | |
463 ASSERT(m_dataBuffer.isEmpty()); | |
464 return didSerialization; | |
465 } | |
466 | |
467 } // namespace blink | |
OLD | NEW |