OLD | NEW |
| (Empty) |
1 /* | |
2 * Copyright (C) 2009 Google Inc. All rights reserved. | |
3 * | |
4 * Redistribution and use in source and binary forms, with or without | |
5 * modification, are permitted provided that the following conditions are | |
6 * met: | |
7 * | |
8 * * Redistributions of source code must retain the above copyright | |
9 * notice, this list of conditions and the following disclaimer. | |
10 * * Redistributions in binary form must reproduce the above | |
11 * copyright notice, this list of conditions and the following disclaimer | |
12 * in the documentation and/or other materials provided with the | |
13 * distribution. | |
14 * * Neither the name of Google Inc. nor the names of its | |
15 * contributors may be used to endorse or promote products derived from | |
16 * this software without specific prior written permission. | |
17 * | |
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 */ | |
30 | |
31 // How we handle the base tag better. | |
32 // Current status: | |
33 // Currently, the normal way we handle the base tag is: | |
34 // a) For those links which have corresponding local saved files, such as | |
35 // savable CSS, JavaScript files, they will be written to relative URLs which | |
36 // point to local saved file. Why those links can not be resolved as absolute | |
37 // file URLs, because if they are resolved as absolute URLs, after moving the | |
38 // file location from one directory to another directory, the file URLs will | |
39 // be dead links. | |
40 // b) For those links which have not corresponding local saved files, such as | |
41 // links in A, AREA tags, they will be resolved as absolute URLs. | |
42 // c) We comment out all base tags when serializing DOM for the page. | |
43 // FireFox also uses above way to handle base tag. | |
44 // | |
45 // Problem: | |
46 // This way can not handle the following situation: | |
47 // the base tag is written by JavaScript. | |
48 // For example. The page "www.yahoo.com" use | |
49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL | |
50 // of page when loading page. So when saving page as completed-HTML, we assume | |
51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved | |
52 // completed-HTML page, then the JavaScript will insert a base tag | |
53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to | |
54 // local saved resource files will be resolved as | |
55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource | |
56 // files can not be loaded correctly. Also the page will be rendered ugly since | |
57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame | |
58 // files can not be fetched. | |
59 // Now FireFox, IE and WebKit based Browser all have this problem. | |
60 // | |
61 // Solution: | |
62 // My solution is that we comment old base tag and write new base tag: | |
63 // <base href="." ...> after the previous commented base tag. In WebKit, it | |
64 // always uses the latest "href" attribute of base tag to set document's base | |
65 // URL. Based on this behavior, when we encounter a base tag, we comment it and | |
66 // write a new base tag <base href="."> after the previous commented base tag. | |
67 // The new added base tag can help engine to locate correct base URL for | |
68 // correctly loading local saved resource files. Also I think we need to inherit | |
69 // the base target value from document object when appending new base tag. | |
70 // If there are multiple base tags in original document, we will comment all old | |
71 // base tags and append new base tag after each old base tag because we do not | |
72 // know those old base tags are original content or added by JavaScript. If | |
73 // they are added by JavaScript, it means when loading saved page, the script(s) | |
74 // will still insert base tag(s) to DOM, so the new added base tag(s) can | |
75 // override the incorrect base URL and make sure we always load correct local | |
76 // saved resource files. | |
77 | |
78 #include "config.h" | |
79 #include "WebPageSerializerImpl.h" | |
80 | |
81 #include "DOMUtilitiesPrivate.h" | |
82 #include "HTMLNames.h" | |
83 #include "WebFrameImpl.h" | |
84 #include "core/dom/Document.h" | |
85 #include "core/dom/DocumentType.h" | |
86 #include "core/dom/Element.h" | |
87 #include "core/editing/markup.h" | |
88 #include "core/html/HTMLAllCollection.h" | |
89 #include "core/html/HTMLElement.h" | |
90 #include "core/html/HTMLFormElement.h" | |
91 #include "core/html/HTMLHtmlElement.h" | |
92 #include "core/html/HTMLMetaElement.h" | |
93 #include "core/loader/DocumentLoader.h" | |
94 #include "core/loader/FrameLoader.h" | |
95 #include "public/platform/WebVector.h" | |
96 #include "weborigin/KURL.h" | |
97 #include "wtf/text/TextEncoding.h" | |
98 | |
99 using namespace WebCore; | |
100 | |
101 namespace blink { | |
102 | |
// Soft upper bound on the length of the buffer that temporarily accumulates
// generated HTML content before it is encoded and handed to the client. The
// bound may be exceeded when a single very large contiguous string is found in
// the page.
static const unsigned dataBufferCapacity = 64 * 1024;
107 | |
108 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url, | |
109 const WTF::TextEncod
ing& textEncoding, | |
110 Document* document, | |
111 const String& direct
oryName) | |
112 : url(url) | |
113 , textEncoding(textEncoding) | |
114 , document(document) | |
115 , directoryName(directoryName) | |
116 , isHTMLDocument(document->isHTMLDocument()) | |
117 , haveSeenDocType(false) | |
118 , haveAddedCharsetDeclaration(false) | |
119 , skipMetaElement(0) | |
120 , isInScriptOrStyleTag(false) | |
121 , haveAddedXMLProcessingDirective(false) | |
122 , haveAddedContentsBeforeEnd(false) | |
123 { | |
124 } | |
125 | |
126 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag( | |
127 const Element* element, SerializeDomParam* param, bool* needSkip) | |
128 { | |
129 StringBuilder result; | |
130 | |
131 *needSkip = false; | |
132 if (param->isHTMLDocument) { | |
133 // Skip the open tag of original META tag which declare charset since we | |
134 // have overrided the META which have correct charset declaration after | |
135 // serializing open tag of HEAD element. | |
136 if (element->hasTagName(HTMLNames::metaTag)) { | |
137 const HTMLMetaElement* meta = toHTMLMetaElement(element); | |
138 // Check whether the META tag has declared charset or not. | |
139 String equiv = meta->httpEquiv(); | |
140 if (equalIgnoringCase(equiv, "content-type")) { | |
141 String content = meta->content(); | |
142 if (content.length() && content.contains("charset", false)) { | |
143 // Find META tag declared charset, we need to skip it when | |
144 // serializing DOM. | |
145 param->skipMetaElement = element; | |
146 *needSkip = true; | |
147 } | |
148 } | |
149 } else if (isHTMLHtmlElement(element)) { | |
150 // Check something before processing the open tag of HEAD element. | |
151 // First we add doc type declaration if original document has it. | |
152 if (!param->haveSeenDocType) { | |
153 param->haveSeenDocType = true; | |
154 result.append(createMarkup(param->document->doctype())); | |
155 } | |
156 | |
157 // Add MOTW declaration before html tag. | |
158 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx
. | |
159 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(par
am->url)); | |
160 } else if (element->hasTagName(HTMLNames::baseTag)) { | |
161 // Comment the BASE tag when serializing dom. | |
162 result.append("<!--"); | |
163 } | |
164 } else { | |
165 // Write XML declaration. | |
166 if (!param->haveAddedXMLProcessingDirective) { | |
167 param->haveAddedXMLProcessingDirective = true; | |
168 // Get encoding info. | |
169 String xmlEncoding = param->document->xmlEncoding(); | |
170 if (xmlEncoding.isEmpty()) | |
171 xmlEncoding = param->document->encodingName(); | |
172 if (xmlEncoding.isEmpty()) | |
173 xmlEncoding = UTF8Encoding().name(); | |
174 result.append("<?xml version=\""); | |
175 result.append(param->document->xmlVersion()); | |
176 result.append("\" encoding=\""); | |
177 result.append(xmlEncoding); | |
178 if (param->document->xmlStandalone()) | |
179 result.append("\" standalone=\"yes"); | |
180 result.append("\"?>\n"); | |
181 } | |
182 // Add doc type declaration if original document has it. | |
183 if (!param->haveSeenDocType) { | |
184 param->haveSeenDocType = true; | |
185 result.append(createMarkup(param->document->doctype())); | |
186 } | |
187 } | |
188 return result.toString(); | |
189 } | |
190 | |
191 String WebPageSerializerImpl::postActionAfterSerializeOpenTag( | |
192 const Element* element, SerializeDomParam* param) | |
193 { | |
194 StringBuilder result; | |
195 | |
196 param->haveAddedContentsBeforeEnd = false; | |
197 if (!param->isHTMLDocument) | |
198 return result.toString(); | |
199 // Check after processing the open tag of HEAD element | |
200 if (!param->haveAddedCharsetDeclaration | |
201 && element->hasTagName(HTMLNames::headTag)) { | |
202 param->haveAddedCharsetDeclaration = true; | |
203 // Check meta element. WebKit only pre-parse the first 512 bytes | |
204 // of the document. If the whole <HEAD> is larger and meta is the | |
205 // end of head part, then this kind of pages aren't decoded correctly | |
206 // because of this issue. So when we serialize the DOM, we need to | |
207 // make sure the meta will in first child of head tag. | |
208 // See http://bugs.webkit.org/show_bug.cgi?id=16621. | |
209 // First we generate new content for writing correct META element. | |
210 result.append(WebPageSerializer::generateMetaCharsetDeclaration( | |
211 String(param->textEncoding.name()))); | |
212 | |
213 param->haveAddedContentsBeforeEnd = true; | |
214 // Will search each META which has charset declaration, and skip them al
l | |
215 // in PreActionBeforeSerializeOpenTag. | |
216 } else if (element->hasTagName(HTMLNames::scriptTag) | |
217 || element->hasTagName(HTMLNames::styleTag)) { | |
218 param->isInScriptOrStyleTag = true; | |
219 } | |
220 | |
221 return result.toString(); | |
222 } | |
223 | |
224 String WebPageSerializerImpl::preActionBeforeSerializeEndTag( | |
225 const Element* element, SerializeDomParam* param, bool* needSkip) | |
226 { | |
227 String result; | |
228 | |
229 *needSkip = false; | |
230 if (!param->isHTMLDocument) | |
231 return result; | |
232 // Skip the end tag of original META tag which declare charset. | |
233 // Need not to check whether it's META tag since we guarantee | |
234 // skipMetaElement is definitely META tag if it's not 0. | |
235 if (param->skipMetaElement == element) | |
236 *needSkip = true; | |
237 else if (element->hasTagName(HTMLNames::scriptTag) | |
238 || element->hasTagName(HTMLNames::styleTag)) { | |
239 ASSERT(param->isInScriptOrStyleTag); | |
240 param->isInScriptOrStyleTag = false; | |
241 } | |
242 | |
243 return result; | |
244 } | |
245 | |
246 // After we finish serializing end tag of a element, we give the target | |
247 // element a chance to do some post work to add some additional data. | |
248 String WebPageSerializerImpl::postActionAfterSerializeEndTag( | |
249 const Element* element, SerializeDomParam* param) | |
250 { | |
251 StringBuilder result; | |
252 | |
253 if (!param->isHTMLDocument) | |
254 return result.toString(); | |
255 // Comment the BASE tag when serializing DOM. | |
256 if (element->hasTagName(HTMLNames::baseTag)) { | |
257 result.append("-->"); | |
258 // Append a new base tag declaration. | |
259 result.append(WebPageSerializer::generateBaseTagDeclaration( | |
260 param->document->baseTarget())); | |
261 } | |
262 | |
263 return result.toString(); | |
264 } | |
265 | |
266 void WebPageSerializerImpl::saveHTMLContentToBuffer( | |
267 const String& result, SerializeDomParam* param) | |
268 { | |
269 m_dataBuffer.append(result); | |
270 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished, | |
271 param, | |
272 DoNotForceFlush); | |
273 } | |
274 | |
275 void WebPageSerializerImpl::encodeAndFlushBuffer( | |
276 WebPageSerializerClient::PageSerializationStatus status, | |
277 SerializeDomParam* param, | |
278 FlushOption flushOption) | |
279 { | |
280 // Data buffer is not full nor do we want to force flush. | |
281 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity
) | |
282 return; | |
283 | |
284 String content = m_dataBuffer.toString(); | |
285 m_dataBuffer.clear(); | |
286 | |
287 CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF
::EntitiesForUnencodables); | |
288 | |
289 // Send result to the client. | |
290 m_client->didSerializeDataForFrame(param->url, | |
291 WebCString(encodedContent.data(), encoded
Content.length()), | |
292 status); | |
293 } | |
294 | |
295 void WebPageSerializerImpl::openTagToString(Element* element, | |
296 SerializeDomParam* param) | |
297 { | |
298 bool needSkip; | |
299 StringBuilder result; | |
300 // Do pre action for open tag. | |
301 result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip)); | |
302 if (needSkip) | |
303 return; | |
304 // Add open tag | |
305 result.append('<'); | |
306 result.append(element->nodeName().lower()); | |
307 // Go through all attributes and serialize them. | |
308 if (element->hasAttributes()) { | |
309 unsigned numAttrs = element->attributeCount(); | |
310 for (unsigned i = 0; i < numAttrs; i++) { | |
311 result.append(' '); | |
312 // Add attribute pair | |
313 const Attribute *attribute = element->attributeItem(i); | |
314 result.append(attribute->name().toString()); | |
315 result.appendLiteral("=\""); | |
316 if (!attribute->value().isEmpty()) { | |
317 const String& attrValue = attribute->value(); | |
318 | |
319 // Check whether we need to replace some resource links | |
320 // with local resource paths. | |
321 const QualifiedName& attrName = attribute->name(); | |
322 if (elementHasLegalLinkAttribute(element, attrName)) { | |
323 // For links start with "javascript:", we do not change it. | |
324 if (attrValue.startsWith("javascript:", false)) | |
325 result.append(attrValue); | |
326 else { | |
327 // Get the absolute link | |
328 WebFrameImpl* subFrame = WebFrameImpl::fromFrameOwnerEle
ment(element); | |
329 String completeURL = subFrame ? subFrame->frame()->docum
ent()->url() : | |
330 param->document->complet
eURL(attrValue); | |
331 // Check whether we have local files for those link. | |
332 if (m_localLinks.contains(completeURL)) { | |
333 if (!param->directoryName.isEmpty()) { | |
334 result.appendLiteral("./"); | |
335 result.append(param->directoryName); | |
336 result.append('/'); | |
337 } | |
338 result.append(m_localLinks.get(completeURL)); | |
339 } else | |
340 result.append(completeURL); | |
341 } | |
342 } else { | |
343 if (param->isHTMLDocument) | |
344 result.append(m_htmlEntities.convertEntitiesInString(att
rValue)); | |
345 else | |
346 result.append(m_xmlEntities.convertEntitiesInString(attr
Value)); | |
347 } | |
348 } | |
349 result.append('\"'); | |
350 } | |
351 } | |
352 | |
353 // Do post action for open tag. | |
354 String addedContents = postActionAfterSerializeOpenTag(element, param); | |
355 // Complete the open tag for element when it has child/children. | |
356 if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) | |
357 result.append('>'); | |
358 // Append the added contents generate in post action of open tag. | |
359 result.append(addedContents); | |
360 // Save the result to data buffer. | |
361 saveHTMLContentToBuffer(result.toString(), param); | |
362 } | |
363 | |
364 // Serialize end tag of an specified element. | |
365 void WebPageSerializerImpl::endTagToString(Element* element, | |
366 SerializeDomParam* param) | |
367 { | |
368 bool needSkip; | |
369 StringBuilder result; | |
370 // Do pre action for end tag. | |
371 result.append(preActionBeforeSerializeEndTag(element, param, &needSkip)); | |
372 if (needSkip) | |
373 return; | |
374 // Write end tag when element has child/children. | |
375 if (element->hasChildNodes() || param->haveAddedContentsBeforeEnd) { | |
376 result.appendLiteral("</"); | |
377 result.append(element->nodeName().lower()); | |
378 result.append('>'); | |
379 } else { | |
380 // Check whether we have to write end tag for empty element. | |
381 if (param->isHTMLDocument) { | |
382 result.append('>'); | |
383 // FIXME: This code is horribly wrong. WebPageSerializerImpl must d
ie. | |
384 if (!element->isHTMLElement() || !toHTMLElement(element)->ieForbidsI
nsertHTML()) { | |
385 // We need to write end tag when it is required. | |
386 result.appendLiteral("</"); | |
387 result.append(element->nodeName().lower()); | |
388 result.append('>'); | |
389 } | |
390 } else { | |
391 // For xml base document. | |
392 result.appendLiteral(" />"); | |
393 } | |
394 } | |
395 // Do post action for end tag. | |
396 result.append(postActionAfterSerializeEndTag(element, param)); | |
397 // Save the result to data buffer. | |
398 saveHTMLContentToBuffer(result.toString(), param); | |
399 } | |
400 | |
401 void WebPageSerializerImpl::buildContentForNode(Node* node, | |
402 SerializeDomParam* param) | |
403 { | |
404 switch (node->nodeType()) { | |
405 case Node::ELEMENT_NODE: | |
406 // Process open tag of element. | |
407 openTagToString(toElement(node), param); | |
408 // Walk through the children nodes and process it. | |
409 for (Node *child = node->firstChild(); child; child = child->nextSibling
()) | |
410 buildContentForNode(child, param); | |
411 // Process end tag of element. | |
412 endTagToString(toElement(node), param); | |
413 break; | |
414 case Node::TEXT_NODE: | |
415 saveHTMLContentToBuffer(createMarkup(node), param); | |
416 break; | |
417 case Node::ATTRIBUTE_NODE: | |
418 case Node::DOCUMENT_NODE: | |
419 case Node::DOCUMENT_FRAGMENT_NODE: | |
420 // Should not exist. | |
421 ASSERT_NOT_REACHED(); | |
422 break; | |
423 // Document type node can be in DOM? | |
424 case Node::DOCUMENT_TYPE_NODE: | |
425 param->haveSeenDocType = true; | |
426 default: | |
427 // For other type node, call default action. | |
428 saveHTMLContentToBuffer(createMarkup(node), param); | |
429 break; | |
430 } | |
431 } | |
432 | |
433 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame, | |
434 bool recursiveSerialization, | |
435 WebPageSerializerClient* client, | |
436 const WebVector<WebURL>& links, | |
437 const WebVector<WebString>& localPa
ths, | |
438 const WebString& localDirectoryName
) | |
439 : m_client(client) | |
440 , m_recursiveSerialization(recursiveSerialization) | |
441 , m_framesCollected(false) | |
442 , m_localDirectoryName(localDirectoryName) | |
443 , m_htmlEntities(false) | |
444 , m_xmlEntities(true) | |
445 { | |
446 // Must specify available webframe. | |
447 ASSERT(frame); | |
448 m_specifiedWebFrameImpl = toWebFrameImpl(frame); | |
449 // Make sure we have non 0 client. | |
450 ASSERT(client); | |
451 // Build local resources map. | |
452 ASSERT(links.size() == localPaths.size()); | |
453 for (size_t i = 0; i < links.size(); i++) { | |
454 KURL url = links[i]; | |
455 ASSERT(!m_localLinks.contains(url.string())); | |
456 m_localLinks.set(url.string(), localPaths[i]); | |
457 } | |
458 | |
459 ASSERT(m_dataBuffer.isEmpty()); | |
460 } | |
461 | |
462 void WebPageSerializerImpl::collectTargetFrames() | |
463 { | |
464 ASSERT(!m_framesCollected); | |
465 m_framesCollected = true; | |
466 | |
467 // First, process main frame. | |
468 m_frames.append(m_specifiedWebFrameImpl); | |
469 // Return now if user only needs to serialize specified frame, not including | |
470 // all sub-frames. | |
471 if (!m_recursiveSerialization) | |
472 return; | |
473 // Collect all frames inside the specified frame. | |
474 for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) { | |
475 WebFrameImpl* currentFrame = m_frames[i]; | |
476 // Get current using document. | |
477 Document* currentDoc = currentFrame->frame()->document(); | |
478 // Go through sub-frames. | |
479 RefPtr<HTMLCollection> all = currentDoc->all(); | |
480 | |
481 for (unsigned i = 0; Node* node = all->item(i); i++) { | |
482 if (!node->isHTMLElement()) | |
483 continue; | |
484 Element* element = toElement(node); | |
485 WebFrameImpl* webFrame = | |
486 WebFrameImpl::fromFrameOwnerElement(element); | |
487 if (webFrame) | |
488 m_frames.append(webFrame); | |
489 } | |
490 } | |
491 } | |
492 | |
493 bool WebPageSerializerImpl::serialize() | |
494 { | |
495 if (!m_framesCollected) | |
496 collectTargetFrames(); | |
497 | |
498 bool didSerialization = false; | |
499 KURL mainURL = m_specifiedWebFrameImpl->frame()->document()->url(); | |
500 | |
501 for (unsigned i = 0; i < m_frames.size(); ++i) { | |
502 WebFrameImpl* webFrame = m_frames[i]; | |
503 Document* document = webFrame->frame()->document(); | |
504 const KURL& url = document->url(); | |
505 | |
506 if (!url.isValid() || !m_localLinks.contains(url.string())) | |
507 continue; | |
508 | |
509 didSerialization = true; | |
510 | |
511 const WTF::TextEncoding& textEncoding = document->encoding().isValid() ?
document->encoding() : UTF8Encoding(); | |
512 String directoryName = url == mainURL ? m_localDirectoryName : ""; | |
513 | |
514 SerializeDomParam param(url, textEncoding, document, directoryName); | |
515 | |
516 Element* documentElement = document->documentElement(); | |
517 if (documentElement) | |
518 buildContentForNode(documentElement, ¶m); | |
519 | |
520 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &p
aram, ForceFlush); | |
521 } | |
522 | |
523 ASSERT(m_dataBuffer.isEmpty()); | |
524 m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSeriali
zerClient::AllFramesAreFinished); | |
525 return didSerialization; | |
526 } | |
527 | |
528 } // namespace blink | |
OLD | NEW |