Source/web/WebPageSerializerImpl.cpp - Issue 68613003: Merges the two different page serializers

Side by Side Diff: Source/web/WebPageSerializerImpl.cpp

Issue 68613003: Merges the two different page serializers (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master

Patch Set: Created 7 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 * Copyright (C) 2009 Google Inc. All rights reserved.

3 *

4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions are

6 * met:

7 *

8 * * Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.

10 * * Redistributions in binary form must reproduce the above

11 * copyright notice, this list of conditions and the following disclaimer

12 * in the documentation and/or other materials provided with the

13 * distribution.

14 * * Neither the name of Google Inc. nor the names of its

15 * contributors may be used to endorse or promote products derived from

16 * this software without specific prior written permission.

17 *

18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29 */

30

31 // How we handle the base tag better.

32 // Current status:

33 // Currently, the normal way we handle the base tag is:

34 // a) For those links which have corresponding local saved files, such as

35 // savable CSS, JavaScript files, they will be written to relative URLs which

36 // point to local saved file. Why those links can not be resolved as absolute

37 // file URLs, because if they are resolved as absolute URLs, after moving the

38 // file location from one directory to another directory, the file URLs will

39 // be dead links.

40 // b) For those links which have not corresponding local saved files, such as

41 // links in A, AREA tags, they will be resolved as absolute URLs.

42 // c) We comment all base tags when serializing DOM for the page.

43 // FireFox also uses above way to handle base tag.

44 //

45 // Problem:

46 // This way can not handle the following situation:

47 // the base tag is written by JavaScript.

48 // For example. The page "www.yahoo.com" use

49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL

50 // of page when loading page. So when saving page as completed-HTML, we assume

51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved

52 // completed-HTML page, then the JavaScript will insert a base tag

53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to

54 // local saved resource files will be resolved as

55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource

56 // files can not be loaded correctly. Also the page will be rendered ugly since

57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame

58 // files can not be fetched.

59 // Now FireFox, IE and WebKit based Browser all have this problem.

60 //

61 // Solution:

62 // My solution is that we comment old base tag and write new base tag:

63 // <base href="." ...> after the previous commented base tag. In WebKit, it

64 // always uses the latest "href" attribute of base tag to set document's base

65 // URL. Based on this behavior, when we encounter a base tag, we comment it and

66 // write a new base tag <base href="."> after the previous commented base tag.

67 // The new added base tag can help engine to locate correct base URL for

68 // correctly loading local saved resource files. Also I think we need to inherit

69 // the base target value from document object when appending new base tag.

70 // If there are multiple base tags in original document, we will comment all old

71 // base tags and append new base tag after each old base tag because we do not

72 // know those old base tags are original content or added by JavaScript. If

73 // they are added by JavaScript, it means when loading saved page, the script(s)

74 // will still insert base tag(s) to DOM, so the new added base tag(s) can

75 // override the incorrect base URL and make sure we always load correct local

76 // saved resource files.

77

78 #include "config.h"

79 #include "WebPageSerializerImpl.h"

80

81 #include "DOMUtilitiesPrivate.h"

82 #include "HTMLNames.h"

83 #include "WebFrameImpl.h"

84 #include "core/dom/Document.h"

85 #include "core/dom/DocumentType.h"

86 #include "core/dom/Element.h"

87 #include "core/editing/markup.h"

88 #include "core/html/HTMLAllCollection.h"

89 #include "core/html/HTMLElement.h"

90 #include "core/html/HTMLFormElement.h"

91 #include "core/html/HTMLHtmlElement.h"

92 #include "core/html/HTMLMetaElement.h"

93 #include "core/loader/DocumentLoader.h"

94 #include "core/loader/FrameLoader.h"

95 #include "public/platform/WebVector.h"

96 #include "weborigin/KURL.h"

97 #include "wtf/text/TextEncoding.h"

98

99 using namespace WebCore;

100

101 namespace blink {

102

103 // Maximum length of data buffer which is used to temporary save generated

104 // html content data. This is a soft limit which might be passed if a very large

105 // contiguous string is found in the page.

106 static const unsigned dataBufferCapacity = 65536;

107

108 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,

109 const WTF::TextEncod ing& textEncoding,

110 Document* document,

111 const String& direct oryName)

112 : url(url)

113 , textEncoding(textEncoding)

114 , document(document)

115 , directoryName(directoryName)

116 , isHTMLDocument(document->isHTMLDocument())

117 , haveSeenDocType(false)

118 , haveAddedCharsetDeclaration(false)

119 , skipMetaElement(0)

120 , isInScriptOrStyleTag(false)

121 , haveAddedXMLProcessingDirective(false)

122 , haveAddedContentsBeforeEnd(false)

123 {

124 }

125

126 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(

127 const Element* element, SerializeDomParam* param, bool* needSkip)

128 {

129 StringBuilder result;

130

131 *needSkip = false;

132 if (param->isHTMLDocument) {

133 // Skip the open tag of original META tag which declare charset since we

134 // have overrided the META which have correct charset declaration after

135 // serializing open tag of HEAD element.

136 if (element->hasTagName(HTMLNames::metaTag)) {

137 const HTMLMetaElement* meta = toHTMLMetaElement(element);

138 // Check whether the META tag has declared charset or not.

139 String equiv = meta->httpEquiv();

140 if (equalIgnoringCase(equiv, "content-type")) {

141 String content = meta->content();

142 if (content.length() && content.contains("charset", false)) {

143 // Find META tag declared charset, we need to skip it when

144 // serializing DOM.

145 param->skipMetaElement = element;

146 *needSkip = true;

147 }

148 }

149 } else if (isHTMLHtmlElement(element)) {

150 // Check something before processing the open tag of HEAD element.

151 // First we add doc type declaration if original document has it.

152 if (!param->haveSeenDocType) {

153 param->haveSeenDocType = true;

154 result.append(createMarkup(param->document->doctype()));

155 }

156

157 // Add MOTW declaration before html tag.

158 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx .

159 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(par am->url));

160 } else if (element->hasTagName(HTMLNames::baseTag)) {

161 // Comment the BASE tag when serializing dom.

162 result.append("<!--");

163 }

164 } else {

165 // Write XML declaration.

166 if (!param->haveAddedXMLProcessingDirective) {

167 param->haveAddedXMLProcessingDirective = true;

168 // Get encoding info.

169 String xmlEncoding = param->document->xmlEncoding();

170 if (xmlEncoding.isEmpty())

171 xmlEncoding = param->document->encodingName();

172 if (xmlEncoding.isEmpty())

173 xmlEncoding = UTF8Encoding().name();

174 result.append("<?xml version=\"");

175 result.append(param->document->xmlVersion());

176 result.append("\" encoding=\"");

177 result.append(xmlEncoding);

178 if (param->document->xmlStandalone())

179 result.append("\" standalone=\"yes");

180 result.append("\"?>\n");

181 }

182 // Add doc type declaration if original document has it.

183 if (!param->haveSeenDocType) {

184 param->haveSeenDocType = true;

185 result.append(createMarkup(param->document->doctype()));

186 }

187 }

188 return result.toString();

189 }

190

191 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(

192 const Element* element, SerializeDomParam* param)

193 {

194 StringBuilder result;

195

196 param->haveAddedContentsBeforeEnd = false;

197 if (!param->isHTMLDocument)

198 return result.toString();

199 // Check after processing the open tag of HEAD element

200 if (!param->haveAddedCharsetDeclaration

201 && element->hasTagName(HTMLNames::headTag)) {

202 param->haveAddedCharsetDeclaration = true;

203 // Check meta element. WebKit only pre-parse the first 512 bytes

204 // of the document. If the whole <HEAD> is larger and meta is the

205 // end of head part, then this kind of pages aren't decoded correctly

206 // because of this issue. So when we serialize the DOM, we need to

207 // make sure the meta will in first child of head tag.

208 // See http://bugs.webkit.org/show_bug.cgi?id=16621.

209 // First we generate new content for writing correct META element.

210 result.append(WebPageSerializer::generateMetaCharsetDeclaration(

211 String(param->textEncoding.name())));

212

213 param->haveAddedContentsBeforeEnd = true;

214 // Will search each META which has charset declaration, and skip them al l

215 // in PreActionBeforeSerializeOpenTag.

216 } else if (element->hasTagName(HTMLNames::scriptTag)

217 \|\| element->hasTagName(HTMLNames::styleTag)) {

218 param->isInScriptOrStyleTag = true;

219 }

220

221 return result.toString();

222 }

223

224 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(

225 const Element* element, SerializeDomParam* param, bool* needSkip)

226 {

227 String result;

228

229 *needSkip = false;

230 if (!param->isHTMLDocument)

231 return result;

232 // Skip the end tag of original META tag which declare charset.

233 // Need not to check whether it's META tag since we guarantee

234 // skipMetaElement is definitely META tag if it's not 0.

235 if (param->skipMetaElement == element)

236 *needSkip = true;

237 else if (element->hasTagName(HTMLNames::scriptTag)

238 \|\| element->hasTagName(HTMLNames::styleTag)) {

239 ASSERT(param->isInScriptOrStyleTag);

240 param->isInScriptOrStyleTag = false;

241 }

242

243 return result;

244 }

245

246 // After we finish serializing end tag of a element, we give the target

247 // element a chance to do some post work to add some additional data.

248 String WebPageSerializerImpl::postActionAfterSerializeEndTag(

249 const Element* element, SerializeDomParam* param)

250 {

251 StringBuilder result;

252

253 if (!param->isHTMLDocument)

254 return result.toString();

255 // Comment the BASE tag when serializing DOM.

256 if (element->hasTagName(HTMLNames::baseTag)) {

257 result.append("-->");

258 // Append a new base tag declaration.

259 result.append(WebPageSerializer::generateBaseTagDeclaration(

260 param->document->baseTarget()));

261 }

262

263 return result.toString();

264 }

265

266 void WebPageSerializerImpl::saveHTMLContentToBuffer(

267 const String& result, SerializeDomParam* param)

268 {

269 m_dataBuffer.append(result);

270 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,

271 param,

272 DoNotForceFlush);

273 }

274

275 void WebPageSerializerImpl::encodeAndFlushBuffer(

276 WebPageSerializerClient::PageSerializationStatus status,

277 SerializeDomParam* param,

278 FlushOption flushOption)

279 {

280 // Data buffer is not full nor do we want to force flush.

281 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity )

282 return;

283

284 String content = m_dataBuffer.toString();

285 m_dataBuffer.clear();

286

287 CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF ::EntitiesForUnencodables);

288

289 // Send result to the client.

290 m_client->didSerializeDataForFrame(param->url,

291 WebCString(encodedContent.data(), encoded Content.length()),

292 status);

293 }

294

295 void WebPageSerializerImpl::openTagToString(Element* element,

296 SerializeDomParam* param)

297 {

298 bool needSkip;

299 StringBuilder result;

300 // Do pre action for open tag.

301 result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));

302 if (needSkip)

303 return;

304 // Add open tag

305 result.append('<');

306 result.append(element->nodeName().lower());

307 // Go through all attributes and serialize them.

308 if (element->hasAttributes()) {

309 unsigned numAttrs = element->attributeCount();

310 for (unsigned i = 0; i < numAttrs; i++) {

311 result.append(' ');

312 // Add attribute pair

313 const Attribute *attribute = element->attributeItem(i);

314 result.append(attribute->name().toString());

315 result.appendLiteral("=\"");

316 if (!attribute->value().isEmpty()) {

317 const String& attrValue = attribute->value();

318

319 // Check whether we need to replace some resource links

320 // with local resource paths.

321 const QualifiedName& attrName = attribute->name();

322 if (elementHasLegalLinkAttribute(element, attrName)) {

323 // For links start with "javascript:", we do not change it.

324 if (attrValue.startsWith("javascript:", false))

325 result.append(attrValue);

326 else {

327 // Get the absolute link

328 WebFrameImpl* subFrame = WebFrameImpl::fromFrameOwnerEle ment(element);

329 String completeURL = subFrame ? subFrame->frame()->docum ent()->url() :

330 param->document->complet eURL(attrValue);

331 // Check whether we have local files for those link.

332 if (m_localLinks.contains(completeURL)) {

333 if (!param->directoryName.isEmpty()) {

334 result.appendLiteral("./");

335 result.append(param->directoryName);

336 result.append('/');

337 }

338 result.append(m_localLinks.get(completeURL));

339 } else

340 result.append(completeURL);

341 }

342 } else {

343 if (param->isHTMLDocument)

344 result.append(m_htmlEntities.convertEntitiesInString(att rValue));

345 else

346 result.append(m_xmlEntities.convertEntitiesInString(attr Value));

347 }

348 }

349 result.append('\"');

350 }

351 }

352

353 // Do post action for open tag.

354 String addedContents = postActionAfterSerializeOpenTag(element, param);

355 // Complete the open tag for element when it has child/children.

356 if (element->hasChildNodes() \|\| param->haveAddedContentsBeforeEnd)

357 result.append('>');

358 // Append the added contents generate in post action of open tag.

359 result.append(addedContents);

360 // Save the result to data buffer.

361 saveHTMLContentToBuffer(result.toString(), param);

362 }

363

364 // Serialize end tag of an specified element.

365 void WebPageSerializerImpl::endTagToString(Element* element,

366 SerializeDomParam* param)

367 {

368 bool needSkip;

369 StringBuilder result;

370 // Do pre action for end tag.

371 result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));

372 if (needSkip)

373 return;

374 // Write end tag when element has child/children.

375 if (element->hasChildNodes() \|\| param->haveAddedContentsBeforeEnd) {

376 result.appendLiteral("</");

377 result.append(element->nodeName().lower());

378 result.append('>');

379 } else {

380 // Check whether we have to write end tag for empty element.

381 if (param->isHTMLDocument) {

382 result.append('>');

383 // FIXME: This code is horribly wrong. WebPageSerializerImpl must d ie.

384 if (!element->isHTMLElement() \|\| !toHTMLElement(element)->ieForbidsI nsertHTML()) {

385 // We need to write end tag when it is required.

386 result.appendLiteral("</");

387 result.append(element->nodeName().lower());

388 result.append('>');

389 }

390 } else {

391 // For xml base document.

392 result.appendLiteral(" />");

393 }

394 }

395 // Do post action for end tag.

396 result.append(postActionAfterSerializeEndTag(element, param));

397 // Save the result to data buffer.

398 saveHTMLContentToBuffer(result.toString(), param);

399 }

400

401 void WebPageSerializerImpl::buildContentForNode(Node* node,

402 SerializeDomParam* param)

403 {

404 switch (node->nodeType()) {

405 case Node::ELEMENT_NODE:

406 // Process open tag of element.

407 openTagToString(toElement(node), param);

408 // Walk through the children nodes and process it.

409 for (Node *child = node->firstChild(); child; child = child->nextSibling ())

410 buildContentForNode(child, param);

411 // Process end tag of element.

412 endTagToString(toElement(node), param);

413 break;

414 case Node::TEXT_NODE:

415 saveHTMLContentToBuffer(createMarkup(node), param);

416 break;

417 case Node::ATTRIBUTE_NODE:

418 case Node::DOCUMENT_NODE:

419 case Node::DOCUMENT_FRAGMENT_NODE:

420 // Should not exist.

421 ASSERT_NOT_REACHED();

422 break;

423 // Document type node can be in DOM?

424 case Node::DOCUMENT_TYPE_NODE:

425 param->haveSeenDocType = true;

426 default:

427 // For other type node, call default action.

428 saveHTMLContentToBuffer(createMarkup(node), param);

429 break;

430 }

431 }

432

433 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,

434 bool recursiveSerialization,

435 WebPageSerializerClient* client,

436 const WebVector<WebURL>& links,

437 const WebVector<WebString>& localPa ths,

438 const WebString& localDirectoryName )

439 : m_client(client)

440 , m_recursiveSerialization(recursiveSerialization)

441 , m_framesCollected(false)

442 , m_localDirectoryName(localDirectoryName)

443 , m_htmlEntities(false)

444 , m_xmlEntities(true)

445 {

446 // Must specify available webframe.

447 ASSERT(frame);

448 m_specifiedWebFrameImpl = toWebFrameImpl(frame);

449 // Make sure we have non 0 client.

450 ASSERT(client);

451 // Build local resources map.

452 ASSERT(links.size() == localPaths.size());

453 for (size_t i = 0; i < links.size(); i++) {

454 KURL url = links[i];

455 ASSERT(!m_localLinks.contains(url.string()));

456 m_localLinks.set(url.string(), localPaths[i]);

457 }

458

459 ASSERT(m_dataBuffer.isEmpty());

460 }

461

462 void WebPageSerializerImpl::collectTargetFrames()

463 {

464 ASSERT(!m_framesCollected);

465 m_framesCollected = true;

466

467 // First, process main frame.

468 m_frames.append(m_specifiedWebFrameImpl);

469 // Return now if user only needs to serialize specified frame, not including

470 // all sub-frames.

471 if (!m_recursiveSerialization)

472 return;

473 // Collect all frames inside the specified frame.

474 for (int i = 0; i < static_cast<int>(m_frames.size()); ++i) {

475 WebFrameImpl* currentFrame = m_frames[i];

476 // Get current using document.

477 Document* currentDoc = currentFrame->frame()->document();

478 // Go through sub-frames.

479 RefPtr<HTMLCollection> all = currentDoc->all();

480

481 for (unsigned i = 0; Node* node = all->item(i); i++) {

482 if (!node->isHTMLElement())

483 continue;

484 Element* element = toElement(node);

485 WebFrameImpl* webFrame =

486 WebFrameImpl::fromFrameOwnerElement(element);

487 if (webFrame)

488 m_frames.append(webFrame);

489 }

490 }

491 }

492

493 bool WebPageSerializerImpl::serialize()

494 {

495 if (!m_framesCollected)

496 collectTargetFrames();

497

498 bool didSerialization = false;

499 KURL mainURL = m_specifiedWebFrameImpl->frame()->document()->url();

500

501 for (unsigned i = 0; i < m_frames.size(); ++i) {

502 WebFrameImpl* webFrame = m_frames[i];

503 Document* document = webFrame->frame()->document();

504 const KURL& url = document->url();

505

506 if (!url.isValid() \|\| !m_localLinks.contains(url.string()))

507 continue;

508

509 didSerialization = true;

510

511 const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();

512 String directoryName = url == mainURL ? m_localDirectoryName : "";

513

514 SerializeDomParam param(url, textEncoding, document, directoryName);

515

516 Element* documentElement = document->documentElement();

517 if (documentElement)

518 buildContentForNode(documentElement, &param);

519

520 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &p aram, ForceFlush);

521 }

522

523 ASSERT(m_dataBuffer.isEmpty());

524 m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSeriali zerClient::AllFramesAreFinished);

525 return didSerialization;

526 }

527

528 } // namespace blink

OLD	NEW

« Source/web/WebPageSerializer.cpp ('K') | « Source/web/WebPageSerializerImpl.h ('k') | Source/web/tests/MHTMLTest.cpp » ('j') | public/web/WebPageSerializerClient.h » ('J')