Source/web/WebPageSerializerImpl.cpp - Issue 1177733003: Merge page serializers [12/12]

Side by Side Diff: Source/web/WebPageSerializerImpl.cpp

Issue 1177733003: Merge page serializers [12/12] (Closed) Base URL: https://chromium.googlesource.com/chromium/blink.git@master

Patch Set: Rebase Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 /*

2 * Copyright (C) 2009 Google Inc. All rights reserved.

3 *

4 * Redistribution and use in source and binary forms, with or without

5 * modification, are permitted provided that the following conditions are

6 * met:

7 *

8 * * Redistributions of source code must retain the above copyright

9 * notice, this list of conditions and the following disclaimer.

10 * * Redistributions in binary form must reproduce the above

11 * copyright notice, this list of conditions and the following disclaimer

12 * in the documentation and/or other materials provided with the

13 * distribution.

14 * * Neither the name of Google Inc. nor the names of its

15 * contributors may be used to endorse or promote products derived from

16 * this software without specific prior written permission.

17 *

18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR

21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT

22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,

23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT

24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,

25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY

26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT

27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE

28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

29 */

30

31 // How we handle the base tag better.

32 // Current status:

33 // Currently, the normal way we handle the base tag is:

34 // a) For those links which have corresponding local saved files, such as

35 // savable CSS, JavaScript files, they will be written to relative URLs which

36 // point to local saved file. Why those links can not be resolved as absolute

37 // file URLs, because if they are resolved as absolute URLs, after moving the

38 // file location from one directory to another directory, the file URLs will

39 // be dead links.

40 // b) For those links which have not corresponding local saved files, such as

41 // links in A, AREA tags, they will be resolved as absolute URLs.

42 // c) We comment out all base tags when serializing DOM for the page.

43 // FireFox also uses above way to handle base tag.

44 //

45 // Problem:

46 // This way can not handle the following situation:

47 // the base tag is written by JavaScript.

48 // For example. The page "www.yahoo.com" use

49 // "document.write('<base href="http://www.yahoo.com/"...');" to setup base URL

50 // of page when loading page. So when saving page as completed-HTML, we assume

51 // that we save "www.yahoo.com" to "c:\yahoo.htm". After then we load the saved

52 // completed-HTML page, then the JavaScript will insert a base tag

53 // <base href="http://www.yahoo.com/"...> to DOM, so all URLs which point to

54 // local saved resource files will be resolved as

55 // "http://www.yahoo.com/yahoo_files/...", which will cause all saved resource

56 // files can not be loaded correctly. Also the page will be rendered ugly since

57 // all saved sub-resource files (such as CSS, JavaScript files) and sub-frame

58 // files can not be fetched.

59 // Now FireFox, IE and WebKit based Browser all have this problem.

60 //

61 // Solution:

62 // My solution is that we comment old base tag and write new base tag:

63 // <base href="." ...> after the previous commented base tag. In WebKit, it

64 // always uses the latest "href" attribute of base tag to set document's base

65 // URL. Based on this behavior, when we encounter a base tag, we comment it and

66 // write a new base tag <base href="."> after the previous commented base tag.

67 // The new added base tag can help engine to locate correct base URL for

68 // correctly loading local saved resource files. Also I think we need to inherit

69 // the base target value from document object when appending new base tag.

70 // If there are multiple base tags in original document, we will comment all old

71 // base tags and append new base tag after each old base tag because we do not

72 // know those old base tags are original content or added by JavaScript. If

73 // they are added by JavaScript, it means when loading saved page, the script(s)

74 // will still insert base tag(s) to DOM, so the new added base tag(s) can

75 // override the incorrect base URL and make sure we always load correct local

76 // saved resource files.

77

78 #include "config.h"

79 #include "web/WebPageSerializerImpl.h"

80

81 #include "core/HTMLNames.h"

82 #include "core/dom/Document.h"

83 #include "core/dom/DocumentType.h"

84 #include "core/dom/Element.h"

85 #include "core/editing/markup.h"

86 #include "core/html/HTMLAllCollection.h"

87 #include "core/html/HTMLElement.h"

88 #include "core/html/HTMLFormElement.h"

89 #include "core/html/HTMLHtmlElement.h"

90 #include "core/html/HTMLMetaElement.h"

91 #include "core/loader/DocumentLoader.h"

92 #include "core/loader/FrameLoader.h"

93 #include "public/platform/WebVector.h"

94 #include "web/WebLocalFrameImpl.h"

95 #include "wtf/text/TextEncoding.h"

96

97 namespace blink {

98

// Soft upper bound (64 KiB worth of characters) on the buffer that temporarily
// holds generated HTML before it is flushed. The buffer may grow past this
// limit when a single very large contiguous string is appended to it.
static const unsigned dataBufferCapacity = 64 * 1024;

103

104 WebPageSerializerImpl::SerializeDomParam::SerializeDomParam(const KURL& url,

105 const WTF::TextEncod ing& textEncoding,

106 Document* document,

107 const String& direct oryName)

108 : url(url)

109 , textEncoding(textEncoding)

110 , document(document)

111 , directoryName(directoryName)

112 , isHTMLDocument(document->isHTMLDocument())

113 , haveSeenDocType(false)

114 , haveAddedCharsetDeclaration(false)

115 , skipMetaElement(nullptr)

116 , isInScriptOrStyleTag(false)

117 , haveAddedXMLProcessingDirective(false)

118 , haveAddedContentsBeforeEnd(false)

119 {

120 }

121

122 String WebPageSerializerImpl::preActionBeforeSerializeOpenTag(

123 const Element* element, SerializeDomParam* param, bool* needSkip)

124 {

125 StringBuilder result;

126

127 *needSkip = false;

128 if (param->isHTMLDocument) {

129 // Skip the open tag of original META tag which declare charset since we

130 // have overrided the META which have correct charset declaration after

131 // serializing open tag of HEAD element.

132 ASSERT(element);

133 if (isHTMLMetaElement(*element)) {

134 const HTMLMetaElement& meta = toHTMLMetaElement(*element);

135 // Check whether the META tag has declared charset or not.

136 String equiv = meta.httpEquiv();

137 if (equalIgnoringCase(equiv, "content-type")) {

138 String content = meta.content();

139 if (content.length() && content.contains("charset", TextCaseInse nsitive)) {

140 // Find META tag declared charset, we need to skip it when

141 // serializing DOM.

142 param->skipMetaElement = element;

143 *needSkip = true;

144 }

145 }

146 } else if (isHTMLHtmlElement(*element)) {

147 // Check something before processing the open tag of HEAD element.

148 // First we add doc type declaration if original document has it.

149 if (!param->haveSeenDocType) {

150 param->haveSeenDocType = true;

151 result.append(createMarkup(param->document->doctype()));

152 }

153

154 // Add MOTW declaration before html tag.

155 // See http://msdn2.microsoft.com/en-us/library/ms537628(VS.85).aspx .

156 result.append(WebPageSerializer::generateMarkOfTheWebDeclaration(par am->url));

157 } else if (isHTMLBaseElement(*element)) {

158 // Comment the BASE tag when serializing dom.

159 result.appendLiteral("<!--");

160 }

161 } else {

162 // Write XML declaration.

163 if (!param->haveAddedXMLProcessingDirective) {

164 param->haveAddedXMLProcessingDirective = true;

165 // Get encoding info.

166 String xmlEncoding = param->document->xmlEncoding();

167 if (xmlEncoding.isEmpty())

168 xmlEncoding = param->document->encodingName();

169 if (xmlEncoding.isEmpty())

170 xmlEncoding = UTF8Encoding().name();

171 result.appendLiteral("<?xml version=\"");

172 result.append(param->document->xmlVersion());

173 result.appendLiteral("\" encoding=\"");

174 result.append(xmlEncoding);

175 if (param->document->xmlStandalone())

176 result.appendLiteral("\" standalone=\"yes");

177 result.appendLiteral("\"?>\n");

178 }

179 // Add doc type declaration if original document has it.

180 if (!param->haveSeenDocType) {

181 param->haveSeenDocType = true;

182 result.append(createMarkup(param->document->doctype()));

183 }

184 }

185 return result.toString();

186 }

187

188 String WebPageSerializerImpl::postActionAfterSerializeOpenTag(

189 const Element* element, SerializeDomParam* param)

190 {

191 StringBuilder result;

192

193 param->haveAddedContentsBeforeEnd = false;

194 if (!param->isHTMLDocument)

195 return result.toString();

196 // Check after processing the open tag of HEAD element

197 if (!param->haveAddedCharsetDeclaration

198 && isHTMLHeadElement(*element)) {

199 param->haveAddedCharsetDeclaration = true;

200 // Check meta element. WebKit only pre-parse the first 512 bytes

201 // of the document. If the whole <HEAD> is larger and meta is the

202 // end of head part, then this kind of pages aren't decoded correctly

203 // because of this issue. So when we serialize the DOM, we need to

204 // make sure the meta will in first child of head tag.

205 // See http://bugs.webkit.org/show_bug.cgi?id=16621.

206 // First we generate new content for writing correct META element.

207 result.append(WebPageSerializer::generateMetaCharsetDeclaration(

208 String(param->textEncoding.name())));

209

210 param->haveAddedContentsBeforeEnd = true;

211 // Will search each META which has charset declaration, and skip them al l

212 // in PreActionBeforeSerializeOpenTag.

213 } else if (isHTMLScriptElement(element) \|\| isHTMLScriptElement(element)) {

214 param->isInScriptOrStyleTag = true;

215 }

216

217 return result.toString();

218 }

219

220 String WebPageSerializerImpl::preActionBeforeSerializeEndTag(

221 const Element* element, SerializeDomParam* param, bool* needSkip)

222 {

223 String result;

224

225 *needSkip = false;

226 if (!param->isHTMLDocument)

227 return result;

228 // Skip the end tag of original META tag which declare charset.

229 // Need not to check whether it's META tag since we guarantee

230 // skipMetaElement is definitely META tag if it's not 0.

231 if (param->skipMetaElement == element) {

232 *needSkip = true;

233 } else if (isHTMLScriptElement(element) \|\| isHTMLScriptElement(element)) {

234 ASSERT(param->isInScriptOrStyleTag);

235 param->isInScriptOrStyleTag = false;

236 }

237

238 return result;

239 }

240

241 // After we finish serializing end tag of a element, we give the target

242 // element a chance to do some post work to add some additional data.

243 String WebPageSerializerImpl::postActionAfterSerializeEndTag(

244 const Element* element, SerializeDomParam* param)

245 {

246 StringBuilder result;

247

248 if (!param->isHTMLDocument)

249 return result.toString();

250 // Comment the BASE tag when serializing DOM.

251 if (isHTMLBaseElement(*element)) {

252 result.appendLiteral("-->");

253 // Append a new base tag declaration.

254 result.append(WebPageSerializer::generateBaseTagDeclaration(

255 param->document->baseTarget()));

256 }

257

258 return result.toString();

259 }

260

261 void WebPageSerializerImpl::saveHTMLContentToBuffer(

262 const String& result, SerializeDomParam* param)

263 {

264 m_dataBuffer.append(result);

265 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsNotFinished,

266 param,

267 DoNotForceFlush);

268 }

269

270 void WebPageSerializerImpl::encodeAndFlushBuffer(

271 WebPageSerializerClient::PageSerializationStatus status,

272 SerializeDomParam* param,

273 FlushOption flushOption)

274 {

275 // Data buffer is not full nor do we want to force flush.

276 if (flushOption != ForceFlush && m_dataBuffer.length() <= dataBufferCapacity )

277 return;

278

279 String content = m_dataBuffer.toString();

280 m_dataBuffer.clear();

281

282 CString encodedContent = param->textEncoding.normalizeAndEncode(content, WTF ::EntitiesForUnencodables);

283

284 // Send result to the client.

285 m_client->didSerializeDataForFrame(param->url,

286 WebCString(encodedContent.data(), encoded Content.length()),

287 status);

288 }

289

290 void WebPageSerializerImpl::openTagToString(Element* element,

291 SerializeDomParam* param)

292 {

293 bool needSkip;

294 StringBuilder result;

295 // Do pre action for open tag.

296 result.append(preActionBeforeSerializeOpenTag(element, param, &needSkip));

297 if (needSkip)

298 return;

299 // Add open tag

300 result.append('<');

301 result.append(element->nodeName().lower());

302 // Go through all attributes and serialize them.

303 AttributeCollection attributes = element->attributes();

304 AttributeCollection::iterator end = attributes.end();

305 for (AttributeCollection::iterator it = attributes.begin(); it != end; ++it) {

306 result.append(' ');

307 // Add attribute pair

308 result.append(it->name().toString());

309 result.appendLiteral("=\"");

310 if (!it->value().isEmpty()) {

311 const String& attrValue = it->value();

312

313 // Check whether we need to replace some resource links

314 // with local resource paths.

315 const QualifiedName& attrName = it->name();

316 if (element->hasLegalLinkAttribute(attrName)) {

317 // For links start with "javascript:", we do not change it.

318 if (attrValue.startsWith("javascript:", TextCaseInsensitive)) {

319 result.append(attrValue);

320 } else {

321 // Get the absolute link

322 WebLocalFrameImpl* subFrame = WebLocalFrameImpl::fromFrameOw nerElement(element);

323 String completeURL = subFrame ? subFrame->frame()->document( )->url() :

324 param->document->completeURL (attrValue);

325 // Check whether we have local files for those link.

326 if (m_localLinks.contains(completeURL)) {

327 if (!param->directoryName.isEmpty()) {

328 result.appendLiteral("./");

329 result.append(param->directoryName);

330 result.append('/');

331 }

332 result.append(m_localLinks.get(completeURL));

333 } else {

334 result.append(completeURL);

335 }

336 }

337 } else {

338 if (param->isHTMLDocument)

339 result.append(m_htmlEntities.convertEntitiesInString(attrVal ue));

340 else

341 result.append(m_xmlEntities.convertEntitiesInString(attrValu e));

342 }

343 }

344 result.append('\"');

345 }

346

347 // Do post action for open tag.

348 String addedContents = postActionAfterSerializeOpenTag(element, param);

349 // Complete the open tag for element when it has child/children.

350 if (element->hasChildren() \|\| param->haveAddedContentsBeforeEnd)

351 result.append('>');

352 // Append the added contents generate in post action of open tag.

353 result.append(addedContents);

354 // Save the result to data buffer.

355 saveHTMLContentToBuffer(result.toString(), param);

356 }

357

358 // Serialize end tag of an specified element.

359 void WebPageSerializerImpl::endTagToString(Element* element,

360 SerializeDomParam* param)

361 {

362 bool needSkip;

363 StringBuilder result;

364 // Do pre action for end tag.

365 result.append(preActionBeforeSerializeEndTag(element, param, &needSkip));

366 if (needSkip)

367 return;

368 // Write end tag when element has child/children.

369 if (element->hasChildren() \|\| param->haveAddedContentsBeforeEnd) {

370 result.appendLiteral("</");

371 result.append(element->nodeName().lower());

372 result.append('>');

373 } else {

374 // Check whether we have to write end tag for empty element.

375 if (param->isHTMLDocument) {

376 result.append('>');

377 // FIXME: This code is horribly wrong. WebPageSerializerImpl must d ie.

378 if (!element->isHTMLElement() \|\| !toHTMLElement(element)->ieForbidsI nsertHTML()) {

379 // We need to write end tag when it is required.

380 result.appendLiteral("</");

381 result.append(element->nodeName().lower());

382 result.append('>');

383 }

384 } else {

385 // For xml base document.

386 result.appendLiteral(" />");

387 }

388 }

389 // Do post action for end tag.

390 result.append(postActionAfterSerializeEndTag(element, param));

391 // Save the result to data buffer.

392 saveHTMLContentToBuffer(result.toString(), param);

393 }

394

395 void WebPageSerializerImpl::buildContentForNode(Node* node,

396 SerializeDomParam* param)

397 {

398 switch (node->nodeType()) {

399 case Node::ELEMENT_NODE:

400 // Process open tag of element.

401 openTagToString(toElement(node), param);

402 // Walk through the children nodes and process it.

403 for (Node *child = node->firstChild(); child; child = child->nextSibling ())

404 buildContentForNode(child, param);

405 // Process end tag of element.

406 endTagToString(toElement(node), param);

407 break;

408 case Node::TEXT_NODE:

409 saveHTMLContentToBuffer(createMarkup(node), param);

410 break;

411 case Node::ATTRIBUTE_NODE:

412 case Node::DOCUMENT_NODE:

413 case Node::DOCUMENT_FRAGMENT_NODE:

414 // Should not exist.

415 ASSERT_NOT_REACHED();

416 break;

417 // Document type node can be in DOM?

418 case Node::DOCUMENT_TYPE_NODE:

419 param->haveSeenDocType = true;

420 default:

421 // For other type node, call default action.

422 saveHTMLContentToBuffer(createMarkup(node), param);

423 break;

424 }

425 }

426

427 WebPageSerializerImpl::WebPageSerializerImpl(WebFrame* frame,

428 bool recursiveSerialization,

429 WebPageSerializerClient* client,

430 const WebVector<WebURL>& links,

431 const WebVector<WebString>& localPa ths,

432 const WebString& localDirectoryName )

433 : m_client(client)

434 , m_recursiveSerialization(recursiveSerialization)

435 , m_framesCollected(false)

436 , m_localDirectoryName(localDirectoryName)

437 , m_htmlEntities(false)

438 , m_xmlEntities(true)

439 {

440 // Must specify available webframe.

441 ASSERT(frame);

442 m_specifiedWebLocalFrameImpl = toWebLocalFrameImpl(frame);

443 // Make sure we have non 0 client.

444 ASSERT(client);

445 // Build local resources map.

446 ASSERT(links.size() == localPaths.size());

447 for (size_t i = 0; i < links.size(); i++) {

448 KURL url = links[i];

449 ASSERT(!m_localLinks.contains(url.string()));

450 m_localLinks.set(url.string(), localPaths[i]);

451 }

452

453 ASSERT(m_dataBuffer.isEmpty());

454 }

455

456 void WebPageSerializerImpl::collectTargetFrames()

457 {

458 ASSERT(!m_framesCollected);

459 m_framesCollected = true;

460

461 // First, process main frame.

462 m_frames.append(m_specifiedWebLocalFrameImpl);

463 // Return now if user only needs to serialize specified frame, not including

464 // all sub-frames.

465 if (!m_recursiveSerialization)

466 return;

467 // Collect all frames inside the specified frame.

468 for (WebLocalFrameImpl* frame : m_frames) {

469 // Get current using document.

470 Document* currentDoc = frame->frame()->document();

471 // Go through sub-frames.

472 RefPtrWillBeRawPtr<HTMLAllCollection> all = currentDoc->all();

473

474 for (unsigned i = 0; Element* element = all->item(i); ++i) {

475 if (!element->isHTMLElement())

476 continue;

477 WebLocalFrameImpl* webFrame =

478 WebLocalFrameImpl::fromFrameOwnerElement(element);

479 if (webFrame)

480 m_frames.append(webFrame);

481 }

482 }

483 }

484

485 bool WebPageSerializerImpl::serialize()

486 {

487 if (!m_framesCollected)

488 collectTargetFrames();

489

490 bool didSerialization = false;

491 KURL mainURL = m_specifiedWebLocalFrameImpl->frame()->document()->url();

492

493 for (unsigned i = 0; i < m_frames.size(); ++i) {

494 WebLocalFrameImpl* webFrame = m_frames[i];

495 Document* document = webFrame->frame()->document();

496 const KURL& url = document->url();

497

498 if (!url.isValid() \|\| !m_localLinks.contains(url.string()))

499 continue;

500

501 didSerialization = true;

502

503 const WTF::TextEncoding& textEncoding = document->encoding().isValid() ? document->encoding() : UTF8Encoding();

504 String directoryName = url == mainURL ? m_localDirectoryName : "";

505

506 SerializeDomParam param(url, textEncoding, document, directoryName);

507

508 Element* documentElement = document->documentElement();

509 if (documentElement)

510 buildContentForNode(documentElement, &param);

511

512 encodeAndFlushBuffer(WebPageSerializerClient::CurrentFrameIsFinished, &p aram, ForceFlush);

513 }

514

515 ASSERT(m_dataBuffer.isEmpty());

516 m_client->didSerializeDataForFrame(KURL(), WebCString("", 0), WebPageSeriali zerClient::AllFramesAreFinished);

517 return didSerialization;

518 }

519

520 } // namespace blink

OLD	NEW

« no previous file with comments | « Source/web/WebPageSerializerImpl.h ('k') | Source/web/tests/WebPageSerializerTest.cpp » ('j') | no next file with comments »