OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package com.dom_distiller.client; | 5 package com.dom_distiller.client; |
6 | 6 |
| 7 import com.dom_distiller.proto.DomDistillerProtos; |
| 8 import com.dom_distiller.proto.DomDistillerProtos.TimingInfo; |
7 import com.google.gwt.dom.client.AnchorElement; | 9 import com.google.gwt.dom.client.AnchorElement; |
8 import com.google.gwt.dom.client.Document; | 10 import com.google.gwt.dom.client.Document; |
9 import com.google.gwt.dom.client.Element; | 11 import com.google.gwt.dom.client.Element; |
10 import com.google.gwt.dom.client.Node; | 12 import com.google.gwt.dom.client.Node; |
11 import com.google.gwt.dom.client.NodeList; | 13 import com.google.gwt.dom.client.NodeList; |
12 import com.google.gwt.dom.client.VideoElement; | 14 import com.google.gwt.dom.client.VideoElement; |
13 | 15 |
14 import de.l3s.boilerpipe.BoilerpipeProcessingException; | 16 import de.l3s.boilerpipe.BoilerpipeProcessingException; |
15 import de.l3s.boilerpipe.document.TextBlock; | 17 import de.l3s.boilerpipe.document.TextBlock; |
16 import de.l3s.boilerpipe.document.TextDocument; | 18 import de.l3s.boilerpipe.document.TextDocument; |
17 import de.l3s.boilerpipe.extractors.CommonExtractors; | 19 import de.l3s.boilerpipe.extractors.CommonExtractors; |
18 import de.l3s.boilerpipe.labels.DefaultLabels; | 20 import de.l3s.boilerpipe.labels.DefaultLabels; |
19 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; | 21 import de.l3s.boilerpipe.sax.BoilerpipeHTMLContentHandler; |
20 | 22 |
21 import java.util.ArrayList; | 23 import java.util.ArrayList; |
22 import java.util.LinkedList; | 24 import java.util.LinkedList; |
23 import java.util.List; | 25 import java.util.List; |
24 import java.util.logging.Logger; | |
25 | 26 |
26 public class ContentExtractor { | 27 public class ContentExtractor { |
27 static Logger logger = Logger.getLogger("DomDistiller"); | 28 private final Element documentElement; |
28 | 29 |
29 private final Element documentElement; | 30 private final List<String> candidateTitles; |
| 31 |
| 32 private final TimingInfo mTimingInfo; |
30 | 33 |
31 private final MarkupParser parser; | 34 private final MarkupParser parser; |
32 | 35 |
33 private final List<String> candidateTitles; | |
34 | |
35 public ContentExtractor(Element root) { | 36 public ContentExtractor(Element root) { |
36 this.documentElement = root; | 37 this.documentElement = root; |
| 38 this.candidateTitles = new LinkedList<String>(); |
| 39 this.mTimingInfo = DomDistillerProtos.TimingInfo.create(); |
| 40 |
| 41 double startTime = DomUtil.getTime(); |
37 this.parser = new MarkupParser(root); | 42 this.parser = new MarkupParser(root); |
38 this.candidateTitles = new LinkedList<String>(); | 43 this.mTimingInfo.setMarkupParsingTime(DomUtil.getTime() - startTime); |
39 } | 44 } |
40 | 45 |
41 // Grabs a list of candidate titles in descending priority order: | 46 // Grabs a list of candidate titles in descending priority order: |
42 // 1) meta-information | 47 // 1) meta-information |
43 // 2) The document's title element, modified based on some readability heuri
stics | 48 // 2) The document's title element, modified based on some readability heuri
stics |
44 // 3) The document's title element, if it's a string | 49 // 3) The document's title element, if it's a string |
45 private void ensureTitleInitialized() { | 50 private void ensureTitleInitialized() { |
46 if (candidateTitles.size() > 0) return; | 51 if (candidateTitles.size() > 0) return; |
47 | 52 |
48 String title = parser.getTitle(); | 53 String title = parser.getTitle(); |
(...skipping 13 matching lines...) Expand all Loading... |
62 ensureTitleInitialized(); | 67 ensureTitleInitialized(); |
63 assert candidateTitles.size() > 0; | 68 assert candidateTitles.size() > 0; |
64 return candidateTitles.get(0); | 69 return candidateTitles.get(0); |
65 } | 70 } |
66 | 71 |
67 public String extractContent() { | 72 public String extractContent() { |
68 return extractContent(false); | 73 return extractContent(false); |
69 } | 74 } |
70 | 75 |
71 public String extractContent(boolean textOnly) { | 76 public String extractContent(boolean textOnly) { |
| 77 double now = DomUtil.getTime(); |
| 78 TextDocument document = createTextBlocksFromPage(); |
| 79 mTimingInfo.setDocumentConstructionTime(DomUtil.getTime() - now); |
| 80 |
| 81 now = DomUtil.getTime(); |
| 82 List<Node> contentNodes = processTextBlocks(document); |
| 83 mTimingInfo.setArticleProcessingTime(DomUtil.getTime() - now); |
| 84 |
| 85 if (contentNodes.isEmpty()) { |
| 86 return ""; |
| 87 } |
| 88 |
| 89 now = DomUtil.getTime(); |
| 90 String html = formatExtractedNodes(textOnly, contentNodes); |
| 91 mTimingInfo.setFormattingTime(DomUtil.getTime() - now); |
| 92 return html; |
| 93 } |
| 94 |
| 95 |
| 96 /** |
| 97 * Returns timing information about the most recent extraction run. |
| 98 * @return an instance of DomDistillerProtos.TimingInfo with detailed timing
statistics. |
| 99 */ |
| 100 public TimingInfo getTimingInfo() { |
| 101 return mTimingInfo; |
| 102 } |
| 103 |
| 104 /** |
| 105 * Converts the original HTML page into a series of TextBlock for analysis. |
| 106 * @return a document with the list of extracted TextBlocks and additional i
nformation |
| 107 * that can be useful for identifying the core elements of the page. |
| 108 */ |
| 109 private TextDocument createTextBlocksFromPage() { |
72 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl
er(); | 110 BoilerpipeHTMLContentHandler htmlParser = new BoilerpipeHTMLContentHandl
er(); |
73 | |
74 htmlParser.startDocument(); | 111 htmlParser.startDocument(); |
75 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(htmlParser); | 112 DomToSaxVisitor domToSaxVisitor = new DomToSaxVisitor(htmlParser); |
76 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS
axVisitor); | 113 FilteringDomVisitor filteringDomVisitor = new FilteringDomVisitor(domToS
axVisitor); |
77 new DomWalker(filteringDomVisitor).walk(documentElement); | 114 new DomWalker(filteringDomVisitor).walk(documentElement); |
78 htmlParser.endDocument(); | 115 htmlParser.endDocument(); |
79 | |
80 TextDocument document = htmlParser.toTextDocument(); | 116 TextDocument document = htmlParser.toTextDocument(); |
81 ensureTitleInitialized(); | 117 ensureTitleInitialized(); |
| 118 document.setCandidateTitles(candidateTitles); |
| 119 document.setHiddenElements(filteringDomVisitor.getHiddenElements()); |
| 120 document.setDataTables(filteringDomVisitor.getDataTables()); |
| 121 return document; |
| 122 } |
82 | 123 |
83 document.setCanddiateTitles(candidateTitles); | 124 /** |
| 125 * Implements the actual analysis of the page content, identifying the core
elements of the |
| 126 * page. |
| 127 * @param document the TextBlock representation of the page extracted from t
he DOM. |
| 128 * @return a list of DOM nodes from the original document that contain the c
ore elements of the |
| 129 * page. |
| 130 */ |
| 131 private List<Node> processTextBlocks(TextDocument document) { |
84 try { | 132 try { |
85 CommonExtractors.ARTICLE_EXTRACTOR.process(document); | 133 CommonExtractors.ARTICLE_EXTRACTOR.process(document); |
86 } catch (BoilerpipeProcessingException e) { | 134 } catch (BoilerpipeProcessingException e) { |
87 logger.warning("Processing failed."); | 135 LogUtil.logToConsole("DomDistiller Processing failed: " + e); |
88 return ""; | 136 return new LinkedList<Node>(); |
89 } | 137 } |
90 | 138 |
91 | |
92 List<Node> contentNodes = getContentNodesForTextDocument(document); | 139 List<Node> contentNodes = getContentNodesForTextDocument(document); |
93 | 140 |
94 List<Node> contentAndRelevantElements = RelevantElementsFinder.findAndAd
dElements( | 141 List<Node> contentAndRelevantElements = RelevantElementsFinder.findAndAd
dElements( |
95 contentNodes, filteringDomVisitor.getHiddenElements(), | 142 contentNodes, document.getHiddenElements(), |
96 filteringDomVisitor.getDataTables(), Document.get().getDocumentE
lement()); | 143 document.getDataTables(), Document.get().getDocumentElement()); |
| 144 return contentAndRelevantElements; |
| 145 } |
97 | 146 |
98 if (contentAndRelevantElements.isEmpty()) { | 147 /** |
99 return ""; | 148 * Creates a new minimal HTML document containing copies of the DOM nodes id
entified as the |
100 } | 149 * core elements of the page. Some additional re-formatting hints may be inc
luded in the new |
101 | 150 * document. |
102 Node clonedSubtree = NodeListExpander.expand(contentAndRelevantElements)
.cloneSubtree(); | 151 * |
103 | 152 * @param textOnly indicates whether to simply return the aggregated text co
ntent instead of |
| 153 * HTML |
| 154 * @param contentNodes the DOM nodes containing text to be included in the f
inal document. |
| 155 * @return An HTML or text document which includes the aggregated content of
the provided HTML |
| 156 * nodes. |
| 157 */ |
| 158 private String formatExtractedNodes(boolean textOnly, List<Node> contentNode
s) { |
| 159 Node clonedSubtree = NodeListExpander.expand(contentNodes).cloneSubtree(
); |
104 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) { | 160 if (clonedSubtree.getNodeType() != Node.ELEMENT_NODE) { |
105 return ""; | 161 return ""; |
106 } | 162 } |
107 | 163 |
108 // The base URL in the distilled page viewer is different from that in | 164 // The base URL in the distilled page viewer is different from that in |
109 // the live page. This breaks all relative links (in anchors, | 165 // the live page. This breaks all relative links (in anchors, |
110 // images, etc.), so make them absolute in the distilled content. | 166 // images, etc.), so make them absolute in the distilled content. |
111 makeAllLinksAbsolute(clonedSubtree); | 167 makeAllLinksAbsolute(clonedSubtree); |
112 | |
113 if (textOnly) { | 168 if (textOnly) { |
114 return getTextFromTree(clonedSubtree); | 169 return getTextFromTree(clonedSubtree); |
115 } | 170 } |
116 | 171 |
117 // TODO(cjhopman): this discards the top element and just returns its ch
ildren. This might | 172 // TODO(cjhopman): this discards the top element and just returns its ch
ildren. This might |
118 // break in some cases. | 173 // break in some cases. |
119 return Element.as(clonedSubtree).getInnerHTML(); | 174 return Element.as(clonedSubtree).getInnerHTML(); |
120 } | 175 } |
121 | 176 |
122 /** | 177 /** |
(...skipping 71 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
194 return; | 249 return; |
195 } | 250 } |
196 var elementsWithSrc = root.querySelectorAll('img,source,track,video'); | 251 var elementsWithSrc = root.querySelectorAll('img,source,track,video'); |
197 for (var key in elementsWithSrc) { | 252 for (var key in elementsWithSrc) { |
198 if (elementsWithSrc[key].src) { | 253 if (elementsWithSrc[key].src) { |
199 elementsWithSrc[key].src = elementsWithSrc[key].src; | 254 elementsWithSrc[key].src = elementsWithSrc[key].src; |
200 } | 255 } |
201 } | 256 } |
202 }-*/; | 257 }-*/; |
203 } | 258 } |
OLD | NEW |