Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1316)

Unified Diff: boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java

Issue 296113004: Start using computed style instead of default tag actions. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Added tests Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
diff --git a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
index 26735dc9f523e9359b96b090e7e13c259a8f9320..f885ff7dbab074c4908746470f7e274915155c93 100644
--- a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
+++ b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java
@@ -21,7 +21,10 @@
*/
package de.l3s.boilerpipe.sax;
+import com.dom_distiller.client.DomUtil;
import com.dom_distiller.client.StringUtil;
+import com.google.gwt.dom.client.Element;
+import com.google.gwt.dom.client.Style;
import de.l3s.boilerpipe.document.TextBlock;
import de.l3s.boilerpipe.document.TextDocument;
@@ -32,6 +35,7 @@ import com.dom_distiller.client.sax.Attributes;
import com.dom_distiller.client.sax.ContentHandler;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
@@ -82,6 +86,14 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler {
LinkedList<Integer> fontSizeStack = new LinkedList<Integer>();
/**
+ * Caches the computed style of each element so it is only looked up once per element.
Yaron 2014/05/29 01:09:10 Is this true? I don't see that
nyquist 2014/05/29 23:42:25 I'm a liar who does not update comments after real
+ * Keys are strongly referenced (plain HashMap), so entries are not removed automatically.
+ */
+ private final Map<Element, Style> computedStyleCache = new HashMap<Element, Style>();
+
+ private final Map<String, TagAction> displayStyleToTagAction = new HashMap<String, TagAction>();
+
+ /**
* Recycles this instance.
*/
public void recycle() {
@@ -125,6 +137,27 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler {
*/
public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) {
this.tagActions = tagActions;
+ setupDisplayToTagActionMapping();
+ }
+
+ private void setupDisplayToTagActionMapping() {
+ displayStyleToTagAction.put("inline", CommonTagActions.TA_INLINE_NO_WHITESPACE);
Yaron 2014/05/29 01:09:10 Can you reference a source for these?
nyquist 2014/05/29 23:42:25 Done. Also reordered and clarified with multiple s
+ displayStyleToTagAction.put("block", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("inline-block", CommonTagActions.TA_INLINE_BLOCK_LEVEL);
+ displayStyleToTagAction.put("block", CommonTagActions.TA_BLOCK_LEVEL);
Yaron 2014/05/29 01:09:10 nit: dup
nyquist 2014/05/29 23:42:25 Done.
+ displayStyleToTagAction.put("table", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("table-caption", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("table-column-group", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("table-header-group", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("table-footer-group", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("table-row-group", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("table-cell", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("table-column", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("table-row", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("flex", CommonTagActions.TA_BLOCK_LEVEL);
+ displayStyleToTagAction.put("inline-flex", CommonTagActions.TA_INLINE_BLOCK_LEVEL);
+ displayStyleToTagAction.put("inline-table", CommonTagActions.TA_INLINE_BLOCK_LEVEL);
+ displayStyleToTagAction.put("list-item", CommonTagActions.TA_BLOCK_LEVEL);
Yaron 2014/05/29 01:09:10 So display:none is handled elsewhere... Logically
nyquist 2014/05/29 23:42:25 Added TODO.
}
@Override
@@ -146,10 +179,15 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler {
}
@Override
- public void startElement(String uri, String localName, String qName, Attributes atts) {
+ public void startElement(String uri, String localName, String qName,
+ Element element, Attributes atts) {
labelStacks.add(null);
- TagAction ta = tagActions.get(localName);
+ TagAction ta = getComputedTagAction(element);
+ if (tagActions.containsKey(localName)) {
+ ta = tagActions.get(localName);
+ }
+
if (ta != null) {
if(ta.changesTagLevel()) {
tagLevel++;
@@ -164,9 +202,29 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler {
lastStartTag = localName;
}
+ // Resolves the tag action for an element from its computed style. The style is
+ // fetched through DomUtil once per element and then served from computedStyleCache.
+ private TagAction getComputedTagAction(Element element) {
+ if (!computedStyleCache.containsKey(element)) {
+ computedStyleCache.put(element, DomUtil.getComputedStyle(element));
+ }
+ return getComputedTagAction(computedStyleCache.get(element));
+ }
+
+ private TagAction getComputedTagAction(Style style) {
+ // Map.get yields null for a display value that has no mapping.
+ final String display = style.getDisplay();
+ final TagAction mapped = displayStyleToTagAction.get(display);
+ return mapped;
+ }
+
@Override
- public void endElement(String uri, String localName, String qName) {
- TagAction ta = tagActions.get(localName);
+ public void endElement(String uri, String localName, String qName, Element element) {
+ TagAction ta = getComputedTagAction(element);
+ if (tagActions.containsKey(localName)) {
+ ta = tagActions.get(localName);
+ }
+
if (ta != null) {
flush = ta.end(this, localName, qName) | flush;
} else {

Powered by Google App Engine
This is Rietveld 408576698