Index: boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java |
diff --git a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java |
index 26735dc9f523e9359b96b090e7e13c259a8f9320..a7828c5b70ac146da0879b44ec58a80f49dfda09 100644 |
--- a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java |
+++ b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java |
@@ -21,7 +21,10 @@ |
*/ |
package de.l3s.boilerpipe.sax; |
+import com.dom_distiller.client.DomUtil; |
import com.dom_distiller.client.StringUtil; |
+import com.google.gwt.dom.client.Element; |
+import com.google.gwt.dom.client.Style; |
import de.l3s.boilerpipe.document.TextBlock; |
import de.l3s.boilerpipe.document.TextDocument; |
@@ -32,6 +35,7 @@ import com.dom_distiller.client.sax.Attributes; |
import com.dom_distiller.client.sax.ContentHandler; |
import java.util.ArrayList; |
+import java.util.HashMap; |
import java.util.HashSet; |
import java.util.LinkedList; |
import java.util.List; |
@@ -82,6 +86,13 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler { |
LinkedList<Integer> fontSizeStack = new LinkedList<Integer>(); |
/** |
+ * Caches the computed style of each element, keyed by the element itself, so that |
+ * DomUtil.getComputedStyle() is not invoked again for an element already in the cache. |
+ * Contains the computed style of each element. |
+ */ |
+ private final Map<Element, Style> computedStyleCache = new HashMap<Element, Style>(); |
+ |
+ // Maps a CSS display value (for example "block" or "inline") to the TagAction used for |
+ // elements whose computed display has that value. Filled by |
+ // setupDisplayToTagActionMapping(). |
+ private final Map<String, TagAction> displayStyleToTagAction = new HashMap<String, TagAction>(); |
+ |
+ /** |
* Recycles this instance. |
*/ |
public void recycle() { |
@@ -125,6 +136,32 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler { |
*/ |
public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { |
this.tagActions = tagActions; |
+ // Fill the display-value -> TagAction table that startElement/endElement consult; |
+ // an entry in tagActions for the element's tag name replaces the table's result. |
+ setupDisplayToTagActionMapping(); |
+ } |
+ |
+ // TODO(nyquist) Merge with FilteringDomVisitor for display: none when this class goes away. |
+ /** |
+ * Registers, for every CSS display value this handler recognises, the TagAction to apply |
+ * to elements whose computed display has that value. |
+ */ |
+ private void setupDisplayToTagActionMapping() { |
+ // Display values handled as block level. Sources: |
+ // http://www.w3.org/TR/CSS2/visuren.html#display-prop (block, list-item) |
+ // http://www.w3.org/TR/CSS2/tables.html#table-display (table, table-*) |
+ // http://www.w3.org/TR/css-flexbox-1/#flex-containers (flex) |
+ final String[] blockLevelDisplays = { |
+ "block", "list-item", |
+ "table", "table-row", "table-row-group", "table-header-group", "table-footer-group", |
+ "table-column", "table-column-group", "table-cell", "table-caption", |
+ "flex" |
+ }; |
+ for (String display : blockLevelDisplays) { |
+ displayStyleToTagAction.put(display, CommonTagActions.TA_BLOCK_LEVEL); |
+ } |
+ |
+ // Inline-level variants of the block, table and flex boxes. |
+ final String[] inlineBlockDisplays = { "inline-block", "inline-table", "inline-flex" }; |
+ for (String display : inlineBlockDisplays) { |
+ displayStyleToTagAction.put(display, CommonTagActions.TA_INLINE_BLOCK_LEVEL); |
+ } |
+ |
+ // Plain inline boxes. |
+ displayStyleToTagAction.put("inline", CommonTagActions.TA_INLINE_NO_WHITESPACE); |
} |
@Override |
@@ -146,29 +183,53 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler { |
} |
+ /** |
+ * Handles the start of an element: pushes a placeholder onto labelStacks, picks the |
+ * TagAction for the element, lets it update tagLevel and the flush flag, and records |
+ * the event and tag name. |
+ * |
+ * @param element the element being opened |
+ * @param atts the element's attributes, passed through to the TagAction |
+ */ |
@Override |
- public void startElement(String uri, String localName, String qName, Attributes atts) { |
+ public void startElement(Element element, Attributes atts) { |
labelStacks.add(null); |
- TagAction ta = tagActions.get(localName); |
+ // Start from the action implied by the element's computed display style; an action |
+ // registered in tagActions for the tag name replaces it. |
+ // NOTE(review): the computed style is resolved even when tagActions supplies the |
+ // action. Checking tagActions first would skip that work, but endElement performs |
+ // the same lookup and would need the same change for it to pay off. |
+ TagAction ta = getComputedTagAction(element); |
+ if (tagActions.containsKey(element.getTagName())) { |
+ ta = tagActions.get(element.getTagName()); |
+ } |
+ |
if (ta != null) { |
if(ta.changesTagLevel()) { |
tagLevel++; |
} |
+ // Once flush is set it stays set: the action's result is OR-ed into it. |
- flush = ta.start(this, localName, qName, atts) | flush; |
+ flush = ta.start(this, atts) | flush; |
} else { |
+ // No action from either source: count the tag as a new level and force a flush. |
tagLevel++; |
flush = true; |
} |
lastEvent = Event.START_TAG; |
- lastStartTag = localName; |
+ lastStartTag = element.getTagName(); |
+ } |
+ |
+ /** |
+ * Resolves the TagAction implied by the element's computed style, computing and |
+ * caching that style on the first request for the element. |
+ * |
+ * @param element the element whose computed style selects the action |
+ * @return the TagAction mapped to the style's display value, or null if there is none |
+ */ |
+ private TagAction getComputedTagAction(Element element) { |
+ // Populate the cache on a miss, then serve hits and misses through the same path. |
+ if (!computedStyleCache.containsKey(element)) { |
+ computedStyleCache.put(element, DomUtil.getComputedStyle(element)); |
+ } |
+ final Style cachedStyle = computedStyleCache.get(element); |
+ return getComputedTagAction(cachedStyle); |
+ } |
+ |
+ /** |
+ * Looks up the TagAction registered for the given style's display value. |
+ * |
+ * @param style the style whose display value selects the action |
+ * @return the mapped TagAction, or null when the display value has no mapping |
+ */ |
+ private TagAction getComputedTagAction(Style style) { |
+ // Map.get() returns null for an absent key, so no separate containsKey() check is |
+ // needed and getDisplay() is read only once. |
+ return displayStyleToTagAction.get(style.getDisplay()); |
} |
@Override |
- public void endElement(String uri, String localName, String qName) { |
- TagAction ta = tagActions.get(localName); |
+ public void endElement(Element element) { |
+ TagAction ta = getComputedTagAction(element); |
+ if (tagActions.containsKey(element.getTagName())) { |
+ ta = tagActions.get(element.getTagName()); |
+ } |
+ |
if (ta != null) { |
- flush = ta.end(this, localName, qName) | flush; |
+ flush = ta.end(this) | flush; |
} else { |
flush = true; |
} |
@@ -182,7 +243,7 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler { |
} |
lastEvent = Event.END_TAG; |
- lastEndTag = localName; |
+ lastEndTag = element.getTagName(); |
labelStacks.removeLast(); |
} |