Index: boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java |
diff --git a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java |
index 26735dc9f523e9359b96b090e7e13c259a8f9320..f885ff7dbab074c4908746470f7e274915155c93 100644 |
--- a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java |
+++ b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java |
@@ -21,7 +21,10 @@ |
*/ |
package de.l3s.boilerpipe.sax; |
+import com.dom_distiller.client.DomUtil; |
import com.dom_distiller.client.StringUtil; |
+import com.google.gwt.dom.client.Element; |
+import com.google.gwt.dom.client.Style; |
import de.l3s.boilerpipe.document.TextBlock; |
import de.l3s.boilerpipe.document.TextDocument; |
@@ -32,6 +35,7 @@ import com.dom_distiller.client.sax.Attributes; |
import com.dom_distiller.client.sax.ContentHandler; |
import java.util.ArrayList; |
+import java.util.HashMap; |
import java.util.HashSet; |
import java.util.LinkedList; |
import java.util.List; |
@@ -82,6 +86,14 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler { |
LinkedList<Integer> fontSizeStack = new LinkedList<Integer>(); |
/** |
+ * Caches the computed style of each element that has already been looked up. The map is a plain
Yaron
2014/05/29 01:09:10
Is this true? I don't see that
nyquist
2014/05/29 23:42:25
I'm a liar who does not update comments after real
|
+ * HashMap, so element keys are strongly (not weakly) referenced and are not dropped automatically. |
+ */ |
+ private final Map<Element, Style> computedStyleCache = new HashMap<Element, Style>(); |
+ |
+ private final Map<String, TagAction> displayStyleToTagAction = new HashMap<String, TagAction>(); |
+ |
+ /** |
* Recycles this instance. |
*/ |
public void recycle() { |
@@ -125,6 +137,27 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler { |
*/ |
public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { |
this.tagActions = tagActions; |
+ // Register the display-value to TagAction entries that getComputedTagAction consults. |
+ setupDisplayToTagActionMapping(); |
+ } |
+ |
+ /** |
+ * Fills displayStyleToTagAction with one entry per supported display value, so that |
+ * getComputedTagAction can turn the display value of an element's computed style into a |
+ * TagAction. Display values without an entry here produce no computed TagAction. |
+ */ |
+ private void setupDisplayToTagActionMapping() { |
+ displayStyleToTagAction.put("inline", CommonTagActions.TA_INLINE_NO_WHITESPACE);
Yaron
2014/05/29 01:09:10
Can you reference a source for these?
nyquist
2014/05/29 23:42:25
Done. Also reordered and clarified with multiple s
|
+ displayStyleToTagAction.put("block", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("inline-block", CommonTagActions.TA_INLINE_BLOCK_LEVEL); |
+ // NOTE(review): repeats the "block" entry registered above with the same value; redundant. |
+ displayStyleToTagAction.put("block", CommonTagActions.TA_BLOCK_LEVEL);
Yaron
2014/05/29 01:09:10
nit: dup
nyquist
2014/05/29 23:42:25
Done.
|
+ displayStyleToTagAction.put("table", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("table-caption", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("table-column-group", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("table-header-group", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("table-footer-group", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("table-row-group", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("table-cell", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("table-column", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("table-row", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("flex", CommonTagActions.TA_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("inline-flex", CommonTagActions.TA_INLINE_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("inline-table", CommonTagActions.TA_INLINE_BLOCK_LEVEL); |
+ displayStyleToTagAction.put("list-item", CommonTagActions.TA_BLOCK_LEVEL);
Yaron
2014/05/29 01:09:10
So display:none is handled elsewhere... Logically
nyquist
2014/05/29 23:42:25
Added TODO.
|
}
@Override |
@@ -146,10 +179,15 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler { |
} |
@Override |
- public void startElement(String uri, String localName, String qName, Attributes atts) { |
+ public void startElement(String uri, String localName, String qName, |
+ Element element, Attributes atts) { |
labelStacks.add(null); |
- TagAction ta = tagActions.get(localName); |
+ TagAction ta = getComputedTagAction(element); |
+ if (tagActions.containsKey(localName)) { |
+ ta = tagActions.get(localName); |
+ } |
+ |
if (ta != null) { |
if(ta.changesTagLevel()) { |
tagLevel++; |
@@ -164,9 +202,29 @@ public class BoilerpipeHTMLContentHandler implements ContentHandler { |
lastStartTag = localName; |
} |
+ /** |
+  * Returns the TagAction implied by the computed display style of {@code element}, or |
+  * {@code null} when that display value has no registered mapping. The computed style is |
+  * fetched through DomUtil on the first request for an element and served from |
+  * computedStyleCache afterwards. |
+  */ |
+ private TagAction getComputedTagAction(Element element) { |
+     // A single get() replaces the containsKey() + get() pair of lookups. |
+     Style computedStyle = computedStyleCache.get(element); |
+     if (computedStyle == null) { |
+         // Cache miss: ask the DOM for the computed style and remember it. |
+         computedStyle = DomUtil.getComputedStyle(element); |
+         computedStyleCache.put(element, computedStyle); |
+     } |
+     return getComputedTagAction(computedStyle); |
+ } |
+ |
+ /** |
+  * Maps a computed style to the TagAction registered for its display value. |
+  * |
+  * @param style the computed style of an element |
+  * @return the TagAction registered for {@code style.getDisplay()}, or {@code null} if none is |
+  */ |
+ private TagAction getComputedTagAction(Style style) { |
+     // Map.get already returns null for an absent key, so one lookup (and one getDisplay() |
+     // call) replaces the containsKey() + get() pair. |
+     return displayStyleToTagAction.get(style.getDisplay()); |
+ } |
+ |
@Override |
- public void endElement(String uri, String localName, String qName) { |
- TagAction ta = tagActions.get(localName); |
+ public void endElement(String uri, String localName, String qName, Element element) { |
+ TagAction ta = getComputedTagAction(element); |
+ if (tagActions.containsKey(localName)) { |
+ ta = tagActions.get(localName); |
+ } |
+ |
if (ta != null) { |
flush = ta.end(this, localName, qName) | flush; |
} else { |