Index: src/de/l3s/boilerpipe/document/TextDocument.java |
diff --git a/src/de/l3s/boilerpipe/document/TextDocument.java b/src/de/l3s/boilerpipe/document/TextDocument.java |
index 4c31308c8b463e053683f8e6eeced82faeaf7ac3..ab74aaf4563727dae369b0655d2b70109c02e902 100644 |
--- a/src/de/l3s/boilerpipe/document/TextDocument.java |
+++ b/src/de/l3s/boilerpipe/document/TextDocument.java |
@@ -20,17 +20,23 @@ |
*/ |
package de.l3s.boilerpipe.document; |
+import com.google.gwt.dom.client.Node; |
+ |
import java.util.LinkedList; |
import java.util.List; |
+import java.util.Set; |
/** |
- * A text document, consisting of one or more {@link TextBlock}s. |
+ * A text document, consisting of one or more {@link TextBlock}s, and features |
+ * of the original page (e.g. candidate titles, hidden elements, etc). |
* |
* @author Christian Kohlschütter |
*/ |
public class TextDocument implements Cloneable { |
final List<TextBlock> textBlocks; |
List<String> candidateTitles; |
+ private Set<Node> dataTables; |
+ private Set<Node> hiddenElements; |
/** |
* Creates a new {@link TextDocument} with given {@link TextBlock}s, and no |
@@ -81,9 +87,10 @@ public class TextDocument implements Cloneable { |
* Sets the list of candidate titles. |
* @param candidateTitles |
*/ |
- public void setCanddiateTitles(List<String> candidateTitles) { |
+ public void setCandidateTitles(List<String> candidateTitles) { |
this.candidateTitles = new LinkedList<String>(candidateTitles); |
} |
+ |
/** |
* Returns the {@link TextDocument}'s content. |
* |
@@ -131,4 +138,39 @@ public class TextDocument implements Cloneable { |
} |
return sb.toString(); |
} |
+ |
+ /** |
+ * Returns the set of nodes from the original Document which were classified as data tables |
+ * (i.e. are treated as an atomic block of text). |
+ * @return the set of data tables (the stored instance, not a copy) |
+ */ |
+ public Set<Node> getDataTables() { |
+ return dataTables; |
+ } |
+ |
+ /** |
+ * Sets the data tables identified while processing the document. |
+ * @param dataTables the set of data tables; kept by reference, not copied |
+ */ |
+ public void setDataTables(Set<Node> dataTables) { |
+ this.dataTables = dataTables; |
+ } |
+ |
+ /** |
+ * Returns the set of nodes from the original Document which weren't actually visible. These |
+ * are typically omitted from boilerpipe text processing but are tracked for post-processing. |
+ * @return the set of hidden elements (the stored instance, not a copy) |
+ */ |
+ public Set<Node> getHiddenElements() { |
+ return hiddenElements; |
+ } |
+ |
+ /** |
+ * Sets the hidden elements identified while processing the document. |
+ * @param hiddenElements the set of hidden elements; kept by reference, not copied |
+ */ |
+ public void setHiddenElements(Set<Node> hiddenElements) { |
+ this.hiddenElements = hiddenElements; |
+ } |
+ |
} |