Index: boilerpipe-core/src/main/de/l3s/boilerpipe/sax/MarkupTagAction.java |
diff --git a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/MarkupTagAction.java b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/MarkupTagAction.java |
deleted file mode 100644 |
index 594aa546b50002b665be68fbdc494eaad5ee75c4..0000000000000000000000000000000000000000 |
--- a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/MarkupTagAction.java |
+++ /dev/null |
@@ -1,104 +0,0 @@ |
-package de.l3s.boilerpipe.sax; |
- |
-import java.util.ArrayList; |
-import java.util.HashSet; |
-import java.util.LinkedList; |
-import java.util.List; |
-import java.util.Set; |
-import java.util.regex.Pattern; |
- |
-import com.dom_distiller.client.sax.Attributes; |
- |
-import de.l3s.boilerpipe.document.TextBlock; |
-import de.l3s.boilerpipe.labels.DefaultLabels; |
-import de.l3s.boilerpipe.labels.LabelAction; |
- |
-/** |
- * Assigns labels for element CSS classes and ids to the corresponding |
- * {@link TextBlock}. CSS classes are prefixed by |
- * <code>{@link DefaultLabels#MARKUP_PREFIX}.</code>, and IDs are prefixed by |
- * <code>{@link DefaultLabels#MARKUP_PREFIX}#</code> |
- * |
- * @author Christian Kohlschütter |
- */ |
-public final class MarkupTagAction implements TagAction { |
- |
- private final boolean isBlockLevel; |
- private LinkedList<List<String>> labelStack = new LinkedList<List<String>>(); |
- |
- public MarkupTagAction(final boolean isBlockLevel) { |
- this.isBlockLevel = isBlockLevel; |
- } |
- |
- private static final Pattern PAT_NUM = Pattern.compile("[0-9]+"); |
- |
- @Override |
- public boolean start(BoilerpipeHTMLContentHandler instance, |
- String localName, String qName, Attributes atts) { |
- List<String> labels = new ArrayList<String>(5); |
- labels.add(DefaultLabels.MARKUP_PREFIX + localName); |
- |
- String classVal = atts.getValue("class"); |
- |
- if (classVal != null && classVal.length() > 0) { |
- classVal = PAT_NUM.matcher(classVal).replaceAll("#"); |
- classVal = classVal.trim(); |
- String[] vals = classVal.split("[ ]+"); |
- labels.add(DefaultLabels.MARKUP_PREFIX + "." |
- + classVal.replace(' ', '.')); |
- if (vals.length > 1) { |
- for (String s : vals) { |
- labels.add(DefaultLabels.MARKUP_PREFIX + "." + s); |
- } |
- } |
- } |
- |
- String id = atts.getValue("id"); |
- if (id != null && id.length() > 0) { |
- id = PAT_NUM.matcher(id).replaceAll("#"); |
- labels.add(DefaultLabels.MARKUP_PREFIX + "#" + id); |
- } |
- |
- Set<String> ancestors = getAncestorLabels(); |
- List<String> labelsWithAncestors = new ArrayList<String>( |
- (ancestors.size() + 1) * labels.size()); |
- |
- for (String l : labels) { |
- for (String an : ancestors) { |
- labelsWithAncestors.add(an); |
- labelsWithAncestors.add(an + " " + l); |
- } |
- labelsWithAncestors.add(l); |
- } |
- |
- instance.addLabelAction(new LabelAction(labelsWithAncestors |
- .toArray(new String[labelsWithAncestors.size()]))); |
- |
- labelStack.add(labels); |
- |
- return isBlockLevel; |
- } |
- |
- @Override |
- public boolean end(BoilerpipeHTMLContentHandler instance, String localName, |
- String qName) { |
- |
- labelStack.removeLast(); |
- return isBlockLevel; |
- } |
- |
- public boolean changesTagLevel() { |
- return isBlockLevel; |
- } |
- |
- private Set<String> getAncestorLabels() { |
- Set<String> set = new HashSet<String>(); |
- for (List<String> labels : labelStack) { |
- if (labels == null) { |
- continue; |
- } |
- set.addAll(labels); |
- } |
- return set; |
- } |
-} |