| Index: boilerpipe-core/src/main/de/l3s/boilerpipe/sax/CommonTagActions.java
|
| diff --git a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/CommonTagActions.java b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/CommonTagActions.java
|
| index 123031e6293ccf61bd31e9cc56d9b6c11c04fd11..6a89637148c86a5657617f0ff40c1a99a7b7f8fc 100644
|
| --- a/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/CommonTagActions.java
|
| +++ b/boilerpipe-core/src/main/de/l3s/boilerpipe/sax/CommonTagActions.java
|
| @@ -32,8 +32,8 @@ import de.l3s.boilerpipe.labels.LabelAction;
|
| */
|
| public abstract class CommonTagActions {
|
|
|
| - private CommonTagActions() {
|
| - }
|
| + private CommonTagActions() {
|
| + }
|
|
|
| public static final class Chained implements TagAction {
|
|
|
| @@ -46,19 +46,16 @@ public abstract class CommonTagActions {
|
| }
|
|
|
| public boolean start(BoilerpipeHTMLContentHandler instance,
|
| - String localName, String qName, Attributes atts) {
|
| - return t1.start(instance, localName, qName, atts)
|
| - | t2.start(instance, localName, qName, atts);
|
| + Attributes atts) {
|
| + return t1.start(instance, atts) | t2.start(instance, atts);
|
| }
|
|
|
| - public boolean end(BoilerpipeHTMLContentHandler instance,
|
| - String localName, String qName) {
|
| - return t1.end(instance, localName, qName)
|
| - | t2.end(instance, localName, qName);
|
| + public boolean end(BoilerpipeHTMLContentHandler instance) {
|
| + return t1.end(instance) | t2.end(instance);
|
| }
|
|
|
| public boolean changesTagLevel() {
|
| - return t1.changesTagLevel() || t2.changesTagLevel();
|
| + return t1.changesTagLevel() || t2.changesTagLevel();
|
| }
|
| }
|
|
|
| @@ -68,20 +65,18 @@ public abstract class CommonTagActions {
|
| public static final TagAction TA_IGNORABLE_ELEMENT = new TagAction() {
|
|
|
| public boolean start(final BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| + final Attributes atts) {
|
| instance.inIgnorableElement++;
|
| return true;
|
| }
|
|
|
| - public boolean end(final BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| + public boolean end(final BoilerpipeHTMLContentHandler instance) {
|
| instance.inIgnorableElement--;
|
| return true;
|
| }
|
|
|
| public boolean changesTagLevel() {
|
| - return true;
|
| + return true;
|
| }
|
| };
|
|
|
| @@ -95,15 +90,14 @@ public abstract class CommonTagActions {
|
| public static final TagAction TA_ANCHOR_TEXT = new TagAction() {
|
|
|
| public boolean start(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| + final Attributes atts) {
|
| if (instance.inAnchor++ > 0) {
|
| // as nested A elements are not allowed per specification, we
|
| // are probably reaching this branch due to a bug in the XML
|
| // parser
|
| - System.err.println("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...");
|
| -
|
| - end(instance, localName, qName);
|
| + System.err.println("Warning: SAX input contains nested A elements -- You have probably hit a bug in your HTML parser (e.g., NekoHTML bug #2909310). Please clean the HTML externally and feed it to boilerpipe again. Trying to recover somehow...");
|
| +
|
| + end(instance);
|
| }
|
| if (instance.inIgnorableElement == 0) {
|
| instance.addWhitespaceIfNecessary();
|
| @@ -115,8 +109,7 @@ public abstract class CommonTagActions {
|
| return false;
|
| }
|
|
|
| - public boolean end(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| + public boolean end(BoilerpipeHTMLContentHandler instance) {
|
| if (--instance.inAnchor == 0) {
|
| if (instance.inIgnorableElement == 0) {
|
| instance.addWhitespaceIfNecessary();
|
| @@ -130,7 +123,7 @@ public abstract class CommonTagActions {
|
| }
|
|
|
| public boolean changesTagLevel() {
|
| - return true;
|
| + return true;
|
| }
|
| };
|
|
|
| @@ -139,98 +132,82 @@ public abstract class CommonTagActions {
|
| */
|
| public static final TagAction TA_BODY = new TagAction() {
|
| public boolean start(final BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| + final Attributes atts) {
|
| instance.flushBlock();
|
| instance.inBody++;
|
| return false;
|
| }
|
|
|
| - public boolean end(final BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| + public boolean end(final BoilerpipeHTMLContentHandler instance) {
|
| instance.flushBlock();
|
| instance.inBody--;
|
| return false;
|
| }
|
|
|
| public boolean changesTagLevel() {
|
| - return true;
|
| + return true;
|
| }
|
| };
|
|
|
| /**
|
| - * Marks this tag a simple "inline" element, which generates whitespace, but no new block.
|
| + * Marks this tag as a simple "inline" element, which generates neither whitespace nor a new block.
|
| */
|
| - public static final TagAction TA_INLINE_WHITESPACE = new TagAction() {
|
| + public static final TagAction TA_INLINE_NO_WHITESPACE = new TagAction() {
|
|
|
| public boolean start(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| - instance.addWhitespaceIfNecessary();
|
| + final Attributes atts) {
|
| return false;
|
| }
|
|
|
| - public boolean end(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| - instance.addWhitespaceIfNecessary();
|
| + public boolean end(BoilerpipeHTMLContentHandler instance) {
|
| return false;
|
| }
|
| -
|
| +
|
| public boolean changesTagLevel() {
|
| - return false;
|
| + return false;
|
| }
|
| };
|
| -
|
| - /**
|
| - * @deprecated Use {@link #TA_INLINE_WHITESPACE} instead
|
| - */
|
| - @Deprecated
|
| - public static final TagAction TA_INLINE = TA_INLINE_WHITESPACE;
|
| -
|
| + private static final Pattern PAT_FONT_SIZE = Pattern
|
| + .compile("([\\+\\-]?)([0-9])");
|
| +
|
| /**
|
| - * Marks this tag a simple "inline" element, which neither generates whitespace, nor a new block.
|
| + * Explicitly marks this tag as a simple "block-level" element, which always generates whitespace.
|
| */
|
| - public static final TagAction TA_INLINE_NO_WHITESPACE = new TagAction() {
|
| + public static final TagAction TA_BLOCK_LEVEL = new TagAction() {
|
|
|
| public boolean start(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| - return false;
|
| + final Attributes atts) {
|
| + return true;
|
| }
|
|
|
| - public boolean end(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| - return false;
|
| + public boolean end(BoilerpipeHTMLContentHandler instance) {
|
| + return true;
|
| }
|
|
|
| public boolean changesTagLevel() {
|
| - return false;
|
| + return true;
|
| }
|
| };
|
| - private static final Pattern PAT_FONT_SIZE = Pattern
|
| - .compile("([\\+\\-]?)([0-9])");
|
| -
|
| +
|
| /**
|
| - * Explicitly marks this tag a simple "block-level" element, which always generates whitespace
|
| + * Explicitly marks this tag as an inline-block element, which does not generate whitespace.
|
| */
|
| - public static final TagAction TA_BLOCK_LEVEL = new TagAction() {
|
| + public static final TagAction TA_INLINE_BLOCK_LEVEL = new TagAction() {
|
|
|
| public boolean start(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| - return true;
|
| + final Attributes atts) {
|
| + return false;
|
| }
|
|
|
| - public boolean end(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| - return true;
|
| + public boolean end(BoilerpipeHTMLContentHandler instance) {
|
| + return false;
|
| }
|
| -
|
| +
|
| public boolean changesTagLevel() {
|
| - return true;
|
| + return true;
|
| }
|
| - };
|
| -
|
| + };
|
| +
|
| /**
|
| * Special TagAction for the <code><FONT></code> tag, which keeps track of the
|
| * absolute and relative font size.
|
| @@ -238,8 +215,7 @@ public abstract class CommonTagActions {
|
| public static final TagAction TA_FONT = new TagAction() {
|
|
|
| public boolean start(final BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| + final Attributes atts) {
|
|
|
| String sizeAttr = atts.getValue("size");
|
| if (sizeAttr != null) {
|
| @@ -282,14 +258,13 @@ public abstract class CommonTagActions {
|
| return false;
|
| }
|
|
|
| - public boolean end(final BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| + public boolean end(final BoilerpipeHTMLContentHandler instance) {
|
| instance.fontSizeStack.removeFirst();
|
| return false;
|
| }
|
|
|
| public boolean changesTagLevel() {
|
| - return false;
|
| + return false;
|
| }
|
| };
|
|
|
| @@ -306,21 +281,19 @@ public abstract class CommonTagActions {
|
| }
|
|
|
| public boolean start(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| + final Attributes atts) {
|
| instance.addWhitespaceIfNecessary();
|
| instance.addLabelAction(action);
|
| return false;
|
| }
|
|
|
| - public boolean end(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| + public boolean end(BoilerpipeHTMLContentHandler instance) {
|
| instance.addWhitespaceIfNecessary();
|
| return false;
|
| }
|
|
|
| public boolean changesTagLevel() {
|
| - return false;
|
| + return false;
|
| }
|
| }
|
|
|
| @@ -337,19 +310,17 @@ public abstract class CommonTagActions {
|
| }
|
|
|
| public boolean start(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName,
|
| - final Attributes atts) {
|
| + final Attributes atts) {
|
| instance.addLabelAction(action);
|
| return true;
|
| }
|
|
|
| - public boolean end(BoilerpipeHTMLContentHandler instance,
|
| - final String localName, final String qName) {
|
| + public boolean end(BoilerpipeHTMLContentHandler instance) {
|
| return true;
|
| }
|
|
|
| public boolean changesTagLevel() {
|
| - return true;
|
| + return true;
|
| }
|
| }
|
| }
|
|
|