| OLD | NEW |
| 1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 /** | 5 /** |
| 6 * boilerpipe | 6 * boilerpipe |
| 7 * | 7 * |
| 8 * Copyright (c) 2009 Christian Kohlschütter | 8 * Copyright (c) 2009 Christian Kohlschütter |
| 9 * | 9 * |
| 10 * The author licenses this file to You under the Apache License, Version 2.0 | 10 * The author licenses this file to You under the Apache License, Version 2.0 |
| 11 * (the "License"); you may not use this file except in compliance with | 11 * (the "License"); you may not use this file except in compliance with |
| 12 * the License. You may obtain a copy of the License at | 12 * the License. You may obtain a copy of the License at |
| 13 * | 13 * |
| 14 * http://www.apache.org/licenses/LICENSE-2.0 | 14 * http://www.apache.org/licenses/LICENSE-2.0 |
| 15 * | 15 * |
| 16 * Unless required by applicable law or agreed to in writing, software | 16 * Unless required by applicable law or agreed to in writing, software |
| 17 * distributed under the License is distributed on an "AS IS" BASIS, | 17 * distributed under the License is distributed on an "AS IS" BASIS, |
| 18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 19 * See the License for the specific language governing permissions and | 19 * See the License for the specific language governing permissions and |
| 20 * limitations under the License. | 20 * limitations under the License. |
| 21 */ | 21 */ |
| 22 package de.l3s.boilerpipe.sax; | 22 package de.l3s.boilerpipe.sax; |
| 23 | 23 |
| 24 import com.dom_distiller.client.StringUtil; |
| 25 |
| 26 import de.l3s.boilerpipe.document.TextBlock; |
| 27 import de.l3s.boilerpipe.document.TextDocument; |
| 28 import de.l3s.boilerpipe.labels.LabelAction; |
| 29 import de.l3s.boilerpipe.util.UnicodeTokenizer; |
| 30 |
| 31 import org.xml.sax.Attributes; |
| 32 import org.xml.sax.ContentHandler; |
| 33 import org.xml.sax.Locator; |
| 34 import org.xml.sax.SAXException; |
| 35 |
| 24 import java.util.ArrayList; | 36 import java.util.ArrayList; |
| 25 import java.util.HashSet; | 37 import java.util.HashSet; |
| 26 import java.util.LinkedList; | 38 import java.util.LinkedList; |
| 27 import java.util.List; | 39 import java.util.List; |
| 28 import java.util.Map; | 40 import java.util.Map; |
| 29 import java.util.regex.Pattern; | 41 import java.util.regex.Pattern; |
| 30 | 42 |
| 31 import org.xml.sax.Attributes; | |
| 32 import org.xml.sax.ContentHandler; | |
| 33 import org.xml.sax.Locator; | |
| 34 import org.xml.sax.SAXException; | |
| 35 | |
| 36 import de.l3s.boilerpipe.document.TextBlock; | |
| 37 import de.l3s.boilerpipe.document.TextDocument; | |
| 38 import de.l3s.boilerpipe.labels.LabelAction; | |
| 39 import de.l3s.boilerpipe.util.UnicodeTokenizer; | |
| 40 | |
| 41 import com.dom_distiller.client.StringUtil; | |
| 42 | |
| 43 /** | 43 /** |
| 44 * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can | 44 * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can |
| 45 * be used by different parser implementations, e.g. NekoHTML and TagSoup. | 45 * be used by different parser implementations, e.g. NekoHTML and TagSoup. |
| 46 * | 46 * |
| 47 * @author Christian Kohlschütter | 47 * @author Christian Kohlschütter |
| 48 */ | 48 */ |
| 49 public class BoilerpipeHTMLContentHandler implements ContentHandler { | 49 public class BoilerpipeHTMLContentHandler implements ContentHandler { |
| 50 | 50 |
| 51 private final Map<String, TagAction> tagActions; | 51 private final Map<String, TagAction> tagActions; |
| 52 private String title = null; | 52 |
| 53 | 53 static final String ANCHOR_TEXT_START = "$\ue00a<"; |
| 54 static final String ANCHOR_TEXT_START = "$\ue00a<"; | 54 static final String ANCHOR_TEXT_END = ">\ue00a$"; |
| 55 static final String ANCHOR_TEXT_END = ">\ue00a$"; | 55 |
| 56 | 56 StringBuilder tokenBuffer = new StringBuilder(); |
| 57 StringBuilder tokenBuffer = new StringBuilder(); | 57 StringBuilder textBuffer = new StringBuilder(); |
| 58 StringBuilder textBuffer = new StringBuilder(); | 58 |
| 59 | 59 int inBody = 0; |
| 60 int inBody = 0; | 60 int inAnchor = 0; |
| 61 int inAnchor = 0; | 61 int inIgnorableElement = 0; |
| 62 int inIgnorableElement = 0; | 62 |
| 63 | 63 int tagLevel = 0; |
| 64 int tagLevel = 0; | 64 int blockTagLevel = -1; |
| 65 int blockTagLevel = -1; | 65 |
| 66 | 66 boolean sbLastWasWhitespace = false; |
| 67 boolean sbLastWasWhitespace = false; | 67 private int textElementIdx = 0; |
| 68 private int textElementIdx = 0; | 68 |
| 69 | 69 private final List<TextBlock> textBlocks = new ArrayList<TextBlock>(); |
| 70 private final List<TextBlock> textBlocks = new ArrayList<TextBlock>(); | 70 |
| 71 | 71 private String lastStartTag = null; |
| 72 private String lastStartTag = null; | 72 @SuppressWarnings("unused") |
| 73 @SuppressWarnings("unused") | 73 private String lastEndTag = null; |
| 74 private String lastEndTag = null; | 74 @SuppressWarnings("unused") |
| 75 @SuppressWarnings("unused") | 75 private Event lastEvent = null; |
| 76 private Event lastEvent = null; | 76 |
| 77 | 77 private int offsetBlocks = 0; |
| 78 private int offsetBlocks = 0; | 78 private HashSet<Integer> currentContainedTextElements = new HashSet<Integer>
(); |
| 79 private HashSet<Integer> currentContainedTextElements = new HashSet<Inte
ger>(); | 79 |
| 80 | 80 private boolean flush = false; |
| 81 private boolean flush = false; | 81 boolean inAnchorText = false; |
| 82 boolean inAnchorText = false; | 82 |
| 83 | 83 LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedList<
LabelAction>>(); |
| 84 LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedL
ist<LabelAction>>(); | 84 LinkedList<Integer> fontSizeStack = new LinkedList<Integer>(); |
| 85 LinkedList<Integer> fontSizeStack = new LinkedList<Integer>(); | 85 |
| 86 | 86 /** |
| 87 /** | 87 * Recycles this instance. |
| 88 * Recycles this instance. | 88 */ |
| 89 */ | 89 public void recycle() { |
| 90 public void recycle() { | 90 tokenBuffer.setLength(0); |
| 91 tokenBuffer.setLength(0); | 91 textBuffer.setLength(0); |
| 92 textBuffer.setLength(0); | 92 |
| 93 | 93 inBody = 0; |
| 94 inBody = 0; | 94 inAnchor = 0; |
| 95 inAnchor = 0; | 95 inIgnorableElement = 0; |
| 96 inIgnorableElement = 0; | 96 sbLastWasWhitespace = false; |
| 97 sbLastWasWhitespace = false; | 97 textElementIdx = 0; |
| 98 textElementIdx = 0; | 98 |
| 99 | 99 textBlocks.clear(); |
| 100 textBlocks.clear(); | 100 |
| 101 | 101 lastStartTag = null; |
| 102 lastStartTag = null; | 102 lastEndTag = null; |
| 103 lastEndTag = null; | 103 lastEvent = null; |
| 104 lastEvent = null; | 104 |
| 105 | 105 offsetBlocks = 0; |
| 106 offsetBlocks = 0; | 106 currentContainedTextElements.clear(); |
| 107 currentContainedTextElements.clear(); | 107 |
| 108 | 108 flush = false; |
| 109 flush = false; | 109 inAnchorText = false; |
| 110 inAnchorText = false; | 110 } |
| 111 } | 111 |
| 112 | 112 /** |
| 113 /** | 113 * Constructs a {@link BoilerpipeHTMLContentHandler} using the |
| 114 * Constructs a {@link BoilerpipeHTMLContentHandler} using the | 114 * {@link DefaultTagActionMap}. |
| 115 * {@link DefaultTagActionMap}. | 115 */ |
| 116 */ | 116 public BoilerpipeHTMLContentHandler() { |
| 117 public BoilerpipeHTMLContentHandler() { | 117 this(DefaultTagActionMap.INSTANCE); |
| 118 this(DefaultTagActionMap.INSTANCE); | 118 } |
| 119 } | 119 |
| 120 | 120 /** |
| 121 /** | 121 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given |
| 122 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given | 122 * {@link TagActionMap}. |
| 123 * {@link TagActionMap}. | 123 * |
| 124 * | 124 * @param tagActions |
| 125 * @param tagActions | 125 * The {@link TagActionMap} to use, e.g. |
| 126 * The {@link TagActionMap} to use, e.g. | 126 * {@link DefaultTagActionMap}. |
| 127 * {@link DefaultTagActionMap}. | 127 */ |
| 128 */ | 128 public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { |
| 129 public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { | 129 this.tagActions = tagActions; |
| 130 this.tagActions = tagActions; | 130 } |
| 131 } | 131 |
| 132 | 132 @Override |
| 133 // @Override | 133 public void endDocument() throws SAXException { |
| 134 public void endDocument() throws SAXException { | 134 flushBlock(); |
| 135 flushBlock(); | 135 } |
| 136 } | 136 |
| 137 | 137 @Override |
| 138 // @Override | 138 public void endPrefixMapping(String prefix) throws SAXException { |
| 139 public void endPrefixMapping(String prefix) throws SAXException { | 139 } |
| 140 } | 140 |
| 141 | 141 @Override |
| 142 // @Override | 142 public void ignorableWhitespace(char[] ch, int start, int length) |
| 143 public void ignorableWhitespace(char[] ch, int start, int length) | 143 throws SAXException { |
| 144 throws SAXException { | 144 if (!sbLastWasWhitespace) { |
| 145 if (!sbLastWasWhitespace) { | 145 textBuffer.append(' '); |
| 146 textBuffer.append(' '); | 146 tokenBuffer.append(' '); |
| 147 tokenBuffer.append(' '); | 147 } |
| 148 } | 148 sbLastWasWhitespace = true; |
| 149 sbLastWasWhitespace = true; | 149 } |
| 150 } | 150 |
| 151 | 151 @Override |
| 152 // @Override | 152 public void processingInstruction(String target, String data) |
| 153 public void processingInstruction(String target, String data) | 153 throws SAXException { |
| 154 throws SAXException { | 154 } |
| 155 } | 155 |
| 156 | 156 @Override |
| 157 // @Override | 157 public void setDocumentLocator(Locator locator) { |
| 158 public void setDocumentLocator(Locator locator) { | 158 } |
| 159 } | 159 |
| 160 | 160 @Override |
| 161 // @Override | 161 public void skippedEntity(String name) throws SAXException { |
| 162 public void skippedEntity(String name) throws SAXException { | 162 } |
| 163 } | 163 |
| 164 | 164 @Override |
| 165 // @Override | 165 public void startDocument() throws SAXException { |
| 166 public void startDocument() throws SAXException { | 166 } |
| 167 } | 167 |
| 168 | 168 @Override |
| 169 // @Override | 169 public void startPrefixMapping(String prefix, String uri) |
| 170 public void startPrefixMapping(String prefix, String uri) | 170 throws SAXException { |
| 171 throws SAXException { | 171 } |
| 172 } | 172 |
| 173 | 173 @Override |
| 174 // @Override | 174 public void startElement(String uri, String localName, String qName, |
| 175 public void startElement(String uri, String localName, String qName, | 175 Attributes atts) throws SAXException { |
| 176 Attributes atts) throws SAXException { | 176 labelStacks.add(null); |
| 177 labelStacks.add(null); | 177 |
| 178 | 178 TagAction ta = tagActions.get(localName); |
| 179 TagAction ta = tagActions.get(localName); | 179 if (ta != null) { |
| 180 if (ta != null) { | 180 if(ta.changesTagLevel()) { |
| 181 if(ta.changesTagLevel()) { | 181 tagLevel++; |
| 182 tagLevel++; | 182 } |
| 183 } | 183 flush = ta.start(this, localName, qName, atts) | flush; |
| 184 flush = ta.start(this, localName, qName, atts) | flush; | 184 } else { |
| 185 } else { | 185 tagLevel++; |
| 186 tagLevel++; | 186 flush = true; |
| 187 flush = true; | 187 } |
| 188 } | 188 |
| 189 | 189 lastEvent = Event.START_TAG; |
| 190 lastEvent = Event.START_TAG; | 190 lastStartTag = localName; |
| 191 lastStartTag = localName; | 191 } |
| 192 } | 192 |
| 193 | 193 @Override |
| 194 // @Override | 194 public void endElement(String uri, String localName, String qName) |
| 195 public void endElement(String uri, String localName, String qName) | 195 throws SAXException { |
| 196 throws SAXException { | 196 TagAction ta = tagActions.get(localName); |
| 197 TagAction ta = tagActions.get(localName); | 197 if (ta != null) { |
| 198 if (ta != null) { | 198 flush = ta.end(this, localName, qName) | flush; |
| 199 flush = ta.end(this, localName, qName) | flush; | 199 } else { |
| 200 } else { | 200 flush = true; |
| 201 flush = true; | 201 } |
| 202 } | 202 |
| 203 | 203 if(ta == null || ta.changesTagLevel()) { |
| 204 if(ta == null || ta.changesTagLevel()) { | 204 tagLevel--; |
| 205 tagLevel--; | 205 } |
| 206 } | 206 |
| 207 | 207 if (flush) { |
| 208 if (flush) { | 208 flushBlock(); |
| 209 flushBlock(); | 209 } |
| 210 } | 210 |
| 211 | 211 lastEvent = Event.END_TAG; |
| 212 lastEvent = Event.END_TAG; | 212 lastEndTag = localName; |
| 213 lastEndTag = localName; | 213 |
| 214 | 214 labelStacks.removeLast(); |
| 215 labelStacks.removeLast(); | 215 } |
| 216 } | 216 |
| 217 | 217 @Override |
| 218 // @Override | 218 public void characters(char[] ch, int start, int length) |
| 219 public void characters(char[] ch, int start, int length) | 219 throws SAXException { |
| 220 throws SAXException { | 220 textElementIdx++; |
| 221 textElementIdx++; | 221 |
| 222 | 222 |
| 223 | 223 if (flush) { |
| 224 if (flush) { | 224 flushBlock(); |
| 225 flushBlock(); | 225 flush = false; |
| 226 flush = false; | 226 } |
| 227 } | 227 |
| 228 | 228 if (inIgnorableElement != 0) { |
| 229 if (inIgnorableElement != 0) { | 229 return; |
| 230 return; | 230 } |
| 231 } | 231 |
| 232 | 232 char c; |
| 233 char c; | 233 boolean startWhitespace = false; |
| 234 boolean startWhitespace = false; | 234 boolean endWhitespace = false; |
| 235 boolean endWhitespace = false; | 235 if (length == 0) { |
| 236 if (length == 0) { | 236 return; |
| 237 return; | 237 } |
| 238 } | 238 |
| 239 | 239 final int end = start + length; |
| 240 final int end = start + length; | 240 for (int i = start; i < end; i++) { |
| 241 for (int i = start; i < end; i++) { | 241 if (StringUtil.isWhitespace(ch[i])) { |
| 242 if (StringUtil.isWhitespace(ch[i])) { | 242 ch[i] = ' '; |
| 243 ch[i] = ' '; | 243 } |
| 244 } | 244 } |
| 245 } | 245 while (start < end) { |
| 246 while (start < end) { | 246 c = ch[start]; |
| 247 c = ch[start]; | 247 if (c == ' ') { |
| 248 if (c == ' ') { | 248 startWhitespace = true; |
| 249 startWhitespace = true; | 249 start++; |
| 250 start++; | 250 length--; |
| 251 length--; | 251 } else { |
| 252 } else { | 252 break; |
| 253 break; | 253 } |
| 254 } | 254 } |
| 255 } | 255 while (length > 0) { |
| 256 while (length > 0) { | 256 c = ch[start + length - 1]; |
| 257 c = ch[start + length - 1]; | 257 if (c == ' ') { |
| 258 if (c == ' ') { | 258 endWhitespace = true; |
| 259 endWhitespace = true; | 259 length--; |
| 260 length--; | 260 } else { |
| 261 } else { | 261 break; |
| 262 break; | 262 } |
| 263 } | 263 } |
| 264 } | 264 if (length == 0) { |
| 265 if (length == 0) { | 265 if (startWhitespace || endWhitespace) { |
| 266 if (startWhitespace || endWhitespace) { | 266 if (!sbLastWasWhitespace) { |
| 267 if (!sbLastWasWhitespace) { | 267 textBuffer.append(' '); |
| 268 textBuffer.append(' '); | 268 tokenBuffer.append(' '); |
| 269 tokenBuffer.append(' '); | 269 } |
| 270 } | 270 sbLastWasWhitespace = true; |
| 271 sbLastWasWhitespace = true; | 271 } else { |
| 272 } else { | 272 sbLastWasWhitespace = false; |
| 273 sbLastWasWhitespace = false; | 273 } |
| 274 } | 274 lastEvent = Event.WHITESPACE; |
| 275 lastEvent = Event.WHITESPACE; | 275 return; |
| 276 return; | 276 } |
| 277 } | 277 if (startWhitespace) { |
| 278 if (startWhitespace) { | 278 if (!sbLastWasWhitespace) { |
| 279 if (!sbLastWasWhitespace) { | 279 textBuffer.append(' '); |
| 280 textBuffer.append(' '); | 280 tokenBuffer.append(' '); |
| 281 tokenBuffer.append(' '); | 281 } |
| 282 } | 282 } |
| 283 } | 283 |
| 284 | 284 if (blockTagLevel == -1) { |
| 285 if (blockTagLevel == -1) { | 285 blockTagLevel = tagLevel; |
| 286 blockTagLevel = tagLevel; | 286 } |
| 287 } | 287 |
| 288 | 288 textBuffer.append(ch, start, length); |
| 289 textBuffer.append(ch, start, length); | 289 tokenBuffer.append(ch, start, length); |
| 290 tokenBuffer.append(ch, start, length); | 290 if (endWhitespace) { |
| 291 if (endWhitespace) { | 291 textBuffer.append(' '); |
| 292 textBuffer.append(' '); | 292 tokenBuffer.append(' '); |
| 293 tokenBuffer.append(' '); | 293 } |
| 294 } | 294 |
| 295 | 295 sbLastWasWhitespace = endWhitespace; |
| 296 sbLastWasWhitespace = endWhitespace; | 296 lastEvent = Event.CHARACTERS; |
| 297 lastEvent = Event.CHARACTERS; | 297 |
| 298 | 298 currentContainedTextElements.add(textElementIdx); |
| 299 currentContainedTextElements.add(textElementIdx); | 299 } |
| 300 } | 300 |
| 301 | 301 List<TextBlock> getTextBlocks() { |
| 302 List<TextBlock> getTextBlocks() { | 302 return textBlocks; |
| 303 return textBlocks; | 303 } |
| 304 } | 304 |
| 305 | 305 public void flushBlock() { |
| 306 public void flushBlock() { | 306 if (inBody == 0) { |
| 307 if (inBody == 0) { | 307 textBuffer.setLength(0); |
| 308 if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody ==
0) { | 308 tokenBuffer.setLength(0); |
| 309 setTitle(tokenBuffer.toString().trim()); | 309 return; |
| 310 } | 310 } |
| 311 textBuffer.setLength(0); | 311 |
| 312 tokenBuffer.setLength(0); | 312 final int length = tokenBuffer.length(); |
| 313 return; | 313 switch (length) { |
| 314 } | 314 case 0: |
| 315 | 315 return; |
| 316 final int length = tokenBuffer.length(); | 316 case 1: |
| 317 switch (length) { | 317 if (sbLastWasWhitespace) { |
| 318 case 0: | 318 textBuffer.setLength(0); |
| 319 return; | 319 tokenBuffer.setLength(0); |
| 320 case 1: | 320 return; |
| 321 if (sbLastWasWhitespace) { | 321 } |
| 322 textBuffer.setLength(0); | 322 } |
| 323 tokenBuffer.setLength(0); | 323 final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer); |
| 324 return; | 324 |
| 325 } | 325 int numWords = 0; |
| 326 } | 326 int numLinkedWords = 0; |
| 327 final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer); | 327 int numWrappedLines = 0; |
| 328 | 328 int currentLineLength = -1; // don't count the first space |
| 329 int numWords = 0; | 329 final int maxLineLength = 80; |
| 330 int numLinkedWords = 0; | 330 int numTokens = 0; |
| 331 int numWrappedLines = 0; | 331 int numWordsCurrentLine = 0; |
| 332 int currentLineLength = -1; // don't count the first space | 332 |
| 333 final int maxLineLength = 80; | 333 for (String token : tokens) { |
| 334 int numTokens = 0; | 334 if (ANCHOR_TEXT_START.equals(token)) { |
| 335 int numWordsCurrentLine = 0; | 335 inAnchorText = true; |
| 336 | 336 } else if (ANCHOR_TEXT_END.equals(token)) { |
| 337 for (String token : tokens) { | 337 inAnchorText = false; |
| 338 if (ANCHOR_TEXT_START.equals(token)) { | 338 } else if (isWord(token)) { |
| 339 inAnchorText = true; | 339 numTokens++; |
| 340 } else if (ANCHOR_TEXT_END.equals(token)) { | 340 numWords++; |
| 341 inAnchorText = false; | 341 numWordsCurrentLine++; |
| 342 } else if (isWord(token)) { | 342 if (inAnchorText) { |
| 343 numTokens++; | 343 numLinkedWords++; |
| 344 numWords++; | 344 } |
| 345 numWordsCurrentLine++; | 345 final int tokenLength = token.length(); |
| 346 if (inAnchorText) { | 346 currentLineLength += tokenLength + 1; |
| 347 numLinkedWords++; | 347 if (currentLineLength > maxLineLength) { |
| 348 } | 348 numWrappedLines++; |
| 349 final int tokenLength = token.length(); | 349 currentLineLength = tokenLength; |
| 350 currentLineLength += tokenLength + 1; | 350 numWordsCurrentLine = 1; |
| 351 if (currentLineLength > maxLineLength) { | 351 } |
| 352 numWrappedLines++; | 352 } else { |
| 353 currentLineLength = tokenLength; | 353 numTokens++; |
| 354 numWordsCurrentLine = 1; | 354 } |
| 355 } | 355 } |
| 356 } else { | 356 if (numTokens == 0) { |
| 357 numTokens++; | 357 return; |
| 358 } | 358 } |
| 359 } | 359 int numWordsInWrappedLines; |
| 360 if (numTokens == 0) { | 360 if (numWrappedLines == 0) { |
| 361 return; | 361 numWordsInWrappedLines = numWords; |
| 362 } | 362 numWrappedLines = 1; |
| 363 int numWordsInWrappedLines; | 363 } else { |
| 364 if (numWrappedLines == 0) { | 364 numWordsInWrappedLines = numWords - numWordsCurrentLine; |
| 365 numWordsInWrappedLines = numWords; | 365 } |
| 366 numWrappedLines = 1; | 366 |
| 367 } else { | 367 TextBlock tb = new TextBlock(StringUtil.javaTrim(textBuffer.toString()), |
| 368 numWordsInWrappedLines = numWords - numWordsCurrentLine; | 368 currentContainedTextElements, numWords, numLinkedWords, |
| 369 } | 369 numWordsInWrappedLines, numWrappedLines, offsetBlocks); |
| 370 | 370 currentContainedTextElements = new HashSet<Integer>(); |
| 371 TextBlock tb = new TextBlock(StringUtil.javaTrim(textBuffer.toSt
ring()), | 371 |
| 372 currentContainedTextElements, numWords, numLinke
dWords, | 372 offsetBlocks++; |
| 373 numWordsInWrappedLines, numWrappedLines, offsetB
locks); | 373 |
| 374 currentContainedTextElements = new HashSet<Integer>(); | 374 textBuffer.setLength(0); |
| 375 | 375 tokenBuffer.setLength(0); |
| 376 offsetBlocks++; | 376 |
| 377 | 377 tb.setTagLevel(blockTagLevel); |
| 378 textBuffer.setLength(0); | 378 addTextBlock(tb); |
| 379 tokenBuffer.setLength(0); | 379 blockTagLevel = -1; |
| 380 | 380 } |
| 381 tb.setTagLevel(blockTagLevel); | 381 |
| 382 addTextBlock(tb); | 382 protected void addTextBlock(final TextBlock tb) { |
| 383 blockTagLevel = -1; | 383 |
| 384 } | 384 for (Integer l : fontSizeStack) { |
| 385 | 385 if (l != null) { |
| 386 protected void addTextBlock(final TextBlock tb) { | 386 tb.addLabel("font-" + l); |
| 387 | 387 break; |
| 388 for (Integer l : fontSizeStack) { | 388 } |
| 389 if (l != null) { | 389 } |
| 390 tb.addLabel("font-" + l); | 390 for (LinkedList<LabelAction> labelStack : labelStacks) { |
| 391 break; | 391 if (labelStack != null) { |
| 392 } | 392 for (LabelAction labels : labelStack) { |
| 393 } | 393 if (labels != null) { |
| 394 for (LinkedList<LabelAction> labelStack : labelStacks) { | 394 labels.addTo(tb); |
| 395 if (labelStack != null) { | 395 } |
| 396 for (LabelAction labels : labelStack) { | 396 } |
| 397 if (labels != null) { | 397 } |
| 398 labels.addTo(tb); | 398 } |
| 399 } | 399 |
| 400 } | 400 textBlocks.add(tb); |
| 401 } | 401 } |
| 402 } | 402 |
| 403 | 403 public static boolean isWord(final String token) { |
| 404 textBlocks.add(tb); | 404 return PAT_VALID_WORD_CHARACTER.matcher(token).find(); |
| 405 } | 405 } |
| 406 | 406 |
| 407 public static boolean isWord(final String token) { | 407 static private enum Event { |
| 408 return PAT_VALID_WORD_CHARACTER.matcher(token).find(); | 408 START_TAG, END_TAG, CHARACTERS, WHITESPACE |
| 409 } | 409 } |
| 410 | 410 |
| 411 static private enum Event { | 411 |
| 412 START_TAG, END_TAG, CHARACTERS, WHITESPACE | 412 /** |
| 413 } | 413 * Returns a {@link TextDocument} containing the extracted {@link TextBlock} |
| 414 | 414 * s. NOTE: Only call this after parsing. |
| 415 public String getTitle() { | 415 * |
| 416 return title; | 416 * @return The {@link TextDocument} |
| 417 } | 417 */ |
| 418 | 418 public TextDocument toTextDocument() { |
| 419 public void setTitle(String s) { | 419 // just to be sure |
| 420 if (s == null || s.length() == 0) { | 420 flushBlock(); |
| 421 return; | 421 // TODO(yfriedman): When BoilerpipeHTMLContentHandler is finished being
moved to |
| 422 } | 422 // DomToSaxVisitor, we should be able to set Title directly. |
| 423 title = s; | 423 return new TextDocument(null, getTextBlocks()); |
| 424 } | 424 } |
| 425 | 425 |
| 426 /** | 426 public void addWhitespaceIfNecessary() { |
| 427 * Returns a {@link TextDocument} containing the extracted {@link TextBl
ock} | 427 if (!sbLastWasWhitespace) { |
| 428 * s. NOTE: Only call this after parsing. | 428 tokenBuffer.append(' '); |
| 429 * | 429 textBuffer.append(' '); |
| 430 * @return The {@link TextDocument} | 430 sbLastWasWhitespace = true; |
| 431 */ | 431 } |
| 432 public TextDocument toTextDocument() { | 432 } |
| 433 // just to be sure | 433 |
| 434 flushBlock(); | 434 public void addLabelAction(final LabelAction la) |
| 435 | 435 throws IllegalStateException { |
| 436 return new TextDocument(getTitle(), getTextBlocks()); | 436 LinkedList<LabelAction> labelStack = labelStacks.getLast(); |
| 437 } | 437 if (labelStack == null) { |
| 438 | 438 labelStack = new LinkedList<LabelAction>(); |
| 439 public void addWhitespaceIfNecessary() { | 439 labelStacks.removeLast(); |
| 440 if (!sbLastWasWhitespace) { | 440 labelStacks.add(labelStack); |
| 441 tokenBuffer.append(' '); | 441 } |
| 442 textBuffer.append(' '); | 442 labelStack.add(la); |
| 443 sbLastWasWhitespace = true; | 443 } |
| 444 } | 444 |
| 445 } | 445 private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern |
| 446 | 446 .compile( |
| 447 public void addLabelAction(final LabelAction la) | |
| 448 throws IllegalStateException { | |
| 449 LinkedList<LabelAction> labelStack = labelStacks.getLast(); | |
| 450 if (labelStack == null) { | |
| 451 labelStack = new LinkedList<LabelAction>(); | |
| 452 labelStacks.removeLast(); | |
| 453 labelStacks.add(labelStack); | |
| 454 } | |
| 455 labelStack.add(la); | |
| 456 } | |
| 457 | |
| 458 private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern | |
| 459 .compile( | |
| 460 "[" + | 447 "[" + |
| 461 "\u0030-\u0039\u0041-\u005a\u0061-\u007a\u00aa\u
00b2-\u00b3\u00b5\u00b9-\u00ba\u00bc-\u00be\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02
36\u0250-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ee\u037a\u0386\u0388-\u038a\u038c\u
038e-\u03a1\u03a3-\u03ce\u03d0-\u03f5\u03f7-\u03fb\u0400-\u0481\u048a-\u04ce\u04
d0-\u04f5\u04f8-\u04f9\u0500-\u050f\u0531-\u0556\u0559\u0561-\u0587\u05d0-\u05ea
\u05f0-\u05f2\u0621-\u063a\u0640-\u064a\u0660-\u0669\u066e-\u066f\u0671-\u06d3\u
06d5\u06e5-\u06e6\u06ee-\u06fc\u06ff\u0710\u0712-\u072f\u074d-\u074f\u0780-\u07a
5\u07b1\u0904-\u0939\u093d\u0950\u0958-\u0961\u0966-\u096f\u0985-\u098c\u098f-\u
0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09dc-\u09dd\u09df-\u09e
1\u09e6-\u09f1\u09f4-\u09f9\u0a05-\u0a0a\u0a0f-\u0a10\u0a13-\u0a28\u0a2a-\u0a30\
u0a32-\u0a33\u0a35-\u0a36\u0a38-\u0a39\u0a59-\u0a5c\u0a5e\u0a66-\u0a6f\u0a72-\u0
a74\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2-\u0ab3\u0ab5-\u0ab
9\u0abd\u0ad0\u0ae0-\u0ae1\u0ae6-\u0aef\u0b05-\u0b0c\u0b0f-\u0b10\u0b13-\u0b28\u
0b2a-\u0b30\u0b32-\u0b33\u0b35-\u0b39\u0b3d\u0b5c-\u0b5d\u0b5f-\u0b61\u0b66-\u0b
6f\u0b71\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99-\u0b9a\u0b9c\u0b9e-\
u0b9f\u0ba3-\u0ba4\u0ba8-\u0baa\u0bae-\u0bb5\u0bb7-\u0bb9\u0be7-\u0bf2\u0c05-\u0
c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c33\u0c35-\u0c39\u0c60-\u0c61\u0c66-\u0c6
f\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbd\u0cde\u
0ce0-\u0ce1\u0ce6-\u0cef\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d28\u0d2a-\u0d39\u0d
60-\u0d61\u0d66-\u0d6f\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6
\u0e01-\u0e30\u0e32-\u0e33\u0e40-\u0e46\u0e50-\u0e59\u0e81-\u0e82\u0e84\u0e87-\u
0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab
\u0ead-\u0eb0\u0eb2-\u0eb3\u0ebd\u0ec0-\u0ec4\u0ec6\u0ed0-\u0ed9\u0edc-\u0edd\u0
f00\u0f20-\u0f33\u0f40-\u0f47\u0f49-\u0f6a\u0f88-\u0f8b\u1000-\u1021\u1023-\u102
7\u1029-\u102a\u1040-\u1049\u1050-\u1055\u10a0-\u10c5\u10d0-\u10f8\u1100-\u1159\
u115f-\u11a2\u11a8-\u11f9\u1200-\u1206\u1208-\u1246\u1248\u124a-\u124d\u1250-\u1
256\u1258\u125a-\u125d\u1260-\u1286\u1288\u128a-\u128d\u1290-\u12ae\u12b0\u12b2-
\u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12ce\u12d0-\u12d6\u12d8-\u12ee\u1
2f0-\u130e\u1310\u1312-\u1315\u1318-\u131e\u1320-\u1346\u1348-\u135a\u1369-\u137
c\u13a0-\u13f4\u1401-\u166c\u166f-\u1676\u1681-\u169a\u16a0-\u16ea\u16ee-\u16f0\
u1700-\u170c\u170e-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176c\u176e-\u1770\u1
780-\u17b3\u17d7\u17dc\u17e0-\u17e9\u17f0-\u17f9\u1810-\u1819\u1820-\u1877\u1880
-\u18a8\u1900-\u191c\u1946-\u196d\u1970-\u1974\u1d00-\u1d6b\u1e00-\u1e9b\u1ea0-\
u1ef9\u1f00-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f
5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\
u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2070-\u2071\u2
074-\u2079\u207f-\u2089\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\
u2128\u212a-\u212d\u212f-\u2131\u2133-\u2139\u213d-\u213f\u2145-\u2149\u2153-\u2
183\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3005-\u3007\u3021-\u3029\u3031-\u303
5\u3038-\u303c\u3041-\u3096\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312c\
u3131-\u318e\u3192-\u3195\u31a0-\u31b7\u31f0-\u31ff\u3220-\u3229\u3251-\u325f\u3
280-\u3289\u32b1-\u32bf\u3400-\u4db5\u4e00-\u9fa5\ua000-\ua48c\uac00-\ud7a3\uf90
0-\ufa2d\ufa30-\ufa6a\ufb00-\ufb06\ufb13-\ufb17\ufb1d\ufb1f-\ufb28\ufb2a-\ufb36\
ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\uf
d8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe70-\ufe74\ufe76-\ufefc\uff10-\uff19\uff21-\uff3
a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc" | 448 "\u0030-\u0039\u0041-\u005a\u0061-\u007a\u00aa\u
00b2-\u00b3\u00b5\u00b9-\u00ba\u00bc-\u00be\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02
36\u0250-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ee\u037a\u0386\u0388-\u038a\u038c\u
038e-\u03a1\u03a3-\u03ce\u03d0-\u03f5\u03f7-\u03fb\u0400-\u0481\u048a-\u04ce\u04
d0-\u04f5\u04f8-\u04f9\u0500-\u050f\u0531-\u0556\u0559\u0561-\u0587\u05d0-\u05ea
\u05f0-\u05f2\u0621-\u063a\u0640-\u064a\u0660-\u0669\u066e-\u066f\u0671-\u06d3\u
06d5\u06e5-\u06e6\u06ee-\u06fc\u06ff\u0710\u0712-\u072f\u074d-\u074f\u0780-\u07a
5\u07b1\u0904-\u0939\u093d\u0950\u0958-\u0961\u0966-\u096f\u0985-\u098c\u098f-\u
0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09dc-\u09dd\u09df-\u09e
1\u09e6-\u09f1\u09f4-\u09f9\u0a05-\u0a0a\u0a0f-\u0a10\u0a13-\u0a28\u0a2a-\u0a30\
u0a32-\u0a33\u0a35-\u0a36\u0a38-\u0a39\u0a59-\u0a5c\u0a5e\u0a66-\u0a6f\u0a72-\u0
a74\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2-\u0ab3\u0ab5-\u0ab
9\u0abd\u0ad0\u0ae0-\u0ae1\u0ae6-\u0aef\u0b05-\u0b0c\u0b0f-\u0b10\u0b13-\u0b28\u
0b2a-\u0b30\u0b32-\u0b33\u0b35-\u0b39\u0b3d\u0b5c-\u0b5d\u0b5f-\u0b61\u0b66-\u0b
6f\u0b71\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99-\u0b9a\u0b9c\u0b9e-\
u0b9f\u0ba3-\u0ba4\u0ba8-\u0baa\u0bae-\u0bb5\u0bb7-\u0bb9\u0be7-\u0bf2\u0c05-\u0
c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c33\u0c35-\u0c39\u0c60-\u0c61\u0c66-\u0c6
f\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbd\u0cde\u
0ce0-\u0ce1\u0ce6-\u0cef\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d28\u0d2a-\u0d39\u0d
60-\u0d61\u0d66-\u0d6f\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6
\u0e01-\u0e30\u0e32-\u0e33\u0e40-\u0e46\u0e50-\u0e59\u0e81-\u0e82\u0e84\u0e87-\u
0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab
\u0ead-\u0eb0\u0eb2-\u0eb3\u0ebd\u0ec0-\u0ec4\u0ec6\u0ed0-\u0ed9\u0edc-\u0edd\u0
f00\u0f20-\u0f33\u0f40-\u0f47\u0f49-\u0f6a\u0f88-\u0f8b\u1000-\u1021\u1023-\u102
7\u1029-\u102a\u1040-\u1049\u1050-\u1055\u10a0-\u10c5\u10d0-\u10f8\u1100-\u1159\
u115f-\u11a2\u11a8-\u11f9\u1200-\u1206\u1208-\u1246\u1248\u124a-\u124d\u1250-\u1
256\u1258\u125a-\u125d\u1260-\u1286\u1288\u128a-\u128d\u1290-\u12ae\u12b0\u12b2-
\u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12ce\u12d0-\u12d6\u12d8-\u12ee\u1
2f0-\u130e\u1310\u1312-\u1315\u1318-\u131e\u1320-\u1346\u1348-\u135a\u1369-\u137
c\u13a0-\u13f4\u1401-\u166c\u166f-\u1676\u1681-\u169a\u16a0-\u16ea\u16ee-\u16f0\
u1700-\u170c\u170e-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176c\u176e-\u1770\u1
780-\u17b3\u17d7\u17dc\u17e0-\u17e9\u17f0-\u17f9\u1810-\u1819\u1820-\u1877\u1880
-\u18a8\u1900-\u191c\u1946-\u196d\u1970-\u1974\u1d00-\u1d6b\u1e00-\u1e9b\u1ea0-\
u1ef9\u1f00-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f
5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\
u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2070-\u2071\u2
074-\u2079\u207f-\u2089\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\
u2128\u212a-\u212d\u212f-\u2131\u2133-\u2139\u213d-\u213f\u2145-\u2149\u2153-\u2
183\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3005-\u3007\u3021-\u3029\u3031-\u303
5\u3038-\u303c\u3041-\u3096\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312c\
u3131-\u318e\u3192-\u3195\u31a0-\u31b7\u31f0-\u31ff\u3220-\u3229\u3251-\u325f\u3
280-\u3289\u32b1-\u32bf\u3400-\u4db5\u4e00-\u9fa5\ua000-\ua48c\uac00-\ud7a3\uf90
0-\ufa2d\ufa30-\ufa6a\ufb00-\ufb06\ufb13-\ufb17\ufb1d\ufb1f-\ufb28\ufb2a-\ufb36\
ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\uf
d8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe70-\ufe74\ufe76-\ufefc\uff10-\uff19\uff21-\uff3
a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc" |
| 462 + "]"); | 449 + "]"); |
| 463 | 450 |
| 464 } | 451 } |
| OLD | NEW |