OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 /** | 5 /** |
6 * boilerpipe | 6 * boilerpipe |
7 * | 7 * |
8 * Copyright (c) 2009 Christian Kohlschütter | 8 * Copyright (c) 2009 Christian Kohlschütter |
9 * | 9 * |
10 * The author licenses this file to You under the Apache License, Version 2.0 | 10 * The author licenses this file to You under the Apache License, Version 2.0 |
11 * (the "License"); you may not use this file except in compliance with | 11 * (the "License"); you may not use this file except in compliance with |
12 * the License. You may obtain a copy of the License at | 12 * the License. You may obtain a copy of the License at |
13 * | 13 * |
14 * http://www.apache.org/licenses/LICENSE-2.0 | 14 * http://www.apache.org/licenses/LICENSE-2.0 |
15 * | 15 * |
16 * Unless required by applicable law or agreed to in writing, software | 16 * Unless required by applicable law or agreed to in writing, software |
17 * distributed under the License is distributed on an "AS IS" BASIS, | 17 * distributed under the License is distributed on an "AS IS" BASIS, |
18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
19 * See the License for the specific language governing permissions and | 19 * See the License for the specific language governing permissions and |
20 * limitations under the License. | 20 * limitations under the License. |
21 */ | 21 */ |
22 package de.l3s.boilerpipe.sax; | 22 package de.l3s.boilerpipe.sax; |
23 | 23 |
| 24 import com.dom_distiller.client.StringUtil; |
| 25 |
| 26 import de.l3s.boilerpipe.document.TextBlock; |
| 27 import de.l3s.boilerpipe.document.TextDocument; |
| 28 import de.l3s.boilerpipe.labels.LabelAction; |
| 29 import de.l3s.boilerpipe.util.UnicodeTokenizer; |
| 30 |
| 31 import org.xml.sax.Attributes; |
| 32 import org.xml.sax.ContentHandler; |
| 33 import org.xml.sax.Locator; |
| 34 import org.xml.sax.SAXException; |
| 35 |
24 import java.util.ArrayList; | 36 import java.util.ArrayList; |
25 import java.util.HashSet; | 37 import java.util.HashSet; |
26 import java.util.LinkedList; | 38 import java.util.LinkedList; |
27 import java.util.List; | 39 import java.util.List; |
28 import java.util.Map; | 40 import java.util.Map; |
29 import java.util.regex.Pattern; | 41 import java.util.regex.Pattern; |
30 | 42 |
31 import org.xml.sax.Attributes; | |
32 import org.xml.sax.ContentHandler; | |
33 import org.xml.sax.Locator; | |
34 import org.xml.sax.SAXException; | |
35 | |
36 import de.l3s.boilerpipe.document.TextBlock; | |
37 import de.l3s.boilerpipe.document.TextDocument; | |
38 import de.l3s.boilerpipe.labels.LabelAction; | |
39 import de.l3s.boilerpipe.util.UnicodeTokenizer; | |
40 | |
41 import com.dom_distiller.client.StringUtil; | |
42 | |
43 /** | 43 /** |
44 * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can | 44 * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can |
45 * be used by different parser implementations, e.g. NekoHTML and TagSoup. | 45 * be used by different parser implementations, e.g. NekoHTML and TagSoup. |
46 * | 46 * |
47 * @author Christian Kohlschütter | 47 * @author Christian Kohlschütter |
48 */ | 48 */ |
49 public class BoilerpipeHTMLContentHandler implements ContentHandler { | 49 public class BoilerpipeHTMLContentHandler implements ContentHandler { |
50 | 50 |
51 private final Map<String, TagAction> tagActions; | 51 private final Map<String, TagAction> tagActions; |
52 private String title = null; | 52 |
53 | 53 static final String ANCHOR_TEXT_START = "$\ue00a<"; |
54 static final String ANCHOR_TEXT_START = "$\ue00a<"; | 54 static final String ANCHOR_TEXT_END = ">\ue00a$"; |
55 static final String ANCHOR_TEXT_END = ">\ue00a$"; | 55 |
56 | 56 StringBuilder tokenBuffer = new StringBuilder(); |
57 StringBuilder tokenBuffer = new StringBuilder(); | 57 StringBuilder textBuffer = new StringBuilder(); |
58 StringBuilder textBuffer = new StringBuilder(); | 58 |
59 | 59 int inBody = 0; |
60 int inBody = 0; | 60 int inAnchor = 0; |
61 int inAnchor = 0; | 61 int inIgnorableElement = 0; |
62 int inIgnorableElement = 0; | 62 |
63 | 63 int tagLevel = 0; |
64 int tagLevel = 0; | 64 int blockTagLevel = -1; |
65 int blockTagLevel = -1; | 65 |
66 | 66 boolean sbLastWasWhitespace = false; |
67 boolean sbLastWasWhitespace = false; | 67 private int textElementIdx = 0; |
68 private int textElementIdx = 0; | 68 |
69 | 69 private final List<TextBlock> textBlocks = new ArrayList<TextBlock>(); |
70 private final List<TextBlock> textBlocks = new ArrayList<TextBlock>(); | 70 |
71 | 71 private String lastStartTag = null; |
72 private String lastStartTag = null; | 72 @SuppressWarnings("unused") |
73 @SuppressWarnings("unused") | 73 private String lastEndTag = null; |
74 private String lastEndTag = null; | 74 @SuppressWarnings("unused") |
75 @SuppressWarnings("unused") | 75 private Event lastEvent = null; |
76 private Event lastEvent = null; | 76 |
77 | 77 private int offsetBlocks = 0; |
78 private int offsetBlocks = 0; | 78 private HashSet<Integer> currentContainedTextElements = new HashSet<Integer>
(); |
79 private HashSet<Integer> currentContainedTextElements = new HashSet<Inte
ger>(); | 79 |
80 | 80 private boolean flush = false; |
81 private boolean flush = false; | 81 boolean inAnchorText = false; |
82 boolean inAnchorText = false; | 82 |
83 | 83 LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedList<
LabelAction>>(); |
84 LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedL
ist<LabelAction>>(); | 84 LinkedList<Integer> fontSizeStack = new LinkedList<Integer>(); |
85 LinkedList<Integer> fontSizeStack = new LinkedList<Integer>(); | 85 |
86 | 86 /** |
87 /** | 87 * Recycles this instance. |
88 * Recycles this instance. | 88 */ |
89 */ | 89 public void recycle() { |
90 public void recycle() { | 90 tokenBuffer.setLength(0); |
91 tokenBuffer.setLength(0); | 91 textBuffer.setLength(0); |
92 textBuffer.setLength(0); | 92 |
93 | 93 inBody = 0; |
94 inBody = 0; | 94 inAnchor = 0; |
95 inAnchor = 0; | 95 inIgnorableElement = 0; |
96 inIgnorableElement = 0; | 96 sbLastWasWhitespace = false; |
97 sbLastWasWhitespace = false; | 97 textElementIdx = 0; |
98 textElementIdx = 0; | 98 |
99 | 99 textBlocks.clear(); |
100 textBlocks.clear(); | 100 |
101 | 101 lastStartTag = null; |
102 lastStartTag = null; | 102 lastEndTag = null; |
103 lastEndTag = null; | 103 lastEvent = null; |
104 lastEvent = null; | 104 |
105 | 105 offsetBlocks = 0; |
106 offsetBlocks = 0; | 106 currentContainedTextElements.clear(); |
107 currentContainedTextElements.clear(); | 107 |
108 | 108 flush = false; |
109 flush = false; | 109 inAnchorText = false; |
110 inAnchorText = false; | 110 } |
111 } | 111 |
112 | 112 /** |
113 /** | 113 * Constructs a {@link BoilerpipeHTMLContentHandler} using the |
114 * Constructs a {@link BoilerpipeHTMLContentHandler} using the | 114 * {@link DefaultTagActionMap}. |
115 * {@link DefaultTagActionMap}. | 115 */ |
116 */ | 116 public BoilerpipeHTMLContentHandler() { |
117 public BoilerpipeHTMLContentHandler() { | 117 this(DefaultTagActionMap.INSTANCE); |
118 this(DefaultTagActionMap.INSTANCE); | 118 } |
119 } | 119 |
120 | 120 /** |
121 /** | 121 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given |
122 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given | 122 * {@link TagActionMap}. |
123 * {@link TagActionMap}. | 123 * |
124 * | 124 * @param tagActions |
125 * @param tagActions | 125 * The {@link TagActionMap} to use, e.g. |
126 * The {@link TagActionMap} to use, e.g. | 126 * {@link DefaultTagActionMap}. |
127 * {@link DefaultTagActionMap}. | 127 */ |
128 */ | 128 public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { |
129 public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { | 129 this.tagActions = tagActions; |
130 this.tagActions = tagActions; | 130 } |
131 } | 131 |
132 | 132 @Override |
133 // @Override | 133 public void endDocument() throws SAXException { |
134 public void endDocument() throws SAXException { | 134 flushBlock(); |
135 flushBlock(); | 135 } |
136 } | 136 |
137 | 137 @Override |
138 // @Override | 138 public void endPrefixMapping(String prefix) throws SAXException { |
139 public void endPrefixMapping(String prefix) throws SAXException { | 139 } |
140 } | 140 |
141 | 141 @Override |
142 // @Override | 142 public void ignorableWhitespace(char[] ch, int start, int length) |
143 public void ignorableWhitespace(char[] ch, int start, int length) | 143 throws SAXException { |
144 throws SAXException { | 144 if (!sbLastWasWhitespace) { |
145 if (!sbLastWasWhitespace) { | 145 textBuffer.append(' '); |
146 textBuffer.append(' '); | 146 tokenBuffer.append(' '); |
147 tokenBuffer.append(' '); | 147 } |
148 } | 148 sbLastWasWhitespace = true; |
149 sbLastWasWhitespace = true; | 149 } |
150 } | 150 |
151 | 151 @Override |
152 // @Override | 152 public void processingInstruction(String target, String data) |
153 public void processingInstruction(String target, String data) | 153 throws SAXException { |
154 throws SAXException { | 154 } |
155 } | 155 |
156 | 156 @Override |
157 // @Override | 157 public void setDocumentLocator(Locator locator) { |
158 public void setDocumentLocator(Locator locator) { | 158 } |
159 } | 159 |
160 | 160 @Override |
161 // @Override | 161 public void skippedEntity(String name) throws SAXException { |
162 public void skippedEntity(String name) throws SAXException { | 162 } |
163 } | 163 |
164 | 164 @Override |
165 // @Override | 165 public void startDocument() throws SAXException { |
166 public void startDocument() throws SAXException { | 166 } |
167 } | 167 |
168 | 168 @Override |
169 // @Override | 169 public void startPrefixMapping(String prefix, String uri) |
170 public void startPrefixMapping(String prefix, String uri) | 170 throws SAXException { |
171 throws SAXException { | 171 } |
172 } | 172 |
173 | 173 @Override |
174 // @Override | 174 public void startElement(String uri, String localName, String qName, |
175 public void startElement(String uri, String localName, String qName, | 175 Attributes atts) throws SAXException { |
176 Attributes atts) throws SAXException { | 176 labelStacks.add(null); |
177 labelStacks.add(null); | 177 |
178 | 178 TagAction ta = tagActions.get(localName); |
179 TagAction ta = tagActions.get(localName); | 179 if (ta != null) { |
180 if (ta != null) { | 180 if(ta.changesTagLevel()) { |
181 if(ta.changesTagLevel()) { | 181 tagLevel++; |
182 tagLevel++; | 182 } |
183 } | 183 flush = ta.start(this, localName, qName, atts) | flush; |
184 flush = ta.start(this, localName, qName, atts) | flush; | 184 } else { |
185 } else { | 185 tagLevel++; |
186 tagLevel++; | 186 flush = true; |
187 flush = true; | 187 } |
188 } | 188 |
189 | 189 lastEvent = Event.START_TAG; |
190 lastEvent = Event.START_TAG; | 190 lastStartTag = localName; |
191 lastStartTag = localName; | 191 } |
192 } | 192 |
193 | 193 @Override |
194 // @Override | 194 public void endElement(String uri, String localName, String qName) |
195 public void endElement(String uri, String localName, String qName) | 195 throws SAXException { |
196 throws SAXException { | 196 TagAction ta = tagActions.get(localName); |
197 TagAction ta = tagActions.get(localName); | 197 if (ta != null) { |
198 if (ta != null) { | 198 flush = ta.end(this, localName, qName) | flush; |
199 flush = ta.end(this, localName, qName) | flush; | 199 } else { |
200 } else { | 200 flush = true; |
201 flush = true; | 201 } |
202 } | 202 |
203 | 203 if(ta == null || ta.changesTagLevel()) { |
204 if(ta == null || ta.changesTagLevel()) { | 204 tagLevel--; |
205 tagLevel--; | 205 } |
206 } | 206 |
207 | 207 if (flush) { |
208 if (flush) { | 208 flushBlock(); |
209 flushBlock(); | 209 } |
210 } | 210 |
211 | 211 lastEvent = Event.END_TAG; |
212 lastEvent = Event.END_TAG; | 212 lastEndTag = localName; |
213 lastEndTag = localName; | 213 |
214 | 214 labelStacks.removeLast(); |
215 labelStacks.removeLast(); | 215 } |
216 } | 216 |
217 | 217 @Override |
218 // @Override | 218 public void characters(char[] ch, int start, int length) |
219 public void characters(char[] ch, int start, int length) | 219 throws SAXException { |
220 throws SAXException { | 220 textElementIdx++; |
221 textElementIdx++; | 221 |
222 | 222 |
223 | 223 if (flush) { |
224 if (flush) { | 224 flushBlock(); |
225 flushBlock(); | 225 flush = false; |
226 flush = false; | 226 } |
227 } | 227 |
228 | 228 if (inIgnorableElement != 0) { |
229 if (inIgnorableElement != 0) { | 229 return; |
230 return; | 230 } |
231 } | 231 |
232 | 232 char c; |
233 char c; | 233 boolean startWhitespace = false; |
234 boolean startWhitespace = false; | 234 boolean endWhitespace = false; |
235 boolean endWhitespace = false; | 235 if (length == 0) { |
236 if (length == 0) { | 236 return; |
237 return; | 237 } |
238 } | 238 |
239 | 239 final int end = start + length; |
240 final int end = start + length; | 240 for (int i = start; i < end; i++) { |
241 for (int i = start; i < end; i++) { | 241 if (StringUtil.isWhitespace(ch[i])) { |
242 if (StringUtil.isWhitespace(ch[i])) { | 242 ch[i] = ' '; |
243 ch[i] = ' '; | 243 } |
244 } | 244 } |
245 } | 245 while (start < end) { |
246 while (start < end) { | 246 c = ch[start]; |
247 c = ch[start]; | 247 if (c == ' ') { |
248 if (c == ' ') { | 248 startWhitespace = true; |
249 startWhitespace = true; | 249 start++; |
250 start++; | 250 length--; |
251 length--; | 251 } else { |
252 } else { | 252 break; |
253 break; | 253 } |
254 } | 254 } |
255 } | 255 while (length > 0) { |
256 while (length > 0) { | 256 c = ch[start + length - 1]; |
257 c = ch[start + length - 1]; | 257 if (c == ' ') { |
258 if (c == ' ') { | 258 endWhitespace = true; |
259 endWhitespace = true; | 259 length--; |
260 length--; | 260 } else { |
261 } else { | 261 break; |
262 break; | 262 } |
263 } | 263 } |
264 } | 264 if (length == 0) { |
265 if (length == 0) { | 265 if (startWhitespace || endWhitespace) { |
266 if (startWhitespace || endWhitespace) { | 266 if (!sbLastWasWhitespace) { |
267 if (!sbLastWasWhitespace) { | 267 textBuffer.append(' '); |
268 textBuffer.append(' '); | 268 tokenBuffer.append(' '); |
269 tokenBuffer.append(' '); | 269 } |
270 } | 270 sbLastWasWhitespace = true; |
271 sbLastWasWhitespace = true; | 271 } else { |
272 } else { | 272 sbLastWasWhitespace = false; |
273 sbLastWasWhitespace = false; | 273 } |
274 } | 274 lastEvent = Event.WHITESPACE; |
275 lastEvent = Event.WHITESPACE; | 275 return; |
276 return; | 276 } |
277 } | 277 if (startWhitespace) { |
278 if (startWhitespace) { | 278 if (!sbLastWasWhitespace) { |
279 if (!sbLastWasWhitespace) { | 279 textBuffer.append(' '); |
280 textBuffer.append(' '); | 280 tokenBuffer.append(' '); |
281 tokenBuffer.append(' '); | 281 } |
282 } | 282 } |
283 } | 283 |
284 | 284 if (blockTagLevel == -1) { |
285 if (blockTagLevel == -1) { | 285 blockTagLevel = tagLevel; |
286 blockTagLevel = tagLevel; | 286 } |
287 } | 287 |
288 | 288 textBuffer.append(ch, start, length); |
289 textBuffer.append(ch, start, length); | 289 tokenBuffer.append(ch, start, length); |
290 tokenBuffer.append(ch, start, length); | 290 if (endWhitespace) { |
291 if (endWhitespace) { | 291 textBuffer.append(' '); |
292 textBuffer.append(' '); | 292 tokenBuffer.append(' '); |
293 tokenBuffer.append(' '); | 293 } |
294 } | 294 |
295 | 295 sbLastWasWhitespace = endWhitespace; |
296 sbLastWasWhitespace = endWhitespace; | 296 lastEvent = Event.CHARACTERS; |
297 lastEvent = Event.CHARACTERS; | 297 |
298 | 298 currentContainedTextElements.add(textElementIdx); |
299 currentContainedTextElements.add(textElementIdx); | 299 } |
300 } | 300 |
301 | 301 List<TextBlock> getTextBlocks() { |
302 List<TextBlock> getTextBlocks() { | 302 return textBlocks; |
303 return textBlocks; | 303 } |
304 } | 304 |
305 | 305 public void flushBlock() { |
306 public void flushBlock() { | 306 if (inBody == 0) { |
307 if (inBody == 0) { | 307 textBuffer.setLength(0); |
308 if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody ==
0) { | 308 tokenBuffer.setLength(0); |
309 setTitle(tokenBuffer.toString().trim()); | 309 return; |
310 } | 310 } |
311 textBuffer.setLength(0); | 311 |
312 tokenBuffer.setLength(0); | 312 final int length = tokenBuffer.length(); |
313 return; | 313 switch (length) { |
314 } | 314 case 0: |
315 | 315 return; |
316 final int length = tokenBuffer.length(); | 316 case 1: |
317 switch (length) { | 317 if (sbLastWasWhitespace) { |
318 case 0: | 318 textBuffer.setLength(0); |
319 return; | 319 tokenBuffer.setLength(0); |
320 case 1: | 320 return; |
321 if (sbLastWasWhitespace) { | 321 } |
322 textBuffer.setLength(0); | 322 } |
323 tokenBuffer.setLength(0); | 323 final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer); |
324 return; | 324 |
325 } | 325 int numWords = 0; |
326 } | 326 int numLinkedWords = 0; |
327 final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer); | 327 int numWrappedLines = 0; |
328 | 328 int currentLineLength = -1; // don't count the first space |
329 int numWords = 0; | 329 final int maxLineLength = 80; |
330 int numLinkedWords = 0; | 330 int numTokens = 0; |
331 int numWrappedLines = 0; | 331 int numWordsCurrentLine = 0; |
332 int currentLineLength = -1; // don't count the first space | 332 |
333 final int maxLineLength = 80; | 333 for (String token : tokens) { |
334 int numTokens = 0; | 334 if (ANCHOR_TEXT_START.equals(token)) { |
335 int numWordsCurrentLine = 0; | 335 inAnchorText = true; |
336 | 336 } else if (ANCHOR_TEXT_END.equals(token)) { |
337 for (String token : tokens) { | 337 inAnchorText = false; |
338 if (ANCHOR_TEXT_START.equals(token)) { | 338 } else if (isWord(token)) { |
339 inAnchorText = true; | 339 numTokens++; |
340 } else if (ANCHOR_TEXT_END.equals(token)) { | 340 numWords++; |
341 inAnchorText = false; | 341 numWordsCurrentLine++; |
342 } else if (isWord(token)) { | 342 if (inAnchorText) { |
343 numTokens++; | 343 numLinkedWords++; |
344 numWords++; | 344 } |
345 numWordsCurrentLine++; | 345 final int tokenLength = token.length(); |
346 if (inAnchorText) { | 346 currentLineLength += tokenLength + 1; |
347 numLinkedWords++; | 347 if (currentLineLength > maxLineLength) { |
348 } | 348 numWrappedLines++; |
349 final int tokenLength = token.length(); | 349 currentLineLength = tokenLength; |
350 currentLineLength += tokenLength + 1; | 350 numWordsCurrentLine = 1; |
351 if (currentLineLength > maxLineLength) { | 351 } |
352 numWrappedLines++; | 352 } else { |
353 currentLineLength = tokenLength; | 353 numTokens++; |
354 numWordsCurrentLine = 1; | 354 } |
355 } | 355 } |
356 } else { | 356 if (numTokens == 0) { |
357 numTokens++; | 357 return; |
358 } | 358 } |
359 } | 359 int numWordsInWrappedLines; |
360 if (numTokens == 0) { | 360 if (numWrappedLines == 0) { |
361 return; | 361 numWordsInWrappedLines = numWords; |
362 } | 362 numWrappedLines = 1; |
363 int numWordsInWrappedLines; | 363 } else { |
364 if (numWrappedLines == 0) { | 364 numWordsInWrappedLines = numWords - numWordsCurrentLine; |
365 numWordsInWrappedLines = numWords; | 365 } |
366 numWrappedLines = 1; | 366 |
367 } else { | 367 TextBlock tb = new TextBlock(StringUtil.javaTrim(textBuffer.toString()), |
368 numWordsInWrappedLines = numWords - numWordsCurrentLine; | 368 currentContainedTextElements, numWords, numLinkedWords, |
369 } | 369 numWordsInWrappedLines, numWrappedLines, offsetBlocks); |
370 | 370 currentContainedTextElements = new HashSet<Integer>(); |
371 TextBlock tb = new TextBlock(StringUtil.javaTrim(textBuffer.toSt
ring()), | 371 |
372 currentContainedTextElements, numWords, numLinke
dWords, | 372 offsetBlocks++; |
373 numWordsInWrappedLines, numWrappedLines, offsetB
locks); | 373 |
374 currentContainedTextElements = new HashSet<Integer>(); | 374 textBuffer.setLength(0); |
375 | 375 tokenBuffer.setLength(0); |
376 offsetBlocks++; | 376 |
377 | 377 tb.setTagLevel(blockTagLevel); |
378 textBuffer.setLength(0); | 378 addTextBlock(tb); |
379 tokenBuffer.setLength(0); | 379 blockTagLevel = -1; |
380 | 380 } |
381 tb.setTagLevel(blockTagLevel); | 381 |
382 addTextBlock(tb); | 382 protected void addTextBlock(final TextBlock tb) { |
383 blockTagLevel = -1; | 383 |
384 } | 384 for (Integer l : fontSizeStack) { |
385 | 385 if (l != null) { |
386 protected void addTextBlock(final TextBlock tb) { | 386 tb.addLabel("font-" + l); |
387 | 387 break; |
388 for (Integer l : fontSizeStack) { | 388 } |
389 if (l != null) { | 389 } |
390 tb.addLabel("font-" + l); | 390 for (LinkedList<LabelAction> labelStack : labelStacks) { |
391 break; | 391 if (labelStack != null) { |
392 } | 392 for (LabelAction labels : labelStack) { |
393 } | 393 if (labels != null) { |
394 for (LinkedList<LabelAction> labelStack : labelStacks) { | 394 labels.addTo(tb); |
395 if (labelStack != null) { | 395 } |
396 for (LabelAction labels : labelStack) { | 396 } |
397 if (labels != null) { | 397 } |
398 labels.addTo(tb); | 398 } |
399 } | 399 |
400 } | 400 textBlocks.add(tb); |
401 } | 401 } |
402 } | 402 |
403 | 403 public static boolean isWord(final String token) { |
404 textBlocks.add(tb); | 404 return PAT_VALID_WORD_CHARACTER.matcher(token).find(); |
405 } | 405 } |
406 | 406 |
407 public static boolean isWord(final String token) { | 407 static private enum Event { |
408 return PAT_VALID_WORD_CHARACTER.matcher(token).find(); | 408 START_TAG, END_TAG, CHARACTERS, WHITESPACE |
409 } | 409 } |
410 | 410 |
411 static private enum Event { | 411 |
412 START_TAG, END_TAG, CHARACTERS, WHITESPACE | 412 /** |
413 } | 413 * Returns a {@link TextDocument} containing the extracted {@link TextBlock} |
414 | 414 * s. NOTE: Only call this after parsing. |
415 public String getTitle() { | 415 * |
416 return title; | 416 * @return The {@link TextDocument} |
417 } | 417 */ |
418 | 418 public TextDocument toTextDocument() { |
419 public void setTitle(String s) { | 419 // just to be sure |
420 if (s == null || s.length() == 0) { | 420 flushBlock(); |
421 return; | 421 // TODO(yfriedman): When BoilerpipeHTMLContentHandler is finished being
moved to |
422 } | 422 // DomToSaxVisitor, we should be able to set Title directly. |
423 title = s; | 423 return new TextDocument(null, getTextBlocks()); |
424 } | 424 } |
425 | 425 |
426 /** | 426 public void addWhitespaceIfNecessary() { |
427 * Returns a {@link TextDocument} containing the extracted {@link TextBl
ock} | 427 if (!sbLastWasWhitespace) { |
428 * s. NOTE: Only call this after parsing. | 428 tokenBuffer.append(' '); |
429 * | 429 textBuffer.append(' '); |
430 * @return The {@link TextDocument} | 430 sbLastWasWhitespace = true; |
431 */ | 431 } |
432 public TextDocument toTextDocument() { | 432 } |
433 // just to be sure | 433 |
434 flushBlock(); | 434 public void addLabelAction(final LabelAction la) |
435 | 435 throws IllegalStateException { |
436 return new TextDocument(getTitle(), getTextBlocks()); | 436 LinkedList<LabelAction> labelStack = labelStacks.getLast(); |
437 } | 437 if (labelStack == null) { |
438 | 438 labelStack = new LinkedList<LabelAction>(); |
439 public void addWhitespaceIfNecessary() { | 439 labelStacks.removeLast(); |
440 if (!sbLastWasWhitespace) { | 440 labelStacks.add(labelStack); |
441 tokenBuffer.append(' '); | 441 } |
442 textBuffer.append(' '); | 442 labelStack.add(la); |
443 sbLastWasWhitespace = true; | 443 } |
444 } | 444 |
445 } | 445 private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern |
446 | 446 .compile( |
447 public void addLabelAction(final LabelAction la) | |
448 throws IllegalStateException { | |
449 LinkedList<LabelAction> labelStack = labelStacks.getLast(); | |
450 if (labelStack == null) { | |
451 labelStack = new LinkedList<LabelAction>(); | |
452 labelStacks.removeLast(); | |
453 labelStacks.add(labelStack); | |
454 } | |
455 labelStack.add(la); | |
456 } | |
457 | |
458 private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern | |
459 .compile( | |
460 "[" + | 447 "[" + |
461 "\u0030-\u0039\u0041-\u005a\u0061-\u007a\u00aa\u
00b2-\u00b3\u00b5\u00b9-\u00ba\u00bc-\u00be\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02
36\u0250-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ee\u037a\u0386\u0388-\u038a\u038c\u
038e-\u03a1\u03a3-\u03ce\u03d0-\u03f5\u03f7-\u03fb\u0400-\u0481\u048a-\u04ce\u04
d0-\u04f5\u04f8-\u04f9\u0500-\u050f\u0531-\u0556\u0559\u0561-\u0587\u05d0-\u05ea
\u05f0-\u05f2\u0621-\u063a\u0640-\u064a\u0660-\u0669\u066e-\u066f\u0671-\u06d3\u
06d5\u06e5-\u06e6\u06ee-\u06fc\u06ff\u0710\u0712-\u072f\u074d-\u074f\u0780-\u07a
5\u07b1\u0904-\u0939\u093d\u0950\u0958-\u0961\u0966-\u096f\u0985-\u098c\u098f-\u
0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09dc-\u09dd\u09df-\u09e
1\u09e6-\u09f1\u09f4-\u09f9\u0a05-\u0a0a\u0a0f-\u0a10\u0a13-\u0a28\u0a2a-\u0a30\
u0a32-\u0a33\u0a35-\u0a36\u0a38-\u0a39\u0a59-\u0a5c\u0a5e\u0a66-\u0a6f\u0a72-\u0
a74\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2-\u0ab3\u0ab5-\u0ab
9\u0abd\u0ad0\u0ae0-\u0ae1\u0ae6-\u0aef\u0b05-\u0b0c\u0b0f-\u0b10\u0b13-\u0b28\u
0b2a-\u0b30\u0b32-\u0b33\u0b35-\u0b39\u0b3d\u0b5c-\u0b5d\u0b5f-\u0b61\u0b66-\u0b
6f\u0b71\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99-\u0b9a\u0b9c\u0b9e-\
u0b9f\u0ba3-\u0ba4\u0ba8-\u0baa\u0bae-\u0bb5\u0bb7-\u0bb9\u0be7-\u0bf2\u0c05-\u0
c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c33\u0c35-\u0c39\u0c60-\u0c61\u0c66-\u0c6
f\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbd\u0cde\u
0ce0-\u0ce1\u0ce6-\u0cef\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d28\u0d2a-\u0d39\u0d
60-\u0d61\u0d66-\u0d6f\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6
\u0e01-\u0e30\u0e32-\u0e33\u0e40-\u0e46\u0e50-\u0e59\u0e81-\u0e82\u0e84\u0e87-\u
0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab
\u0ead-\u0eb0\u0eb2-\u0eb3\u0ebd\u0ec0-\u0ec4\u0ec6\u0ed0-\u0ed9\u0edc-\u0edd\u0
f00\u0f20-\u0f33\u0f40-\u0f47\u0f49-\u0f6a\u0f88-\u0f8b\u1000-\u1021\u1023-\u102
7\u1029-\u102a\u1040-\u1049\u1050-\u1055\u10a0-\u10c5\u10d0-\u10f8\u1100-\u1159\
u115f-\u11a2\u11a8-\u11f9\u1200-\u1206\u1208-\u1246\u1248\u124a-\u124d\u1250-\u1
256\u1258\u125a-\u125d\u1260-\u1286\u1288\u128a-\u128d\u1290-\u12ae\u12b0\u12b2-
\u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12ce\u12d0-\u12d6\u12d8-\u12ee\u1
2f0-\u130e\u1310\u1312-\u1315\u1318-\u131e\u1320-\u1346\u1348-\u135a\u1369-\u137
c\u13a0-\u13f4\u1401-\u166c\u166f-\u1676\u1681-\u169a\u16a0-\u16ea\u16ee-\u16f0\
u1700-\u170c\u170e-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176c\u176e-\u1770\u1
780-\u17b3\u17d7\u17dc\u17e0-\u17e9\u17f0-\u17f9\u1810-\u1819\u1820-\u1877\u1880
-\u18a8\u1900-\u191c\u1946-\u196d\u1970-\u1974\u1d00-\u1d6b\u1e00-\u1e9b\u1ea0-\
u1ef9\u1f00-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f
5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\
u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2070-\u2071\u2
074-\u2079\u207f-\u2089\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\
u2128\u212a-\u212d\u212f-\u2131\u2133-\u2139\u213d-\u213f\u2145-\u2149\u2153-\u2
183\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3005-\u3007\u3021-\u3029\u3031-\u303
5\u3038-\u303c\u3041-\u3096\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312c\
u3131-\u318e\u3192-\u3195\u31a0-\u31b7\u31f0-\u31ff\u3220-\u3229\u3251-\u325f\u3
280-\u3289\u32b1-\u32bf\u3400-\u4db5\u4e00-\u9fa5\ua000-\ua48c\uac00-\ud7a3\uf90
0-\ufa2d\ufa30-\ufa6a\ufb00-\ufb06\ufb13-\ufb17\ufb1d\ufb1f-\ufb28\ufb2a-\ufb36\
ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\uf
d8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe70-\ufe74\ufe76-\ufefc\uff10-\uff19\uff21-\uff3
a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc" | 448 "\u0030-\u0039\u0041-\u005a\u0061-\u007a\u00aa\u
00b2-\u00b3\u00b5\u00b9-\u00ba\u00bc-\u00be\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02
36\u0250-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ee\u037a\u0386\u0388-\u038a\u038c\u
038e-\u03a1\u03a3-\u03ce\u03d0-\u03f5\u03f7-\u03fb\u0400-\u0481\u048a-\u04ce\u04
d0-\u04f5\u04f8-\u04f9\u0500-\u050f\u0531-\u0556\u0559\u0561-\u0587\u05d0-\u05ea
\u05f0-\u05f2\u0621-\u063a\u0640-\u064a\u0660-\u0669\u066e-\u066f\u0671-\u06d3\u
06d5\u06e5-\u06e6\u06ee-\u06fc\u06ff\u0710\u0712-\u072f\u074d-\u074f\u0780-\u07a
5\u07b1\u0904-\u0939\u093d\u0950\u0958-\u0961\u0966-\u096f\u0985-\u098c\u098f-\u
0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09dc-\u09dd\u09df-\u09e
1\u09e6-\u09f1\u09f4-\u09f9\u0a05-\u0a0a\u0a0f-\u0a10\u0a13-\u0a28\u0a2a-\u0a30\
u0a32-\u0a33\u0a35-\u0a36\u0a38-\u0a39\u0a59-\u0a5c\u0a5e\u0a66-\u0a6f\u0a72-\u0
a74\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2-\u0ab3\u0ab5-\u0ab
9\u0abd\u0ad0\u0ae0-\u0ae1\u0ae6-\u0aef\u0b05-\u0b0c\u0b0f-\u0b10\u0b13-\u0b28\u
0b2a-\u0b30\u0b32-\u0b33\u0b35-\u0b39\u0b3d\u0b5c-\u0b5d\u0b5f-\u0b61\u0b66-\u0b
6f\u0b71\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99-\u0b9a\u0b9c\u0b9e-\
u0b9f\u0ba3-\u0ba4\u0ba8-\u0baa\u0bae-\u0bb5\u0bb7-\u0bb9\u0be7-\u0bf2\u0c05-\u0
c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c33\u0c35-\u0c39\u0c60-\u0c61\u0c66-\u0c6
f\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbd\u0cde\u
0ce0-\u0ce1\u0ce6-\u0cef\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d28\u0d2a-\u0d39\u0d
60-\u0d61\u0d66-\u0d6f\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6
\u0e01-\u0e30\u0e32-\u0e33\u0e40-\u0e46\u0e50-\u0e59\u0e81-\u0e82\u0e84\u0e87-\u
0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab
\u0ead-\u0eb0\u0eb2-\u0eb3\u0ebd\u0ec0-\u0ec4\u0ec6\u0ed0-\u0ed9\u0edc-\u0edd\u0
f00\u0f20-\u0f33\u0f40-\u0f47\u0f49-\u0f6a\u0f88-\u0f8b\u1000-\u1021\u1023-\u102
7\u1029-\u102a\u1040-\u1049\u1050-\u1055\u10a0-\u10c5\u10d0-\u10f8\u1100-\u1159\
u115f-\u11a2\u11a8-\u11f9\u1200-\u1206\u1208-\u1246\u1248\u124a-\u124d\u1250-\u1
256\u1258\u125a-\u125d\u1260-\u1286\u1288\u128a-\u128d\u1290-\u12ae\u12b0\u12b2-
\u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12ce\u12d0-\u12d6\u12d8-\u12ee\u1
2f0-\u130e\u1310\u1312-\u1315\u1318-\u131e\u1320-\u1346\u1348-\u135a\u1369-\u137
c\u13a0-\u13f4\u1401-\u166c\u166f-\u1676\u1681-\u169a\u16a0-\u16ea\u16ee-\u16f0\
u1700-\u170c\u170e-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176c\u176e-\u1770\u1
780-\u17b3\u17d7\u17dc\u17e0-\u17e9\u17f0-\u17f9\u1810-\u1819\u1820-\u1877\u1880
-\u18a8\u1900-\u191c\u1946-\u196d\u1970-\u1974\u1d00-\u1d6b\u1e00-\u1e9b\u1ea0-\
u1ef9\u1f00-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f
5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\
u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2070-\u2071\u2
074-\u2079\u207f-\u2089\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\
u2128\u212a-\u212d\u212f-\u2131\u2133-\u2139\u213d-\u213f\u2145-\u2149\u2153-\u2
183\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3005-\u3007\u3021-\u3029\u3031-\u303
5\u3038-\u303c\u3041-\u3096\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312c\
u3131-\u318e\u3192-\u3195\u31a0-\u31b7\u31f0-\u31ff\u3220-\u3229\u3251-\u325f\u3
280-\u3289\u32b1-\u32bf\u3400-\u4db5\u4e00-\u9fa5\ua000-\ua48c\uac00-\ud7a3\uf90
0-\ufa2d\ufa30-\ufa6a\ufb00-\ufb06\ufb13-\ufb17\ufb1d\ufb1f-\ufb28\ufb2a-\ufb36\
ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\uf
d8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe70-\ufe74\ufe76-\ufefc\uff10-\uff19\uff21-\uff3
a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc" |
462 + "]"); | 449 + "]"); |
463 | 450 |
464 } | 451 } |
OLD | NEW |