Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(73)

Side by Side Diff: boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java

Issue 291823005: Restore Title identification. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: cleanup Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/com/dom_distiller/client/ContentExtractor.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 /** 5 /**
6 * boilerpipe 6 * boilerpipe
7 * 7 *
8 * Copyright (c) 2009 Christian Kohlschütter 8 * Copyright (c) 2009 Christian Kohlschütter
9 * 9 *
10 * The author licenses this file to You under the Apache License, Version 2.0 10 * The author licenses this file to You under the Apache License, Version 2.0
11 * (the "License"); you may not use this file except in compliance with 11 * (the "License"); you may not use this file except in compliance with
12 * the License. You may obtain a copy of the License at 12 * the License. You may obtain a copy of the License at
13 * 13 *
14 * http://www.apache.org/licenses/LICENSE-2.0 14 * http://www.apache.org/licenses/LICENSE-2.0
15 * 15 *
16 * Unless required by applicable law or agreed to in writing, software 16 * Unless required by applicable law or agreed to in writing, software
17 * distributed under the License is distributed on an "AS IS" BASIS, 17 * distributed under the License is distributed on an "AS IS" BASIS,
18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 * See the License for the specific language governing permissions and 19 * See the License for the specific language governing permissions and
20 * limitations under the License. 20 * limitations under the License.
21 */ 21 */
22 package de.l3s.boilerpipe.sax; 22 package de.l3s.boilerpipe.sax;
23 23
24 import com.dom_distiller.client.StringUtil;
25
26 import de.l3s.boilerpipe.document.TextBlock;
27 import de.l3s.boilerpipe.document.TextDocument;
28 import de.l3s.boilerpipe.labels.LabelAction;
29 import de.l3s.boilerpipe.util.UnicodeTokenizer;
30
31 import org.xml.sax.Attributes;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.Locator;
34 import org.xml.sax.SAXException;
35
24 import java.util.ArrayList; 36 import java.util.ArrayList;
25 import java.util.HashSet; 37 import java.util.HashSet;
26 import java.util.LinkedList; 38 import java.util.LinkedList;
27 import java.util.List; 39 import java.util.List;
28 import java.util.Map; 40 import java.util.Map;
29 import java.util.regex.Pattern; 41 import java.util.regex.Pattern;
30 42
31 import org.xml.sax.Attributes;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.Locator;
34 import org.xml.sax.SAXException;
35
36 import de.l3s.boilerpipe.document.TextBlock;
37 import de.l3s.boilerpipe.document.TextDocument;
38 import de.l3s.boilerpipe.labels.LabelAction;
39 import de.l3s.boilerpipe.util.UnicodeTokenizer;
40
41 import com.dom_distiller.client.StringUtil;
42
43 /** 43 /**
44 * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can 44 * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can
45 * be used by different parser implementations, e.g. NekoHTML and TagSoup. 45 * be used by different parser implementations, e.g. NekoHTML and TagSoup.
46 * 46 *
47 * @author Christian Kohlschütter 47 * @author Christian Kohlschütter
48 */ 48 */
49 public class BoilerpipeHTMLContentHandler implements ContentHandler { 49 public class BoilerpipeHTMLContentHandler implements ContentHandler {
50 50
51 private final Map<String, TagAction> tagActions; 51 private final Map<String, TagAction> tagActions;
52 private String title = null; 52 private final String title = null;
cjhopman 2014/05/21 21:16:02 This looks unused now.
53 53
54 static final String ANCHOR_TEXT_START = "$\ue00a<"; 54 static final String ANCHOR_TEXT_START = "$\ue00a<";
55 static final String ANCHOR_TEXT_END = ">\ue00a$"; 55 static final String ANCHOR_TEXT_END = ">\ue00a$";
56 56
57 StringBuilder tokenBuffer = new StringBuilder(); 57 StringBuilder tokenBuffer = new StringBuilder();
58 StringBuilder textBuffer = new StringBuilder(); 58 StringBuilder textBuffer = new StringBuilder();
59 59
60 int inBody = 0; 60 int inBody = 0;
61 int inAnchor = 0; 61 int inAnchor = 0;
62 int inIgnorableElement = 0; 62 int inIgnorableElement = 0;
63 63
64 int tagLevel = 0; 64 int tagLevel = 0;
65 int blockTagLevel = -1; 65 int blockTagLevel = -1;
66 66
67 boolean sbLastWasWhitespace = false; 67 boolean sbLastWasWhitespace = false;
68 private int textElementIdx = 0; 68 private int textElementIdx = 0;
69 69
70 private final List<TextBlock> textBlocks = new ArrayList<TextBlock>(); 70 private final List<TextBlock> textBlocks = new ArrayList<TextBlock>();
71 71
72 private String lastStartTag = null; 72 private String lastStartTag = null;
73 @SuppressWarnings("unused") 73 @SuppressWarnings("unused")
74 private String lastEndTag = null; 74 private String lastEndTag = null;
75 @SuppressWarnings("unused") 75 @SuppressWarnings("unused")
76 private Event lastEvent = null; 76 private Event lastEvent = null;
77 77
78 private int offsetBlocks = 0; 78 private int offsetBlocks = 0;
79 private HashSet<Integer> currentContainedTextElements = new HashSet<Integer>(); 79 private HashSet<Integer> currentContainedTextElements = new HashSet<Integer>();
80 80
81 private boolean flush = false; 81 private boolean flush = false;
82 boolean inAnchorText = false; 82 boolean inAnchorText = false;
83 83
84 LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedList<LabelAction>>(); 84 LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedList<LabelAction>>();
85 LinkedList<Integer> fontSizeStack = new LinkedList<Integer>(); 85 LinkedList<Integer> fontSizeStack = new LinkedList<Integer>();
86 86
87 /** 87 /**
88 * Recycles this instance. 88 * Recycles this instance.
89 */ 89 */
90 public void recycle() { 90 public void recycle() {
91 tokenBuffer.setLength(0); 91 tokenBuffer.setLength(0);
92 textBuffer.setLength(0); 92 textBuffer.setLength(0);
93 93
94 inBody = 0; 94 inBody = 0;
95 inAnchor = 0; 95 inAnchor = 0;
96 inIgnorableElement = 0; 96 inIgnorableElement = 0;
97 sbLastWasWhitespace = false; 97 sbLastWasWhitespace = false;
98 textElementIdx = 0; 98 textElementIdx = 0;
99 99
100 textBlocks.clear(); 100 textBlocks.clear();
101 101
102 lastStartTag = null; 102 lastStartTag = null;
103 lastEndTag = null; 103 lastEndTag = null;
104 lastEvent = null; 104 lastEvent = null;
105 105
106 offsetBlocks = 0; 106 offsetBlocks = 0;
107 currentContainedTextElements.clear(); 107 currentContainedTextElements.clear();
108 108
109 flush = false; 109 flush = false;
110 inAnchorText = false; 110 inAnchorText = false;
111 } 111 }
112 112
113 /** 113 /**
114 * Constructs a {@link BoilerpipeHTMLContentHandler} using the 114 * Constructs a {@link BoilerpipeHTMLContentHandler} using the
115 * {@link DefaultTagActionMap}. 115 * {@link DefaultTagActionMap}.
116 */ 116 */
117 public BoilerpipeHTMLContentHandler() { 117 public BoilerpipeHTMLContentHandler() {
118 this(DefaultTagActionMap.INSTANCE); 118 this(DefaultTagActionMap.INSTANCE);
119 } 119 }
120 120
121 /** 121 /**
122 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given 122 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given
123 * {@link TagActionMap}. 123 * {@link TagActionMap}.
124 * 124 *
125 * @param tagActions 125 * @param tagActions
126 * The {@link TagActionMap} to use, e.g. 126 * The {@link TagActionMap} to use, e.g.
127 * {@link DefaultTagActionMap}. 127 * {@link DefaultTagActionMap}.
128 */ 128 */
129 public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { 129 public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) {
130 this.tagActions = tagActions; 130 this.tagActions = tagActions;
131 } 131 }
132 132
133 // @Override 133 @Override
134 public void endDocument() throws SAXException { 134 public void endDocument() throws SAXException {
135 flushBlock(); 135 flushBlock();
136 } 136 }
137 137
138 // @Override 138 @Override
139 public void endPrefixMapping(String prefix) throws SAXException { 139 public void endPrefixMapping(String prefix) throws SAXException {
140 } 140 }
141 141
142 // @Override 142 @Override
143 public void ignorableWhitespace(char[] ch, int start, int length) 143 public void ignorableWhitespace(char[] ch, int start, int length)
144 throws SAXException { 144 throws SAXException {
145 if (!sbLastWasWhitespace) { 145 if (!sbLastWasWhitespace) {
146 textBuffer.append(' '); 146 textBuffer.append(' ');
147 tokenBuffer.append(' '); 147 tokenBuffer.append(' ');
148 } 148 }
149 sbLastWasWhitespace = true; 149 sbLastWasWhitespace = true;
150 } 150 }
151 151
152 // @Override 152 @Override
153 public void processingInstruction(String target, String data) 153 public void processingInstruction(String target, String data)
154 throws SAXException { 154 throws SAXException {
155 } 155 }
156 156
157 // @Override 157 @Override
158 public void setDocumentLocator(Locator locator) { 158 public void setDocumentLocator(Locator locator) {
159 } 159 }
160 160
161 // @Override 161 @Override
162 public void skippedEntity(String name) throws SAXException { 162 public void skippedEntity(String name) throws SAXException {
163 } 163 }
164 164
165 // @Override 165 @Override
166 public void startDocument() throws SAXException { 166 public void startDocument() throws SAXException {
167 } 167 }
168 168
169 // @Override 169 @Override
170 public void startPrefixMapping(String prefix, String uri) 170 public void startPrefixMapping(String prefix, String uri)
171 throws SAXException { 171 throws SAXException {
172 } 172 }
173 173
174 // @Override 174 @Override
175 public void startElement(String uri, String localName, String qName, 175 public void startElement(String uri, String localName, String qName,
176 Attributes atts) throws SAXException { 176 Attributes atts) throws SAXException {
177 labelStacks.add(null); 177 labelStacks.add(null);
178 178
179 TagAction ta = tagActions.get(localName); 179 TagAction ta = tagActions.get(localName);
180 if (ta != null) { 180 if (ta != null) {
181 if(ta.changesTagLevel()) { 181 if(ta.changesTagLevel()) {
182 tagLevel++; 182 tagLevel++;
183 } 183 }
184 flush = ta.start(this, localName, qName, atts) | flush; 184 flush = ta.start(this, localName, qName, atts) | flush;
185 } else { 185 } else {
186 tagLevel++; 186 tagLevel++;
187 flush = true; 187 flush = true;
188 } 188 }
189 189
190 lastEvent = Event.START_TAG; 190 lastEvent = Event.START_TAG;
191 lastStartTag = localName; 191 lastStartTag = localName;
192 } 192 }
193 193
194 // @Override 194 @Override
195 public void endElement(String uri, String localName, String qName) 195 public void endElement(String uri, String localName, String qName)
196 throws SAXException { 196 throws SAXException {
197 TagAction ta = tagActions.get(localName); 197 TagAction ta = tagActions.get(localName);
198 if (ta != null) { 198 if (ta != null) {
199 flush = ta.end(this, localName, qName) | flush; 199 flush = ta.end(this, localName, qName) | flush;
200 } else { 200 } else {
201 flush = true; 201 flush = true;
202 } 202 }
203 203
204 if(ta == null || ta.changesTagLevel()) { 204 if(ta == null || ta.changesTagLevel()) {
205 tagLevel--; 205 tagLevel--;
206 } 206 }
207 207
208 if (flush) { 208 if (flush) {
209 flushBlock(); 209 flushBlock();
210 } 210 }
211 211
212 lastEvent = Event.END_TAG; 212 lastEvent = Event.END_TAG;
213 lastEndTag = localName; 213 lastEndTag = localName;
214 214
215 labelStacks.removeLast(); 215 labelStacks.removeLast();
216 } 216 }
217 217
218 // @Override 218 @Override
219 public void characters(char[] ch, int start, int length) 219 public void characters(char[] ch, int start, int length)
220 throws SAXException { 220 throws SAXException {
221 textElementIdx++; 221 textElementIdx++;
222 222
223 223
224 if (flush) { 224 if (flush) {
225 flushBlock(); 225 flushBlock();
226 flush = false; 226 flush = false;
227 } 227 }
228 228
229 if (inIgnorableElement != 0) { 229 if (inIgnorableElement != 0) {
230 return; 230 return;
231 } 231 }
232 232
233 char c; 233 char c;
234 boolean startWhitespace = false; 234 boolean startWhitespace = false;
235 boolean endWhitespace = false; 235 boolean endWhitespace = false;
236 if (length == 0) { 236 if (length == 0) {
237 return; 237 return;
238 } 238 }
239 239
240 final int end = start + length; 240 final int end = start + length;
241 for (int i = start; i < end; i++) { 241 for (int i = start; i < end; i++) {
242 if (StringUtil.isWhitespace(ch[i])) { 242 if (StringUtil.isWhitespace(ch[i])) {
243 ch[i] = ' '; 243 ch[i] = ' ';
244 } 244 }
245 } 245 }
246 while (start < end) { 246 while (start < end) {
247 c = ch[start]; 247 c = ch[start];
248 if (c == ' ') { 248 if (c == ' ') {
249 startWhitespace = true; 249 startWhitespace = true;
250 start++; 250 start++;
251 length--; 251 length--;
252 } else { 252 } else {
253 break; 253 break;
254 } 254 }
255 } 255 }
256 while (length > 0) { 256 while (length > 0) {
257 c = ch[start + length - 1]; 257 c = ch[start + length - 1];
258 if (c == ' ') { 258 if (c == ' ') {
259 endWhitespace = true; 259 endWhitespace = true;
260 length--; 260 length--;
261 } else { 261 } else {
262 break; 262 break;
263 } 263 }
264 } 264 }
265 if (length == 0) { 265 if (length == 0) {
266 if (startWhitespace || endWhitespace) { 266 if (startWhitespace || endWhitespace) {
267 if (!sbLastWasWhitespace) { 267 if (!sbLastWasWhitespace) {
268 textBuffer.append(' '); 268 textBuffer.append(' ');
269 tokenBuffer.append(' '); 269 tokenBuffer.append(' ');
270 } 270 }
271 sbLastWasWhitespace = true; 271 sbLastWasWhitespace = true;
272 } else { 272 } else {
273 sbLastWasWhitespace = false; 273 sbLastWasWhitespace = false;
274 } 274 }
275 lastEvent = Event.WHITESPACE; 275 lastEvent = Event.WHITESPACE;
276 return; 276 return;
277 } 277 }
278 if (startWhitespace) { 278 if (startWhitespace) {
279 if (!sbLastWasWhitespace) { 279 if (!sbLastWasWhitespace) {
280 textBuffer.append(' '); 280 textBuffer.append(' ');
281 tokenBuffer.append(' '); 281 tokenBuffer.append(' ');
282 } 282 }
283 } 283 }
284 284
285 if (blockTagLevel == -1) { 285 if (blockTagLevel == -1) {
286 blockTagLevel = tagLevel; 286 blockTagLevel = tagLevel;
287 } 287 }
288 288
289 textBuffer.append(ch, start, length); 289 textBuffer.append(ch, start, length);
290 tokenBuffer.append(ch, start, length); 290 tokenBuffer.append(ch, start, length);
291 if (endWhitespace) { 291 if (endWhitespace) {
292 textBuffer.append(' '); 292 textBuffer.append(' ');
293 tokenBuffer.append(' '); 293 tokenBuffer.append(' ');
294 } 294 }
295 295
296 sbLastWasWhitespace = endWhitespace; 296 sbLastWasWhitespace = endWhitespace;
297 lastEvent = Event.CHARACTERS; 297 lastEvent = Event.CHARACTERS;
298 298
299 currentContainedTextElements.add(textElementIdx); 299 currentContainedTextElements.add(textElementIdx);
300 } 300 }
301 301
302 List<TextBlock> getTextBlocks() { 302 List<TextBlock> getTextBlocks() {
303 return textBlocks; 303 return textBlocks;
304 } 304 }
305 305
306 public void flushBlock() { 306 public void flushBlock() {
307 if (inBody == 0) { 307 if (inBody == 0) {
308 if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) { 308 textBuffer.setLength(0);
309 setTitle(tokenBuffer.toString().trim()); 309 tokenBuffer.setLength(0);
310 } 310 return;
311 textBuffer.setLength(0); 311 }
312 tokenBuffer.setLength(0); 312
313 return; 313 final int length = tokenBuffer.length();
314 } 314 switch (length) {
315 315 case 0:
316 final int length = tokenBuffer.length(); 316 return;
317 switch (length) { 317 case 1:
318 case 0: 318 if (sbLastWasWhitespace) {
319 return; 319 textBuffer.setLength(0);
320 case 1: 320 tokenBuffer.setLength(0);
321 if (sbLastWasWhitespace) { 321 return;
322 textBuffer.setLength(0); 322 }
323 tokenBuffer.setLength(0); 323 }
324 return; 324 final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer);
325 } 325
326 } 326 int numWords = 0;
327 final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer); 327 int numLinkedWords = 0;
328 328 int numWrappedLines = 0;
329 int numWords = 0; 329 int currentLineLength = -1; // don't count the first space
330 int numLinkedWords = 0; 330 final int maxLineLength = 80;
331 int numWrappedLines = 0; 331 int numTokens = 0;
332 int currentLineLength = -1; // don't count the first space 332 int numWordsCurrentLine = 0;
333 final int maxLineLength = 80; 333
334 int numTokens = 0; 334 for (String token : tokens) {
335 int numWordsCurrentLine = 0; 335 if (ANCHOR_TEXT_START.equals(token)) {
336 336 inAnchorText = true;
337 for (String token : tokens) { 337 } else if (ANCHOR_TEXT_END.equals(token)) {
338 if (ANCHOR_TEXT_START.equals(token)) { 338 inAnchorText = false;
339 inAnchorText = true; 339 } else if (isWord(token)) {
340 } else if (ANCHOR_TEXT_END.equals(token)) { 340 numTokens++;
341 inAnchorText = false; 341 numWords++;
342 } else if (isWord(token)) { 342 numWordsCurrentLine++;
343 numTokens++; 343 if (inAnchorText) {
344 numWords++; 344 numLinkedWords++;
345 numWordsCurrentLine++; 345 }
346 if (inAnchorText) { 346 final int tokenLength = token.length();
347 numLinkedWords++; 347 currentLineLength += tokenLength + 1;
348 } 348 if (currentLineLength > maxLineLength) {
349 final int tokenLength = token.length(); 349 numWrappedLines++;
350 currentLineLength += tokenLength + 1; 350 currentLineLength = tokenLength;
351 if (currentLineLength > maxLineLength) { 351 numWordsCurrentLine = 1;
352 numWrappedLines++; 352 }
353 currentLineLength = tokenLength; 353 } else {
354 numWordsCurrentLine = 1; 354 numTokens++;
355 } 355 }
356 } else { 356 }
357 numTokens++; 357 if (numTokens == 0) {
358 } 358 return;
359 } 359 }
360 if (numTokens == 0) { 360 int numWordsInWrappedLines;
361 return; 361 if (numWrappedLines == 0) {
362 } 362 numWordsInWrappedLines = numWords;
363 int numWordsInWrappedLines; 363 numWrappedLines = 1;
364 if (numWrappedLines == 0) { 364 } else {
365 numWordsInWrappedLines = numWords; 365 numWordsInWrappedLines = numWords - numWordsCurrentLine;
366 numWrappedLines = 1; 366 }
367 } else { 367
368 numWordsInWrappedLines = numWords - numWordsCurrentLine; 368 TextBlock tb = new TextBlock(StringUtil.javaTrim(textBuffer.toString()),
369 } 369 currentContainedTextElements, numWords, numLinkedWords,
370 370 numWordsInWrappedLines, numWrappedLines, offsetBlocks);
371 TextBlock tb = new TextBlock(StringUtil.javaTrim(textBuffer.toString()), 371 currentContainedTextElements = new HashSet<Integer>();
372 currentContainedTextElements, numWords, numLinkedWords, 372
373 numWordsInWrappedLines, numWrappedLines, offsetBlocks); 373 offsetBlocks++;
374 currentContainedTextElements = new HashSet<Integer>(); 374
375 375 textBuffer.setLength(0);
376 offsetBlocks++; 376 tokenBuffer.setLength(0);
377 377
378 textBuffer.setLength(0); 378 tb.setTagLevel(blockTagLevel);
379 tokenBuffer.setLength(0); 379 addTextBlock(tb);
380 380 blockTagLevel = -1;
381 tb.setTagLevel(blockTagLevel); 381 }
382 addTextBlock(tb); 382
383 blockTagLevel = -1; 383 protected void addTextBlock(final TextBlock tb) {
384 } 384
385 385 for (Integer l : fontSizeStack) {
386 protected void addTextBlock(final TextBlock tb) { 386 if (l != null) {
387 387 tb.addLabel("font-" + l);
388 for (Integer l : fontSizeStack) { 388 break;
389 if (l != null) { 389 }
390 tb.addLabel("font-" + l); 390 }
391 break; 391 for (LinkedList<LabelAction> labelStack : labelStacks) {
392 } 392 if (labelStack != null) {
393 } 393 for (LabelAction labels : labelStack) {
394 for (LinkedList<LabelAction> labelStack : labelStacks) { 394 if (labels != null) {
395 if (labelStack != null) { 395 labels.addTo(tb);
396 for (LabelAction labels : labelStack) { 396 }
397 if (labels != null) { 397 }
398 labels.addTo(tb); 398 }
399 } 399 }
400 } 400
401 } 401 textBlocks.add(tb);
402 } 402 }
403 403
404 textBlocks.add(tb); 404 public static boolean isWord(final String token) {
405 } 405 return PAT_VALID_WORD_CHARACTER.matcher(token).find();
406 406 }
407 public static boolean isWord(final String token) { 407
408 return PAT_VALID_WORD_CHARACTER.matcher(token).find(); 408 static private enum Event {
409 } 409 START_TAG, END_TAG, CHARACTERS, WHITESPACE
410 410 }
411 static private enum Event { 411
412 START_TAG, END_TAG, CHARACTERS, WHITESPACE 412
413 } 413 /**
414 414 * Returns a {@link TextDocument} containing the extracted {@link TextBlock}
415 public String getTitle() { 415 * s. NOTE: Only call this after parsing.
416 return title; 416 *
417 } 417 * @return The {@link TextDocument}
418 418 */
419 public void setTitle(String s) { 419 public TextDocument toTextDocument() {
420 if (s == null || s.length() == 0) { 420 // just to be sure
421 return; 421 flushBlock();
422 } 422 // TODO(yfriedman): When BoilerpipeHTMLContentHandler is finished being moved to
423 title = s; 423 // DomToSaxVisitor, we should be able to set Title directly.
424 } 424 return new TextDocument(null, getTextBlocks());
425 425 }
426 /** 426
427 * Returns a {@link TextDocument} containing the extracted {@link TextBlock} 427 public void addWhitespaceIfNecessary() {
428 * s. NOTE: Only call this after parsing. 428 if (!sbLastWasWhitespace) {
429 * 429 tokenBuffer.append(' ');
430 * @return The {@link TextDocument} 430 textBuffer.append(' ');
431 */ 431 sbLastWasWhitespace = true;
432 public TextDocument toTextDocument() { 432 }
433 // just to be sure 433 }
434 flushBlock(); 434
435 435 public void addLabelAction(final LabelAction la)
436 return new TextDocument(getTitle(), getTextBlocks()); 436 throws IllegalStateException {
437 } 437 LinkedList<LabelAction> labelStack = labelStacks.getLast();
438 438 if (labelStack == null) {
439 public void addWhitespaceIfNecessary() { 439 labelStack = new LinkedList<LabelAction>();
440 if (!sbLastWasWhitespace) { 440 labelStacks.removeLast();
441 tokenBuffer.append(' '); 441 labelStacks.add(labelStack);
442 textBuffer.append(' '); 442 }
443 sbLastWasWhitespace = true; 443 labelStack.add(la);
444 } 444 }
445 } 445
446 446 private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern
447 public void addLabelAction(final LabelAction la) 447 .compile(
448 throws IllegalStateException {
449 LinkedList<LabelAction> labelStack = labelStacks.getLast();
450 if (labelStack == null) {
451 labelStack = new LinkedList<LabelAction>();
452 labelStacks.removeLast();
453 labelStacks.add(labelStack);
454 }
455 labelStack.add(la);
456 }
457
458 private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern
459 .compile(
460 "[" + 448 "[" +
461 "\u0030-\u0039\u0041-\u005a\u0061-\u007a\u00aa\u 00b2-\u00b3\u00b5\u00b9-\u00ba\u00bc-\u00be\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02 36\u0250-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ee\u037a\u0386\u0388-\u038a\u038c\u 038e-\u03a1\u03a3-\u03ce\u03d0-\u03f5\u03f7-\u03fb\u0400-\u0481\u048a-\u04ce\u04 d0-\u04f5\u04f8-\u04f9\u0500-\u050f\u0531-\u0556\u0559\u0561-\u0587\u05d0-\u05ea \u05f0-\u05f2\u0621-\u063a\u0640-\u064a\u0660-\u0669\u066e-\u066f\u0671-\u06d3\u 06d5\u06e5-\u06e6\u06ee-\u06fc\u06ff\u0710\u0712-\u072f\u074d-\u074f\u0780-\u07a 5\u07b1\u0904-\u0939\u093d\u0950\u0958-\u0961\u0966-\u096f\u0985-\u098c\u098f-\u 0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09dc-\u09dd\u09df-\u09e 1\u09e6-\u09f1\u09f4-\u09f9\u0a05-\u0a0a\u0a0f-\u0a10\u0a13-\u0a28\u0a2a-\u0a30\ u0a32-\u0a33\u0a35-\u0a36\u0a38-\u0a39\u0a59-\u0a5c\u0a5e\u0a66-\u0a6f\u0a72-\u0 a74\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2-\u0ab3\u0ab5-\u0ab 9\u0abd\u0ad0\u0ae0-\u0ae1\u0ae6-\u0aef\u0b05-\u0b0c\u0b0f-\u0b10\u0b13-\u0b28\u 0b2a-\u0b30\u0b32-\u0b33\u0b35-\u0b39\u0b3d\u0b5c-\u0b5d\u0b5f-\u0b61\u0b66-\u0b 6f\u0b71\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99-\u0b9a\u0b9c\u0b9e-\ u0b9f\u0ba3-\u0ba4\u0ba8-\u0baa\u0bae-\u0bb5\u0bb7-\u0bb9\u0be7-\u0bf2\u0c05-\u0 c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c33\u0c35-\u0c39\u0c60-\u0c61\u0c66-\u0c6 f\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbd\u0cde\u 0ce0-\u0ce1\u0ce6-\u0cef\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d28\u0d2a-\u0d39\u0d 60-\u0d61\u0d66-\u0d6f\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6 \u0e01-\u0e30\u0e32-\u0e33\u0e40-\u0e46\u0e50-\u0e59\u0e81-\u0e82\u0e84\u0e87-\u 0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab \u0ead-\u0eb0\u0eb2-\u0eb3\u0ebd\u0ec0-\u0ec4\u0ec6\u0ed0-\u0ed9\u0edc-\u0edd\u0 f00\u0f20-\u0f33\u0f40-\u0f47\u0f49-\u0f6a\u0f88-\u0f8b\u1000-\u1021\u1023-\u102 7\u1029-\u102a\u1040-\u1049\u1050-\u1055\u10a0-\u10c5\u10d0-\u10f8\u1100-\u1159\ 
u115f-\u11a2\u11a8-\u11f9\u1200-\u1206\u1208-\u1246\u1248\u124a-\u124d\u1250-\u1 256\u1258\u125a-\u125d\u1260-\u1286\u1288\u128a-\u128d\u1290-\u12ae\u12b0\u12b2- \u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12ce\u12d0-\u12d6\u12d8-\u12ee\u1 2f0-\u130e\u1310\u1312-\u1315\u1318-\u131e\u1320-\u1346\u1348-\u135a\u1369-\u137 c\u13a0-\u13f4\u1401-\u166c\u166f-\u1676\u1681-\u169a\u16a0-\u16ea\u16ee-\u16f0\ u1700-\u170c\u170e-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176c\u176e-\u1770\u1 780-\u17b3\u17d7\u17dc\u17e0-\u17e9\u17f0-\u17f9\u1810-\u1819\u1820-\u1877\u1880 -\u18a8\u1900-\u191c\u1946-\u196d\u1970-\u1974\u1d00-\u1d6b\u1e00-\u1e9b\u1ea0-\ u1ef9\u1f00-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f 5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\ u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2070-\u2071\u2 074-\u2079\u207f-\u2089\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\ u2128\u212a-\u212d\u212f-\u2131\u2133-\u2139\u213d-\u213f\u2145-\u2149\u2153-\u2 183\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3005-\u3007\u3021-\u3029\u3031-\u303 5\u3038-\u303c\u3041-\u3096\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312c\ u3131-\u318e\u3192-\u3195\u31a0-\u31b7\u31f0-\u31ff\u3220-\u3229\u3251-\u325f\u3 280-\u3289\u32b1-\u32bf\u3400-\u4db5\u4e00-\u9fa5\ua000-\ua48c\uac00-\ud7a3\uf90 0-\ufa2d\ufa30-\ufa6a\ufb00-\ufb06\ufb13-\ufb17\ufb1d\ufb1f-\ufb28\ufb2a-\ufb36\ ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\uf d8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe70-\ufe74\ufe76-\ufefc\uff10-\uff19\uff21-\uff3 a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc" 449 "\u0030-\u0039\u0041-\u005a\u0061-\u007a\u00aa\u 00b2-\u00b3\u00b5\u00b9-\u00ba\u00bc-\u00be\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02 36\u0250-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ee\u037a\u0386\u0388-\u038a\u038c\u 038e-\u03a1\u03a3-\u03ce\u03d0-\u03f5\u03f7-\u03fb\u0400-\u0481\u048a-\u04ce\u04 
d0-\u04f5\u04f8-\u04f9\u0500-\u050f\u0531-\u0556\u0559\u0561-\u0587\u05d0-\u05ea \u05f0-\u05f2\u0621-\u063a\u0640-\u064a\u0660-\u0669\u066e-\u066f\u0671-\u06d3\u 06d5\u06e5-\u06e6\u06ee-\u06fc\u06ff\u0710\u0712-\u072f\u074d-\u074f\u0780-\u07a 5\u07b1\u0904-\u0939\u093d\u0950\u0958-\u0961\u0966-\u096f\u0985-\u098c\u098f-\u 0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09dc-\u09dd\u09df-\u09e 1\u09e6-\u09f1\u09f4-\u09f9\u0a05-\u0a0a\u0a0f-\u0a10\u0a13-\u0a28\u0a2a-\u0a30\ u0a32-\u0a33\u0a35-\u0a36\u0a38-\u0a39\u0a59-\u0a5c\u0a5e\u0a66-\u0a6f\u0a72-\u0 a74\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2-\u0ab3\u0ab5-\u0ab 9\u0abd\u0ad0\u0ae0-\u0ae1\u0ae6-\u0aef\u0b05-\u0b0c\u0b0f-\u0b10\u0b13-\u0b28\u 0b2a-\u0b30\u0b32-\u0b33\u0b35-\u0b39\u0b3d\u0b5c-\u0b5d\u0b5f-\u0b61\u0b66-\u0b 6f\u0b71\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99-\u0b9a\u0b9c\u0b9e-\ u0b9f\u0ba3-\u0ba4\u0ba8-\u0baa\u0bae-\u0bb5\u0bb7-\u0bb9\u0be7-\u0bf2\u0c05-\u0 c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c33\u0c35-\u0c39\u0c60-\u0c61\u0c66-\u0c6 f\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbd\u0cde\u 0ce0-\u0ce1\u0ce6-\u0cef\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d28\u0d2a-\u0d39\u0d 60-\u0d61\u0d66-\u0d6f\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6 \u0e01-\u0e30\u0e32-\u0e33\u0e40-\u0e46\u0e50-\u0e59\u0e81-\u0e82\u0e84\u0e87-\u 0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab \u0ead-\u0eb0\u0eb2-\u0eb3\u0ebd\u0ec0-\u0ec4\u0ec6\u0ed0-\u0ed9\u0edc-\u0edd\u0 f00\u0f20-\u0f33\u0f40-\u0f47\u0f49-\u0f6a\u0f88-\u0f8b\u1000-\u1021\u1023-\u102 7\u1029-\u102a\u1040-\u1049\u1050-\u1055\u10a0-\u10c5\u10d0-\u10f8\u1100-\u1159\ u115f-\u11a2\u11a8-\u11f9\u1200-\u1206\u1208-\u1246\u1248\u124a-\u124d\u1250-\u1 256\u1258\u125a-\u125d\u1260-\u1286\u1288\u128a-\u128d\u1290-\u12ae\u12b0\u12b2- \u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12ce\u12d0-\u12d6\u12d8-\u12ee\u1 
2f0-\u130e\u1310\u1312-\u1315\u1318-\u131e\u1320-\u1346\u1348-\u135a\u1369-\u137 c\u13a0-\u13f4\u1401-\u166c\u166f-\u1676\u1681-\u169a\u16a0-\u16ea\u16ee-\u16f0\ u1700-\u170c\u170e-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176c\u176e-\u1770\u1 780-\u17b3\u17d7\u17dc\u17e0-\u17e9\u17f0-\u17f9\u1810-\u1819\u1820-\u1877\u1880 -\u18a8\u1900-\u191c\u1946-\u196d\u1970-\u1974\u1d00-\u1d6b\u1e00-\u1e9b\u1ea0-\ u1ef9\u1f00-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f 5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\ u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2070-\u2071\u2 074-\u2079\u207f-\u2089\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\ u2128\u212a-\u212d\u212f-\u2131\u2133-\u2139\u213d-\u213f\u2145-\u2149\u2153-\u2 183\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3005-\u3007\u3021-\u3029\u3031-\u303 5\u3038-\u303c\u3041-\u3096\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312c\ u3131-\u318e\u3192-\u3195\u31a0-\u31b7\u31f0-\u31ff\u3220-\u3229\u3251-\u325f\u3 280-\u3289\u32b1-\u32bf\u3400-\u4db5\u4e00-\u9fa5\ua000-\ua48c\uac00-\ud7a3\uf90 0-\ufa2d\ufa30-\ufa6a\ufb00-\ufb06\ufb13-\ufb17\ufb1d\ufb1f-\ufb28\ufb2a-\ufb36\ ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\uf d8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe70-\ufe74\ufe76-\ufefc\uff10-\uff19\uff21-\uff3 a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc"
462 + "]"); 450 + "]");
463 451
464 } 452 }
OLDNEW
« no previous file with comments | « no previous file | src/com/dom_distiller/client/ContentExtractor.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698