Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(180)

Side by Side Diff: boilerpipe-core/src/main/de/l3s/boilerpipe/sax/BoilerpipeHTMLContentHandler.java

Issue 291823005: Restore Title identification. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: trim title Created 6 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/com/dom_distiller/client/ContentExtractor.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 /** 5 /**
6 * boilerpipe 6 * boilerpipe
7 * 7 *
8 * Copyright (c) 2009 Christian Kohlschütter 8 * Copyright (c) 2009 Christian Kohlschütter
9 * 9 *
10 * The author licenses this file to You under the Apache License, Version 2.0 10 * The author licenses this file to You under the Apache License, Version 2.0
11 * (the "License"); you may not use this file except in compliance with 11 * (the "License"); you may not use this file except in compliance with
12 * the License. You may obtain a copy of the License at 12 * the License. You may obtain a copy of the License at
13 * 13 *
14 * http://www.apache.org/licenses/LICENSE-2.0 14 * http://www.apache.org/licenses/LICENSE-2.0
15 * 15 *
16 * Unless required by applicable law or agreed to in writing, software 16 * Unless required by applicable law or agreed to in writing, software
17 * distributed under the License is distributed on an "AS IS" BASIS, 17 * distributed under the License is distributed on an "AS IS" BASIS,
18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 * See the License for the specific language governing permissions and 19 * See the License for the specific language governing permissions and
20 * limitations under the License. 20 * limitations under the License.
21 */ 21 */
22 package de.l3s.boilerpipe.sax; 22 package de.l3s.boilerpipe.sax;
23 23
24 import com.dom_distiller.client.StringUtil;
25
26 import de.l3s.boilerpipe.document.TextBlock;
27 import de.l3s.boilerpipe.document.TextDocument;
28 import de.l3s.boilerpipe.labels.LabelAction;
29 import de.l3s.boilerpipe.util.UnicodeTokenizer;
30
31 import org.xml.sax.Attributes;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.Locator;
34 import org.xml.sax.SAXException;
35
24 import java.util.ArrayList; 36 import java.util.ArrayList;
25 import java.util.HashSet; 37 import java.util.HashSet;
26 import java.util.LinkedList; 38 import java.util.LinkedList;
27 import java.util.List; 39 import java.util.List;
28 import java.util.Map; 40 import java.util.Map;
29 import java.util.regex.Pattern; 41 import java.util.regex.Pattern;
30 42
31 import org.xml.sax.Attributes;
32 import org.xml.sax.ContentHandler;
33 import org.xml.sax.Locator;
34 import org.xml.sax.SAXException;
35
36 import de.l3s.boilerpipe.document.TextBlock;
37 import de.l3s.boilerpipe.document.TextDocument;
38 import de.l3s.boilerpipe.labels.LabelAction;
39 import de.l3s.boilerpipe.util.UnicodeTokenizer;
40
41 import com.dom_distiller.client.StringUtil;
42
43 /** 43 /**
44 * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can 44 * A simple SAX {@link ContentHandler}, used by {@link BoilerpipeSAXInput}. Can
45 * be used by different parser implementations, e.g. NekoHTML and TagSoup. 45 * be used by different parser implementations, e.g. NekoHTML and TagSoup.
46 * 46 *
47 * @author Christian Kohlschütter 47 * @author Christian Kohlschütter
48 */ 48 */
49 public class BoilerpipeHTMLContentHandler implements ContentHandler { 49 public class BoilerpipeHTMLContentHandler implements ContentHandler {
50 50
51 private final Map<String, TagAction> tagActions; 51 private final Map<String, TagAction> tagActions;
52 private String title = null; 52
53 53 static final String ANCHOR_TEXT_START = "$\ue00a<";
54 static final String ANCHOR_TEXT_START = "$\ue00a<"; 54 static final String ANCHOR_TEXT_END = ">\ue00a$";
55 static final String ANCHOR_TEXT_END = ">\ue00a$"; 55
56 56 StringBuilder tokenBuffer = new StringBuilder();
57 StringBuilder tokenBuffer = new StringBuilder(); 57 StringBuilder textBuffer = new StringBuilder();
58 StringBuilder textBuffer = new StringBuilder(); 58
59 59 int inBody = 0;
60 int inBody = 0; 60 int inAnchor = 0;
61 int inAnchor = 0; 61 int inIgnorableElement = 0;
62 int inIgnorableElement = 0; 62
63 63 int tagLevel = 0;
64 int tagLevel = 0; 64 int blockTagLevel = -1;
65 int blockTagLevel = -1; 65
66 66 boolean sbLastWasWhitespace = false;
67 boolean sbLastWasWhitespace = false; 67 private int textElementIdx = 0;
68 private int textElementIdx = 0; 68
69 69 private final List<TextBlock> textBlocks = new ArrayList<TextBlock>();
70 private final List<TextBlock> textBlocks = new ArrayList<TextBlock>(); 70
71 71 private String lastStartTag = null;
72 private String lastStartTag = null; 72 @SuppressWarnings("unused")
73 @SuppressWarnings("unused") 73 private String lastEndTag = null;
74 private String lastEndTag = null; 74 @SuppressWarnings("unused")
75 @SuppressWarnings("unused") 75 private Event lastEvent = null;
76 private Event lastEvent = null; 76
77 77 private int offsetBlocks = 0;
78 private int offsetBlocks = 0; 78 private HashSet<Integer> currentContainedTextElements = new HashSet<Integer>();
79 private HashSet<Integer> currentContainedTextElements = new HashSet<Integer>(); 79
80 80 private boolean flush = false;
81 private boolean flush = false; 81 boolean inAnchorText = false;
82 boolean inAnchorText = false; 82
83 83 LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedList<LabelAction>>();
84 LinkedList<LinkedList<LabelAction>> labelStacks = new LinkedList<LinkedList<LabelAction>>(); 84 LinkedList<Integer> fontSizeStack = new LinkedList<Integer>();
85 LinkedList<Integer> fontSizeStack = new LinkedList<Integer>(); 85
86 86 /**
87 /** 87 * Recycles this instance.
88 * Recycles this instance. 88 */
89 */ 89 public void recycle() {
90 public void recycle() { 90 tokenBuffer.setLength(0);
91 tokenBuffer.setLength(0); 91 textBuffer.setLength(0);
92 textBuffer.setLength(0); 92
93 93 inBody = 0;
94 inBody = 0; 94 inAnchor = 0;
95 inAnchor = 0; 95 inIgnorableElement = 0;
96 inIgnorableElement = 0; 96 sbLastWasWhitespace = false;
97 sbLastWasWhitespace = false; 97 textElementIdx = 0;
98 textElementIdx = 0; 98
99 99 textBlocks.clear();
100 textBlocks.clear(); 100
101 101 lastStartTag = null;
102 lastStartTag = null; 102 lastEndTag = null;
103 lastEndTag = null; 103 lastEvent = null;
104 lastEvent = null; 104
105 105 offsetBlocks = 0;
106 offsetBlocks = 0; 106 currentContainedTextElements.clear();
107 currentContainedTextElements.clear(); 107
108 108 flush = false;
109 flush = false; 109 inAnchorText = false;
110 inAnchorText = false; 110 }
111 } 111
112 112 /**
113 /** 113 * Constructs a {@link BoilerpipeHTMLContentHandler} using the
114 * Constructs a {@link BoilerpipeHTMLContentHandler} using the 114 * {@link DefaultTagActionMap}.
115 * {@link DefaultTagActionMap}. 115 */
116 */ 116 public BoilerpipeHTMLContentHandler() {
117 public BoilerpipeHTMLContentHandler() { 117 this(DefaultTagActionMap.INSTANCE);
118 this(DefaultTagActionMap.INSTANCE); 118 }
119 } 119
120 120 /**
121 /** 121 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given
122 * Constructs a {@link BoilerpipeHTMLContentHandler} using the given 122 * {@link TagActionMap}.
123 * {@link TagActionMap}. 123 *
124 * 124 * @param tagActions
125 * @param tagActions 125 * The {@link TagActionMap} to use, e.g.
126 * The {@link TagActionMap} to use, e.g. 126 * {@link DefaultTagActionMap}.
127 * {@link DefaultTagActionMap}. 127 */
128 */ 128 public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) {
129 public BoilerpipeHTMLContentHandler(final TagActionMap tagActions) { 129 this.tagActions = tagActions;
130 this.tagActions = tagActions; 130 }
131 } 131
132 132 @Override
133 // @Override 133 public void endDocument() throws SAXException {
134 public void endDocument() throws SAXException { 134 flushBlock();
135 flushBlock(); 135 }
136 } 136
137 137 @Override
138 // @Override 138 public void endPrefixMapping(String prefix) throws SAXException {
139 public void endPrefixMapping(String prefix) throws SAXException { 139 }
140 } 140
141 141 @Override
142 // @Override 142 public void ignorableWhitespace(char[] ch, int start, int length)
143 public void ignorableWhitespace(char[] ch, int start, int length) 143 throws SAXException {
144 throws SAXException { 144 if (!sbLastWasWhitespace) {
145 if (!sbLastWasWhitespace) { 145 textBuffer.append(' ');
146 textBuffer.append(' '); 146 tokenBuffer.append(' ');
147 tokenBuffer.append(' '); 147 }
148 } 148 sbLastWasWhitespace = true;
149 sbLastWasWhitespace = true; 149 }
150 } 150
151 151 @Override
152 // @Override 152 public void processingInstruction(String target, String data)
153 public void processingInstruction(String target, String data) 153 throws SAXException {
154 throws SAXException { 154 }
155 } 155
156 156 @Override
157 // @Override 157 public void setDocumentLocator(Locator locator) {
158 public void setDocumentLocator(Locator locator) { 158 }
159 } 159
160 160 @Override
161 // @Override 161 public void skippedEntity(String name) throws SAXException {
162 public void skippedEntity(String name) throws SAXException { 162 }
163 } 163
164 164 @Override
165 // @Override 165 public void startDocument() throws SAXException {
166 public void startDocument() throws SAXException { 166 }
167 } 167
168 168 @Override
169 // @Override 169 public void startPrefixMapping(String prefix, String uri)
170 public void startPrefixMapping(String prefix, String uri) 170 throws SAXException {
171 throws SAXException { 171 }
172 } 172
173 173 @Override
174 // @Override 174 public void startElement(String uri, String localName, String qName,
175 public void startElement(String uri, String localName, String qName, 175 Attributes atts) throws SAXException {
176 Attributes atts) throws SAXException { 176 labelStacks.add(null);
177 labelStacks.add(null); 177
178 178 TagAction ta = tagActions.get(localName);
179 TagAction ta = tagActions.get(localName); 179 if (ta != null) {
180 if (ta != null) { 180 if(ta.changesTagLevel()) {
181 if(ta.changesTagLevel()) { 181 tagLevel++;
182 tagLevel++; 182 }
183 } 183 flush = ta.start(this, localName, qName, atts) | flush;
184 flush = ta.start(this, localName, qName, atts) | flush; 184 } else {
185 } else { 185 tagLevel++;
186 tagLevel++; 186 flush = true;
187 flush = true; 187 }
188 } 188
189 189 lastEvent = Event.START_TAG;
190 lastEvent = Event.START_TAG; 190 lastStartTag = localName;
191 lastStartTag = localName; 191 }
192 } 192
193 193 @Override
194 // @Override 194 public void endElement(String uri, String localName, String qName)
195 public void endElement(String uri, String localName, String qName) 195 throws SAXException {
196 throws SAXException { 196 TagAction ta = tagActions.get(localName);
197 TagAction ta = tagActions.get(localName); 197 if (ta != null) {
198 if (ta != null) { 198 flush = ta.end(this, localName, qName) | flush;
199 flush = ta.end(this, localName, qName) | flush; 199 } else {
200 } else { 200 flush = true;
201 flush = true; 201 }
202 } 202
203 203 if(ta == null || ta.changesTagLevel()) {
204 if(ta == null || ta.changesTagLevel()) { 204 tagLevel--;
205 tagLevel--; 205 }
206 } 206
207 207 if (flush) {
208 if (flush) { 208 flushBlock();
209 flushBlock(); 209 }
210 } 210
211 211 lastEvent = Event.END_TAG;
212 lastEvent = Event.END_TAG; 212 lastEndTag = localName;
213 lastEndTag = localName; 213
214 214 labelStacks.removeLast();
215 labelStacks.removeLast(); 215 }
216 } 216
217 217 @Override
218 // @Override 218 public void characters(char[] ch, int start, int length)
219 public void characters(char[] ch, int start, int length) 219 throws SAXException {
220 throws SAXException { 220 textElementIdx++;
221 textElementIdx++; 221
222 222
223 223 if (flush) {
224 if (flush) { 224 flushBlock();
225 flushBlock(); 225 flush = false;
226 flush = false; 226 }
227 } 227
228 228 if (inIgnorableElement != 0) {
229 if (inIgnorableElement != 0) { 229 return;
230 return; 230 }
231 } 231
232 232 char c;
233 char c; 233 boolean startWhitespace = false;
234 boolean startWhitespace = false; 234 boolean endWhitespace = false;
235 boolean endWhitespace = false; 235 if (length == 0) {
236 if (length == 0) { 236 return;
237 return; 237 }
238 } 238
239 239 final int end = start + length;
240 final int end = start + length; 240 for (int i = start; i < end; i++) {
241 for (int i = start; i < end; i++) { 241 if (StringUtil.isWhitespace(ch[i])) {
242 if (StringUtil.isWhitespace(ch[i])) { 242 ch[i] = ' ';
243 ch[i] = ' '; 243 }
244 } 244 }
245 } 245 while (start < end) {
246 while (start < end) { 246 c = ch[start];
247 c = ch[start]; 247 if (c == ' ') {
248 if (c == ' ') { 248 startWhitespace = true;
249 startWhitespace = true; 249 start++;
250 start++; 250 length--;
251 length--; 251 } else {
252 } else { 252 break;
253 break; 253 }
254 } 254 }
255 } 255 while (length > 0) {
256 while (length > 0) { 256 c = ch[start + length - 1];
257 c = ch[start + length - 1]; 257 if (c == ' ') {
258 if (c == ' ') { 258 endWhitespace = true;
259 endWhitespace = true; 259 length--;
260 length--; 260 } else {
261 } else { 261 break;
262 break; 262 }
263 } 263 }
264 } 264 if (length == 0) {
265 if (length == 0) { 265 if (startWhitespace || endWhitespace) {
266 if (startWhitespace || endWhitespace) { 266 if (!sbLastWasWhitespace) {
267 if (!sbLastWasWhitespace) { 267 textBuffer.append(' ');
268 textBuffer.append(' '); 268 tokenBuffer.append(' ');
269 tokenBuffer.append(' '); 269 }
270 } 270 sbLastWasWhitespace = true;
271 sbLastWasWhitespace = true; 271 } else {
272 } else { 272 sbLastWasWhitespace = false;
273 sbLastWasWhitespace = false; 273 }
274 } 274 lastEvent = Event.WHITESPACE;
275 lastEvent = Event.WHITESPACE; 275 return;
276 return; 276 }
277 } 277 if (startWhitespace) {
278 if (startWhitespace) { 278 if (!sbLastWasWhitespace) {
279 if (!sbLastWasWhitespace) { 279 textBuffer.append(' ');
280 textBuffer.append(' '); 280 tokenBuffer.append(' ');
281 tokenBuffer.append(' '); 281 }
282 } 282 }
283 } 283
284 284 if (blockTagLevel == -1) {
285 if (blockTagLevel == -1) { 285 blockTagLevel = tagLevel;
286 blockTagLevel = tagLevel; 286 }
287 } 287
288 288 textBuffer.append(ch, start, length);
289 textBuffer.append(ch, start, length); 289 tokenBuffer.append(ch, start, length);
290 tokenBuffer.append(ch, start, length); 290 if (endWhitespace) {
291 if (endWhitespace) { 291 textBuffer.append(' ');
292 textBuffer.append(' '); 292 tokenBuffer.append(' ');
293 tokenBuffer.append(' '); 293 }
294 } 294
295 295 sbLastWasWhitespace = endWhitespace;
296 sbLastWasWhitespace = endWhitespace; 296 lastEvent = Event.CHARACTERS;
297 lastEvent = Event.CHARACTERS; 297
298 298 currentContainedTextElements.add(textElementIdx);
299 currentContainedTextElements.add(textElementIdx); 299 }
300 } 300
301 301 List<TextBlock> getTextBlocks() {
302 List<TextBlock> getTextBlocks() { 302 return textBlocks;
303 return textBlocks; 303 }
304 } 304
305 305 public void flushBlock() {
306 public void flushBlock() { 306 if (inBody == 0) {
307 if (inBody == 0) { 307 textBuffer.setLength(0);
308 if ("TITLE".equalsIgnoreCase(lastStartTag) && inBody == 0) { 308 tokenBuffer.setLength(0);
309 setTitle(tokenBuffer.toString().trim()); 309 return;
310 } 310 }
311 textBuffer.setLength(0); 311
312 tokenBuffer.setLength(0); 312 final int length = tokenBuffer.length();
313 return; 313 switch (length) {
314 } 314 case 0:
315 315 return;
316 final int length = tokenBuffer.length(); 316 case 1:
317 switch (length) { 317 if (sbLastWasWhitespace) {
318 case 0: 318 textBuffer.setLength(0);
319 return; 319 tokenBuffer.setLength(0);
320 case 1: 320 return;
321 if (sbLastWasWhitespace) { 321 }
322 textBuffer.setLength(0); 322 }
323 tokenBuffer.setLength(0); 323 final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer);
324 return; 324
325 } 325 int numWords = 0;
326 } 326 int numLinkedWords = 0;
327 final String[] tokens = UnicodeTokenizer.tokenize(tokenBuffer); 327 int numWrappedLines = 0;
328 328 int currentLineLength = -1; // don't count the first space
329 int numWords = 0; 329 final int maxLineLength = 80;
330 int numLinkedWords = 0; 330 int numTokens = 0;
331 int numWrappedLines = 0; 331 int numWordsCurrentLine = 0;
332 int currentLineLength = -1; // don't count the first space 332
333 final int maxLineLength = 80; 333 for (String token : tokens) {
334 int numTokens = 0; 334 if (ANCHOR_TEXT_START.equals(token)) {
335 int numWordsCurrentLine = 0; 335 inAnchorText = true;
336 336 } else if (ANCHOR_TEXT_END.equals(token)) {
337 for (String token : tokens) { 337 inAnchorText = false;
338 if (ANCHOR_TEXT_START.equals(token)) { 338 } else if (isWord(token)) {
339 inAnchorText = true; 339 numTokens++;
340 } else if (ANCHOR_TEXT_END.equals(token)) { 340 numWords++;
341 inAnchorText = false; 341 numWordsCurrentLine++;
342 } else if (isWord(token)) { 342 if (inAnchorText) {
343 numTokens++; 343 numLinkedWords++;
344 numWords++; 344 }
345 numWordsCurrentLine++; 345 final int tokenLength = token.length();
346 if (inAnchorText) { 346 currentLineLength += tokenLength + 1;
347 numLinkedWords++; 347 if (currentLineLength > maxLineLength) {
348 } 348 numWrappedLines++;
349 final int tokenLength = token.length(); 349 currentLineLength = tokenLength;
350 currentLineLength += tokenLength + 1; 350 numWordsCurrentLine = 1;
351 if (currentLineLength > maxLineLength) { 351 }
352 numWrappedLines++; 352 } else {
353 currentLineLength = tokenLength; 353 numTokens++;
354 numWordsCurrentLine = 1; 354 }
355 } 355 }
356 } else { 356 if (numTokens == 0) {
357 numTokens++; 357 return;
358 } 358 }
359 } 359 int numWordsInWrappedLines;
360 if (numTokens == 0) { 360 if (numWrappedLines == 0) {
361 return; 361 numWordsInWrappedLines = numWords;
362 } 362 numWrappedLines = 1;
363 int numWordsInWrappedLines; 363 } else {
364 if (numWrappedLines == 0) { 364 numWordsInWrappedLines = numWords - numWordsCurrentLine;
365 numWordsInWrappedLines = numWords; 365 }
366 numWrappedLines = 1; 366
367 } else { 367 TextBlock tb = new TextBlock(StringUtil.javaTrim(textBuffer.toString()),
368 numWordsInWrappedLines = numWords - numWordsCurrentLine; 368 currentContainedTextElements, numWords, numLinkedWords,
369 } 369 numWordsInWrappedLines, numWrappedLines, offsetBlocks);
370 370 currentContainedTextElements = new HashSet<Integer>();
371 TextBlock tb = new TextBlock(StringUtil.javaTrim(textBuffer.toString()), 371
372 currentContainedTextElements, numWords, numLinkedWords, 372 offsetBlocks++;
373 numWordsInWrappedLines, numWrappedLines, offsetBlocks); 373
374 currentContainedTextElements = new HashSet<Integer>(); 374 textBuffer.setLength(0);
375 375 tokenBuffer.setLength(0);
376 offsetBlocks++; 376
377 377 tb.setTagLevel(blockTagLevel);
378 textBuffer.setLength(0); 378 addTextBlock(tb);
379 tokenBuffer.setLength(0); 379 blockTagLevel = -1;
380 380 }
381 tb.setTagLevel(blockTagLevel); 381
382 addTextBlock(tb); 382 protected void addTextBlock(final TextBlock tb) {
383 blockTagLevel = -1; 383
384 } 384 for (Integer l : fontSizeStack) {
385 385 if (l != null) {
386 protected void addTextBlock(final TextBlock tb) { 386 tb.addLabel("font-" + l);
387 387 break;
388 for (Integer l : fontSizeStack) { 388 }
389 if (l != null) { 389 }
390 tb.addLabel("font-" + l); 390 for (LinkedList<LabelAction> labelStack : labelStacks) {
391 break; 391 if (labelStack != null) {
392 } 392 for (LabelAction labels : labelStack) {
393 } 393 if (labels != null) {
394 for (LinkedList<LabelAction> labelStack : labelStacks) { 394 labels.addTo(tb);
395 if (labelStack != null) { 395 }
396 for (LabelAction labels : labelStack) { 396 }
397 if (labels != null) { 397 }
398 labels.addTo(tb); 398 }
399 } 399
400 } 400 textBlocks.add(tb);
401 } 401 }
402 } 402
403 403 public static boolean isWord(final String token) {
404 textBlocks.add(tb); 404 return PAT_VALID_WORD_CHARACTER.matcher(token).find();
405 } 405 }
406 406
407 public static boolean isWord(final String token) { 407 static private enum Event {
408 return PAT_VALID_WORD_CHARACTER.matcher(token).find(); 408 START_TAG, END_TAG, CHARACTERS, WHITESPACE
409 } 409 }
410 410
411 static private enum Event { 411
412 START_TAG, END_TAG, CHARACTERS, WHITESPACE 412 /**
413 } 413 * Returns a {@link TextDocument} containing the extracted {@link TextBlock}
414 414 * s. NOTE: Only call this after parsing.
415 public String getTitle() { 415 *
416 return title; 416 * @return The {@link TextDocument}
417 } 417 */
418 418 public TextDocument toTextDocument() {
419 public void setTitle(String s) { 419 // just to be sure
420 if (s == null || s.length() == 0) { 420 flushBlock();
421 return; 421 // TODO(yfriedman): When BoilerpipeHTMLContentHandler is finished being moved to
422 } 422 // DomToSaxVisitor, we should be able to set Title directly.
423 title = s; 423 return new TextDocument(null, getTextBlocks());
424 } 424 }
425 425
426 /** 426 public void addWhitespaceIfNecessary() {
427 * Returns a {@link TextDocument} containing the extracted {@link TextBlock} 427 if (!sbLastWasWhitespace) {
428 * s. NOTE: Only call this after parsing. 428 tokenBuffer.append(' ');
429 * 429 textBuffer.append(' ');
430 * @return The {@link TextDocument} 430 sbLastWasWhitespace = true;
431 */ 431 }
432 public TextDocument toTextDocument() { 432 }
433 // just to be sure 433
434 flushBlock(); 434 public void addLabelAction(final LabelAction la)
435 435 throws IllegalStateException {
436 return new TextDocument(getTitle(), getTextBlocks()); 436 LinkedList<LabelAction> labelStack = labelStacks.getLast();
437 } 437 if (labelStack == null) {
438 438 labelStack = new LinkedList<LabelAction>();
439 public void addWhitespaceIfNecessary() { 439 labelStacks.removeLast();
440 if (!sbLastWasWhitespace) { 440 labelStacks.add(labelStack);
441 tokenBuffer.append(' '); 441 }
442 textBuffer.append(' '); 442 labelStack.add(la);
443 sbLastWasWhitespace = true; 443 }
444 } 444
445 } 445 private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern
446 446 .compile(
447 public void addLabelAction(final LabelAction la)
448 throws IllegalStateException {
449 LinkedList<LabelAction> labelStack = labelStacks.getLast();
450 if (labelStack == null) {
451 labelStack = new LinkedList<LabelAction>();
452 labelStacks.removeLast();
453 labelStacks.add(labelStack);
454 }
455 labelStack.add(la);
456 }
457
458 private static final Pattern PAT_VALID_WORD_CHARACTER = Pattern
459 .compile(
460 "[" + 447 "[" +
461 "\u0030-\u0039\u0041-\u005a\u0061-\u007a\u00aa\u 00b2-\u00b3\u00b5\u00b9-\u00ba\u00bc-\u00be\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02 36\u0250-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ee\u037a\u0386\u0388-\u038a\u038c\u 038e-\u03a1\u03a3-\u03ce\u03d0-\u03f5\u03f7-\u03fb\u0400-\u0481\u048a-\u04ce\u04 d0-\u04f5\u04f8-\u04f9\u0500-\u050f\u0531-\u0556\u0559\u0561-\u0587\u05d0-\u05ea \u05f0-\u05f2\u0621-\u063a\u0640-\u064a\u0660-\u0669\u066e-\u066f\u0671-\u06d3\u 06d5\u06e5-\u06e6\u06ee-\u06fc\u06ff\u0710\u0712-\u072f\u074d-\u074f\u0780-\u07a 5\u07b1\u0904-\u0939\u093d\u0950\u0958-\u0961\u0966-\u096f\u0985-\u098c\u098f-\u 0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09dc-\u09dd\u09df-\u09e 1\u09e6-\u09f1\u09f4-\u09f9\u0a05-\u0a0a\u0a0f-\u0a10\u0a13-\u0a28\u0a2a-\u0a30\ u0a32-\u0a33\u0a35-\u0a36\u0a38-\u0a39\u0a59-\u0a5c\u0a5e\u0a66-\u0a6f\u0a72-\u0 a74\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2-\u0ab3\u0ab5-\u0ab 9\u0abd\u0ad0\u0ae0-\u0ae1\u0ae6-\u0aef\u0b05-\u0b0c\u0b0f-\u0b10\u0b13-\u0b28\u 0b2a-\u0b30\u0b32-\u0b33\u0b35-\u0b39\u0b3d\u0b5c-\u0b5d\u0b5f-\u0b61\u0b66-\u0b 6f\u0b71\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99-\u0b9a\u0b9c\u0b9e-\ u0b9f\u0ba3-\u0ba4\u0ba8-\u0baa\u0bae-\u0bb5\u0bb7-\u0bb9\u0be7-\u0bf2\u0c05-\u0 c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c33\u0c35-\u0c39\u0c60-\u0c61\u0c66-\u0c6 f\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbd\u0cde\u 0ce0-\u0ce1\u0ce6-\u0cef\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d28\u0d2a-\u0d39\u0d 60-\u0d61\u0d66-\u0d6f\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6 \u0e01-\u0e30\u0e32-\u0e33\u0e40-\u0e46\u0e50-\u0e59\u0e81-\u0e82\u0e84\u0e87-\u 0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab \u0ead-\u0eb0\u0eb2-\u0eb3\u0ebd\u0ec0-\u0ec4\u0ec6\u0ed0-\u0ed9\u0edc-\u0edd\u0 f00\u0f20-\u0f33\u0f40-\u0f47\u0f49-\u0f6a\u0f88-\u0f8b\u1000-\u1021\u1023-\u102 7\u1029-\u102a\u1040-\u1049\u1050-\u1055\u10a0-\u10c5\u10d0-\u10f8\u1100-\u1159\ 
u115f-\u11a2\u11a8-\u11f9\u1200-\u1206\u1208-\u1246\u1248\u124a-\u124d\u1250-\u1 256\u1258\u125a-\u125d\u1260-\u1286\u1288\u128a-\u128d\u1290-\u12ae\u12b0\u12b2- \u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12ce\u12d0-\u12d6\u12d8-\u12ee\u1 2f0-\u130e\u1310\u1312-\u1315\u1318-\u131e\u1320-\u1346\u1348-\u135a\u1369-\u137 c\u13a0-\u13f4\u1401-\u166c\u166f-\u1676\u1681-\u169a\u16a0-\u16ea\u16ee-\u16f0\ u1700-\u170c\u170e-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176c\u176e-\u1770\u1 780-\u17b3\u17d7\u17dc\u17e0-\u17e9\u17f0-\u17f9\u1810-\u1819\u1820-\u1877\u1880 -\u18a8\u1900-\u191c\u1946-\u196d\u1970-\u1974\u1d00-\u1d6b\u1e00-\u1e9b\u1ea0-\ u1ef9\u1f00-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f 5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\ u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2070-\u2071\u2 074-\u2079\u207f-\u2089\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\ u2128\u212a-\u212d\u212f-\u2131\u2133-\u2139\u213d-\u213f\u2145-\u2149\u2153-\u2 183\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3005-\u3007\u3021-\u3029\u3031-\u303 5\u3038-\u303c\u3041-\u3096\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312c\ u3131-\u318e\u3192-\u3195\u31a0-\u31b7\u31f0-\u31ff\u3220-\u3229\u3251-\u325f\u3 280-\u3289\u32b1-\u32bf\u3400-\u4db5\u4e00-\u9fa5\ua000-\ua48c\uac00-\ud7a3\uf90 0-\ufa2d\ufa30-\ufa6a\ufb00-\ufb06\ufb13-\ufb17\ufb1d\ufb1f-\ufb28\ufb2a-\ufb36\ ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\uf d8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe70-\ufe74\ufe76-\ufefc\uff10-\uff19\uff21-\uff3 a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc" 448 "\u0030-\u0039\u0041-\u005a\u0061-\u007a\u00aa\u 00b2-\u00b3\u00b5\u00b9-\u00ba\u00bc-\u00be\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u02 36\u0250-\u02c1\u02c6-\u02d1\u02e0-\u02e4\u02ee\u037a\u0386\u0388-\u038a\u038c\u 038e-\u03a1\u03a3-\u03ce\u03d0-\u03f5\u03f7-\u03fb\u0400-\u0481\u048a-\u04ce\u04 
d0-\u04f5\u04f8-\u04f9\u0500-\u050f\u0531-\u0556\u0559\u0561-\u0587\u05d0-\u05ea \u05f0-\u05f2\u0621-\u063a\u0640-\u064a\u0660-\u0669\u066e-\u066f\u0671-\u06d3\u 06d5\u06e5-\u06e6\u06ee-\u06fc\u06ff\u0710\u0712-\u072f\u074d-\u074f\u0780-\u07a 5\u07b1\u0904-\u0939\u093d\u0950\u0958-\u0961\u0966-\u096f\u0985-\u098c\u098f-\u 0990\u0993-\u09a8\u09aa-\u09b0\u09b2\u09b6-\u09b9\u09bd\u09dc-\u09dd\u09df-\u09e 1\u09e6-\u09f1\u09f4-\u09f9\u0a05-\u0a0a\u0a0f-\u0a10\u0a13-\u0a28\u0a2a-\u0a30\ u0a32-\u0a33\u0a35-\u0a36\u0a38-\u0a39\u0a59-\u0a5c\u0a5e\u0a66-\u0a6f\u0a72-\u0 a74\u0a85-\u0a8d\u0a8f-\u0a91\u0a93-\u0aa8\u0aaa-\u0ab0\u0ab2-\u0ab3\u0ab5-\u0ab 9\u0abd\u0ad0\u0ae0-\u0ae1\u0ae6-\u0aef\u0b05-\u0b0c\u0b0f-\u0b10\u0b13-\u0b28\u 0b2a-\u0b30\u0b32-\u0b33\u0b35-\u0b39\u0b3d\u0b5c-\u0b5d\u0b5f-\u0b61\u0b66-\u0b 6f\u0b71\u0b83\u0b85-\u0b8a\u0b8e-\u0b90\u0b92-\u0b95\u0b99-\u0b9a\u0b9c\u0b9e-\ u0b9f\u0ba3-\u0ba4\u0ba8-\u0baa\u0bae-\u0bb5\u0bb7-\u0bb9\u0be7-\u0bf2\u0c05-\u0 c0c\u0c0e-\u0c10\u0c12-\u0c28\u0c2a-\u0c33\u0c35-\u0c39\u0c60-\u0c61\u0c66-\u0c6 f\u0c85-\u0c8c\u0c8e-\u0c90\u0c92-\u0ca8\u0caa-\u0cb3\u0cb5-\u0cb9\u0cbd\u0cde\u 0ce0-\u0ce1\u0ce6-\u0cef\u0d05-\u0d0c\u0d0e-\u0d10\u0d12-\u0d28\u0d2a-\u0d39\u0d 60-\u0d61\u0d66-\u0d6f\u0d85-\u0d96\u0d9a-\u0db1\u0db3-\u0dbb\u0dbd\u0dc0-\u0dc6 \u0e01-\u0e30\u0e32-\u0e33\u0e40-\u0e46\u0e50-\u0e59\u0e81-\u0e82\u0e84\u0e87-\u 0e88\u0e8a\u0e8d\u0e94-\u0e97\u0e99-\u0e9f\u0ea1-\u0ea3\u0ea5\u0ea7\u0eaa-\u0eab \u0ead-\u0eb0\u0eb2-\u0eb3\u0ebd\u0ec0-\u0ec4\u0ec6\u0ed0-\u0ed9\u0edc-\u0edd\u0 f00\u0f20-\u0f33\u0f40-\u0f47\u0f49-\u0f6a\u0f88-\u0f8b\u1000-\u1021\u1023-\u102 7\u1029-\u102a\u1040-\u1049\u1050-\u1055\u10a0-\u10c5\u10d0-\u10f8\u1100-\u1159\ u115f-\u11a2\u11a8-\u11f9\u1200-\u1206\u1208-\u1246\u1248\u124a-\u124d\u1250-\u1 256\u1258\u125a-\u125d\u1260-\u1286\u1288\u128a-\u128d\u1290-\u12ae\u12b0\u12b2- \u12b5\u12b8-\u12be\u12c0\u12c2-\u12c5\u12c8-\u12ce\u12d0-\u12d6\u12d8-\u12ee\u1 
2f0-\u130e\u1310\u1312-\u1315\u1318-\u131e\u1320-\u1346\u1348-\u135a\u1369-\u137 c\u13a0-\u13f4\u1401-\u166c\u166f-\u1676\u1681-\u169a\u16a0-\u16ea\u16ee-\u16f0\ u1700-\u170c\u170e-\u1711\u1720-\u1731\u1740-\u1751\u1760-\u176c\u176e-\u1770\u1 780-\u17b3\u17d7\u17dc\u17e0-\u17e9\u17f0-\u17f9\u1810-\u1819\u1820-\u1877\u1880 -\u18a8\u1900-\u191c\u1946-\u196d\u1970-\u1974\u1d00-\u1d6b\u1e00-\u1e9b\u1ea0-\ u1ef9\u1f00-\u1f15\u1f18-\u1f1d\u1f20-\u1f45\u1f48-\u1f4d\u1f50-\u1f57\u1f59\u1f 5b\u1f5d\u1f5f-\u1f7d\u1f80-\u1fb4\u1fb6-\u1fbc\u1fbe\u1fc2-\u1fc4\u1fc6-\u1fcc\ u1fd0-\u1fd3\u1fd6-\u1fdb\u1fe0-\u1fec\u1ff2-\u1ff4\u1ff6-\u1ffc\u2070-\u2071\u2 074-\u2079\u207f-\u2089\u2102\u2107\u210a-\u2113\u2115\u2119-\u211d\u2124\u2126\ u2128\u212a-\u212d\u212f-\u2131\u2133-\u2139\u213d-\u213f\u2145-\u2149\u2153-\u2 183\u2460-\u249b\u24ea-\u24ff\u2776-\u2793\u3005-\u3007\u3021-\u3029\u3031-\u303 5\u3038-\u303c\u3041-\u3096\u309d-\u309f\u30a1-\u30fa\u30fc-\u30ff\u3105-\u312c\ u3131-\u318e\u3192-\u3195\u31a0-\u31b7\u31f0-\u31ff\u3220-\u3229\u3251-\u325f\u3 280-\u3289\u32b1-\u32bf\u3400-\u4db5\u4e00-\u9fa5\ua000-\ua48c\uac00-\ud7a3\uf90 0-\ufa2d\ufa30-\ufa6a\ufb00-\ufb06\ufb13-\ufb17\ufb1d\ufb1f-\ufb28\ufb2a-\ufb36\ ufb38-\ufb3c\ufb3e\ufb40-\ufb41\ufb43-\ufb44\ufb46-\ufbb1\ufbd3-\ufd3d\ufd50-\uf d8f\ufd92-\ufdc7\ufdf0-\ufdfb\ufe70-\ufe74\ufe76-\ufefc\uff10-\uff19\uff21-\uff3 a\uff41-\uff5a\uff66-\uffbe\uffc2-\uffc7\uffca-\uffcf\uffd2-\uffd7\uffda-\uffdc"
462 + "]"); 449 + "]");
463 450
464 } 451 }
OLDNEW
« no previous file with comments | « no previous file | src/com/dom_distiller/client/ContentExtractor.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698