OLD | NEW |
1 // Copyright 2014 The Chromium Authors. All rights reserved. | 1 // Copyright 2014 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 /** | 4 /** |
5 * boilerpipe | 5 * boilerpipe |
6 * | 6 * |
7 * Copyright (c) 2009 Christian Kohlschütter | 7 * Copyright (c) 2009 Christian Kohlschütter |
8 * | 8 * |
9 * The author licenses this file to You under the Apache License, Version 2.0 | 9 * The author licenses this file to You under the Apache License, Version 2.0 |
10 * (the "License"); you may not use this file except in compliance with | 10 * (the "License"); you may not use this file except in compliance with |
11 * the License. You may obtain a copy of the License at | 11 * the License. You may obtain a copy of the License at |
12 * | 12 * |
13 * http://www.apache.org/licenses/LICENSE-2.0 | 13 * http://www.apache.org/licenses/LICENSE-2.0 |
14 * | 14 * |
15 * Unless required by applicable law or agreed to in writing, software | 15 * Unless required by applicable law or agreed to in writing, software |
16 * distributed under the License is distributed on an "AS IS" BASIS, | 16 * distributed under the License is distributed on an "AS IS" BASIS, |
17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
18 * See the License for the specific language governing permissions and | 18 * See the License for the specific language governing permissions and |
19 * limitations under the License. | 19 * limitations under the License. |
20 */ | 20 */ |
21 package de.l3s.boilerpipe.document; | 21 package de.l3s.boilerpipe.document; |
22 | 22 |
| 23 import com.google.gwt.dom.client.Node; |
| 24 |
23 import java.util.LinkedList; | 25 import java.util.LinkedList; |
24 import java.util.List; | 26 import java.util.List; |
| 27 import java.util.Set; |
25 | 28 |
26 /** | 29 /** |
27 * A text document, consisting of one or more {@link TextBlock}s. | 30 * A text document, consisting of one or more {@link TextBlock}s, and features |
| 31 * of the original page (e.g. candidate titles, hidden elements, etc). |
28 * | 32 * |
29 * @author Christian Kohlschütter | 33 * @author Christian Kohlschütter |
30 */ | 34 */ |
31 public class TextDocument implements Cloneable { | 35 public class TextDocument implements Cloneable { |
32 final List<TextBlock> textBlocks; | 36 final List<TextBlock> textBlocks; |
33 List<String> candidateTitles; | 37 List<String> candidateTitles; |
| 38 private Set<Node> dataTables; |
| 39 private Set<Node> hiddenElements; |
34 | 40 |
35 /** | 41 /** |
36 * Creates a new {@link TextDocument} with given {@link TextBlock}s, and no | 42 * Creates a new {@link TextDocument} with given {@link TextBlock}s, and no |
37 * title. | 43 * title. |
38 * | 44 * |
39 * @param textBlocks | 45 * @param textBlocks |
40 * The text blocks of this document. | 46 * The text blocks of this document. |
41 */ | 47 */ |
42 public TextDocument(final List<TextBlock> textBlocks) { | 48 public TextDocument(final List<TextBlock> textBlocks) { |
43 this(null, textBlocks); | 49 this(null, textBlocks); |
(...skipping 30 matching lines...) Expand all Loading... |
74 * @return The list of possible titles. | 80 * @return The list of possible titles. |
75 */ | 81 */ |
76 public List<String> getCandidateTitles() { | 82 public List<String> getCandidateTitles() { |
77 return candidateTitles; | 83 return candidateTitles; |
78 } | 84 } |
79 | 85 |
80 /** | 86 /** |
81 * Sets the list of candidate titles. | 87 * Sets the list of candidate titles. |
82 * @param candidateTitles | 88 * @param candidateTitles |
83 */ | 89 */ |
84 public void setCanddiateTitles(List<String> candidateTitles) { | 90 public void setCandidateTitles(List<String> candidateTitles) { |
85 this.candidateTitles = new LinkedList<String>(candidateTitles); | 91 this.candidateTitles = new LinkedList<String>(candidateTitles); |
86 } | 92 } |
| 93 |
87 /** | 94 /** |
88 * Returns the {@link TextDocument}'s content. | 95 * Returns the {@link TextDocument}'s content. |
89 * | 96 * |
90 * @return The content text. | 97 * @return The content text. |
91 */ | 98 */ |
92 public String getContent() { | 99 public String getContent() { |
93 return getText(true, false); | 100 return getText(true, false); |
94 } | 101 } |
95 | 102 |
96 /** | 103 /** |
(...skipping 27 matching lines...) Expand all Loading... |
124 * @return Debug information. | 131 * @return Debug information. |
125 */ | 132 */ |
126 public String debugString() { | 133 public String debugString() { |
127 StringBuilder sb = new StringBuilder(); | 134 StringBuilder sb = new StringBuilder(); |
128 for(TextBlock tb : getTextBlocks()) { | 135 for(TextBlock tb : getTextBlocks()) { |
129 sb.append(tb.toString()); | 136 sb.append(tb.toString()); |
130 sb.append('\n'); | 137 sb.append('\n'); |
131 } | 138 } |
132 return sb.toString(); | 139 return sb.toString(); |
133 } | 140 } |
| 141 |
| 142 /** |
| 143 * Returns the set of nodes from the original Document which were classified
as data tables |
| 144 * (i.e. are treated as an atomic block of text). |
| 145 * @return the set of data tables |
| 146 */ |
| 147 public Set<Node> getDataTables() { |
| 148 return dataTables; |
| 149 } |
| 150 |
| 151 /** |
| 152 * Sets the data tables identified while processing the document. |
| 153 * @param dataTables the set of data tables |
| 154 */ |
| 155 public void setDataTables(Set<Node> dataTables) { |
| 156 this.dataTables = dataTables; |
| 157 } |
| 158 |
| 159 /** |
| 160 * Returns the set of nodes from the original Document which weren't actually
visible. These |
| 161 * are typically omitted from boilerpipe text processing but are tracked for
post-processing. |
| 162 * @return the set of hidden elements |
| 163 */ |
| 164 public Set<Node> getHiddenElements() { |
| 165 return hiddenElements; |
| 166 } |
| 167 |
| 168 /** |
| 169 * Sets the hidden elements identified while processing the document. |
| 170 * @param hiddenElements the set of hidden elements |
| 171 */ |
| 172 public void setHiddenElements(Set<Node> hiddenElements) { |
| 173 this.hiddenElements = hiddenElements; |
| 174 } |
| 175 |
134 } | 176 } |
OLD | NEW |