Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(19)

Side by Side Diff: src/de/l3s/boilerpipe/document/TextDocument.java

Issue 499623002: Instrument DomDistiller with timing information. (Closed) Base URL: https://code.google.com/p/dom-distiller/@master
Patch Set: Created 6 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/com/dom_distiller/client/DomUtil.java ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 The Chromium Authors. All rights reserved. 1 // Copyright 2014 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 /** 4 /**
5 * boilerpipe 5 * boilerpipe
6 * 6 *
7 * Copyright (c) 2009 Christian Kohlschütter 7 * Copyright (c) 2009 Christian Kohlschütter
8 * 8 *
9 * The author licenses this file to You under the Apache License, Version 2.0 9 * The author licenses this file to You under the Apache License, Version 2.0
10 * (the "License"); you may not use this file except in compliance with 10 * (the "License"); you may not use this file except in compliance with
11 * the License. You may obtain a copy of the License at 11 * the License. You may obtain a copy of the License at
12 * 12 *
13 * http://www.apache.org/licenses/LICENSE-2.0 13 * http://www.apache.org/licenses/LICENSE-2.0
14 * 14 *
15 * Unless required by applicable law or agreed to in writing, software 15 * Unless required by applicable law or agreed to in writing, software
16 * distributed under the License is distributed on an "AS IS" BASIS, 16 * distributed under the License is distributed on an "AS IS" BASIS,
17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 * See the License for the specific language governing permissions and 18 * See the License for the specific language governing permissions and
19 * limitations under the License. 19 * limitations under the License.
20 */ 20 */
21 package de.l3s.boilerpipe.document; 21 package de.l3s.boilerpipe.document;
22 22
23 import com.google.gwt.dom.client.Node;
24
23 import java.util.LinkedList; 25 import java.util.LinkedList;
24 import java.util.List; 26 import java.util.List;
27 import java.util.Set;
25 28
26 /** 29 /**
27 * A text document, consisting of one or more {@link TextBlock}s. 30 * A text document, consisting of one or more {@link TextBlock}s, and features
31 * of the original page (e.g. candidate titles, hidden elements, etc).
28 * 32 *
29 * @author Christian Kohlschütter 33 * @author Christian Kohlschütter
30 */ 34 */
31 public class TextDocument implements Cloneable { 35 public class TextDocument implements Cloneable {
32 final List<TextBlock> textBlocks; 36 final List<TextBlock> textBlocks;
33 List<String> candidateTitles; 37 List<String> candidateTitles;
38 private Set<Node> dataTables;
39 private Set<Node> hiddenElements;
34 40
35 /** 41 /**
36 * Creates a new {@link TextDocument} with given {@link TextBlock}s, and no 42 * Creates a new {@link TextDocument} with given {@link TextBlock}s, and no
37 * title. 43 * title.
38 * 44 *
39 * @param textBlocks 45 * @param textBlocks
40 * The text blocks of this document. 46 * The text blocks of this document.
41 */ 47 */
42 public TextDocument(final List<TextBlock> textBlocks) { 48 public TextDocument(final List<TextBlock> textBlocks) {
43 this(null, textBlocks); 49 this(null, textBlocks);
(...skipping 30 matching lines...) Expand all
74 * @return The list of possible titles. 80 * @return The list of possible titles.
75 */ 81 */
76 public List<String> getCandidateTitles() { 82 public List<String> getCandidateTitles() {
77 return candidateTitles; 83 return candidateTitles;
78 } 84 }
79 85
80 /** 86 /**
81 * Sets the list of candidate titles. 87 * Sets the list of candidate titles.
82 * @param candidateTitles 88 * @param candidateTitles
83 */ 89 */
84 public void setCanddiateTitles(List<String> candidateTitles) { 90 public void setCandidateTitles(List<String> candidateTitles) {
85 this.candidateTitles = new LinkedList<String>(candidateTitles); 91 this.candidateTitles = new LinkedList<String>(candidateTitles);
86 } 92 }
93
87 /** 94 /**
88 * Returns the {@link TextDocument}'s content. 95 * Returns the {@link TextDocument}'s content.
89 * 96 *
90 * @return The content text. 97 * @return The content text.
91 */ 98 */
92 public String getContent() { 99 public String getContent() {
93 return getText(true, false); 100 return getText(true, false);
94 } 101 }
95 102
96 /** 103 /**
(...skipping 27 matching lines...) Expand all
124 * @return Debug information. 131 * @return Debug information.
125 */ 132 */
126 public String debugString() { 133 public String debugString() {
127 StringBuilder sb = new StringBuilder(); 134 StringBuilder sb = new StringBuilder();
128 for(TextBlock tb : getTextBlocks()) { 135 for(TextBlock tb : getTextBlocks()) {
129 sb.append(tb.toString()); 136 sb.append(tb.toString());
130 sb.append('\n'); 137 sb.append('\n');
131 } 138 }
132 return sb.toString(); 139 return sb.toString();
133 } 140 }
141
142 /**
143 * Returns the set of nodes from the original Document which were classified as data tables
144 * (i.e. are treated as an atomic block of text).
145 * @return the set of data tables
146 */
147 public Set<Node> getDataTables() {
148 return dataTables;
149 }
150
151 /**
152 * Sets the data tables identified while processing the document.
153 * @param dataTables the set of data tables
154 */
155 public void setDataTables(Set<Node> dataTables) {
156 this.dataTables = dataTables;
157 }
158
159 /**
160 * Returns the set of nodes from the original Document which weren't actually visible. These
161 * are typically omitted from boilerpipe text processing but are tracked for post-processing.
162 * @return the set of hidden elements
163 */
164 public Set<Node> getHiddenElements() {
165 return hiddenElements;
166 }
167
168 /**
169 * Sets the hidden elements identified while processing the document.
170 * @param hiddenElements the set of hidden elements
171 */
172 public void setHiddenElements(Set<Node> hiddenElements) {
173 this.hiddenElements = hiddenElements;
174 }
175
134 } 176 }
OLDNEW
« no previous file with comments | « src/com/dom_distiller/client/DomUtil.java ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698