Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(974)

Side by Side Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2777623002: Move json-ld parsing to Blink.
Patch Set: update policy enforcement in blink, clank handling of repeated values Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "modules/document_metadata/CopylessPasteExtractor.h" 5 #include "modules/document_metadata/CopylessPasteExtractor.h"
6 6
7 #include <memory>
8 #include <algorithm>
7 #include "core/HTMLNames.h" 9 #include "core/HTMLNames.h"
8 #include "core/dom/Document.h" 10 #include "core/dom/Document.h"
9 #include "core/dom/ElementTraversal.h" 11 #include "core/dom/ElementTraversal.h"
10 #include "core/frame/LocalFrame.h" 12 #include "core/frame/LocalFrame.h"
11 #include "core/html/HTMLElement.h" 13 #include "core/html/HTMLElement.h"
12 #include "platform/Histogram.h" 14 #include "platform/Histogram.h"
13 #include "platform/instrumentation/tracing/TraceEvent.h" 15 #include "platform/instrumentation/tracing/TraceEvent.h"
16 #include "platform/json/JSONParser.h"
17 #include "wtf/Vector.h"
14 #include "wtf/text/StringBuilder.h" 18 #include "wtf/text/StringBuilder.h"
15 19
16 namespace blink { 20 namespace blink {
17 21
22 // TODO(dproctor): Temporary structs to hold entity return value. Replace with
23 // whatever the mojo service uses, once that is ready.
24 struct Property;
25
// A structured-data entity: a list of named properties. Property is only
// forward-declared at this point, so it appears here purely as a Vector
// element type.
26 struct Entity {
27 Vector<Property> properties;
28 };
// Equality and debug-streaming helpers for Entity; both are defined later in
// this file.
29 bool operator==(const Entity&, const Entity&);
30 std::ostream& operator<<(std::ostream&, const Entity&);
31
// One named property of an Entity. |type| selects which of the value vectors
// below carries the data; operator== and operator<< only look at the vector
// that corresponds to |type|. For a property that came from a JSON array,
// extractEntity() stores the type of the array's elements in |type| and one
// vector entry per element.
32 struct Property {
33 String name;
// NOTE(review): |type| has no default initializer, so it must be assigned
// before a Property is compared or printed.
34 JSONValue::ValueType type;
35
36 Vector<bool> boolVal;
37 Vector<int> intVal;
38 Vector<double> doubleVal;
39 Vector<String> strVal;
40 Vector<Entity> entityVal;
41 };
// Equality and debug-streaming helpers for Property; both are defined later in
// this file.
42 bool operator==(const Property&, const Property&);
43 std::ostream& operator<<(std::ostream&, const Property&);
44
// Result of an extraction: the document's URL and title plus the entities
// that were extracted from it (filled in by CopylessPasteExtractor::extract).
45 struct WebPage {
46 String url;
47 String title;
48 Vector<Entity> entities;
49 };
// Equality and debug-streaming helpers for WebPage; both are defined later in
// this file.
50 bool operator==(const WebPage&, const WebPage&);
51 std::ostream& operator<<(std::ostream&, const WebPage&);
52
53 bool operator==(const Entity& lhs, const Entity& rhs) {
54 return lhs.properties == rhs.properties;
55 }
56 bool operator==(const Property& lhs, const Property& rhs) {
57 if (!(lhs.name == rhs.name && lhs.type == rhs.type)) {
58 return false;
59 }
60 switch (lhs.type) {
61 case JSONValue::ValueType::TypeBoolean:
62 return lhs.boolVal == rhs.boolVal;
63 case JSONValue::ValueType::TypeInteger:
64 return lhs.intVal == rhs.intVal;
65 case JSONValue::ValueType::TypeDouble:
66 return lhs.doubleVal == rhs.doubleVal;
67 case JSONValue::ValueType::TypeString:
68 return lhs.strVal == rhs.strVal;
69 case JSONValue::ValueType::TypeObject:
70 return lhs.entityVal == rhs.entityVal;
71 default:
72 return false;
73 }
74 }
75 bool operator==(const WebPage& lhs, const WebPage& rhs) {
76 return lhs.url == rhs.url && lhs.title == rhs.title &&
77 lhs.entities == rhs.entities;
78 }
79 std::ostream& operator<<(std::ostream& os, const Entity& v) {
80 os << "ENTITY: [";
81 for (auto p : v.properties) {
82 os << p;
83 }
84 os << "]";
85 return os;
86 }
87 std::ostream& operator<<(std::ostream& os, const Property& v) {
88 os << "Name: " << v.name << " TYPE : " << v.type << " VALUE: [ ";
89 switch (v.type) {
90 case JSONValue::ValueType::TypeBoolean:
91 for (auto b : v.boolVal)
92 os << b;
93 case JSONValue::ValueType::TypeInteger:
94 for (auto i : v.intVal)
95 os << i;
96 break;
97 case JSONValue::ValueType::TypeDouble:
98 for (auto d : v.doubleVal)
99 os << d;
100 break;
101 case JSONValue::ValueType::TypeString:
102 for (auto s : v.strVal)
103 os << s;
104 break;
105 case JSONValue::ValueType::TypeObject:
106 for (auto e : v.entityVal)
107 os << e;
108 break;
109 default:
110 break;
111 }
112 os << " ]";
113 return os;
114 }
115 std::ostream& operator<<(std::ostream& os, const WebPage& v) {
116 os << "URL: " << v.url << "TITLE: " << v.title << "ENTITIES: [";
117 for (auto e : v.entities) {
118 os << e;
119 }
120 os << "]";
121 return os;
122 }
123
18 namespace { 124 namespace {
19 125
20 String extractMetadata(Element& root) { 126 // App Indexing enforces a max nesting depth of 5. Our top level message
21 StringBuilder result; 127 // corresponds to the WebPage, so this only leaves 4 more levels.
22 result.append("["); 128 // TODO(dproctor): Do we want to fail parsing, or (more likely) only pass the
23 bool multiple = false; 129 // top levels to Icing?
// Passed to parseJSON() as the maximum nesting depth when a JSON-LD script is
// parsed in extractMetadata().
130 constexpr int kMaxDepth = 4;
131 // Some strings are very long, and we don't currently use those, so limit string
132 // length to something reasonable to avoid undue pressure on Icing. Note that
133 // App Indexing supports strings up to length 20k.
134 constexpr int kMaxStringLength = 200;
135 // Enforced by App Indexing, so stop processing early if possible.
136 constexpr size_t kMaxNumFields = 20;
137 // Enforced by App Indexing, so stop processing early if possible.
138 constexpr size_t kMaxRepeatedSize = 100;
139
// Keys with a special meaning in JSON-LD documents.
// NOTE(review): kJSONLDKeyName and kJSONLDKeyContext are not referenced by
// any code visible in this file; confirm they are needed.
140 constexpr char kJSONLDKeyName[] = "name";
141 constexpr char kJSONLDKeyType[] = "@type";
142 constexpr char kJSONLDKeyGraph[] = "@graph";
143 constexpr char kJSONLDKeyContext[] = "@context";
144
145 void extractEntity(JSONObject* val, Entity* entity) {
146 for (size_t i = 0; i < std::min(val->size(), kMaxNumFields); ++i) {
147 Property property;
148 JSONObject::Entry entry = val->at(i);
149 property.name = entry.first;
150 property.type = entry.second->getType();
151 bool addProperty = true;
152
153 switch (property.type) {
154 case JSONValue::ValueType::TypeBoolean: {
155 bool v;
156 val->getBoolean(entry.first, &v);
157 property.boolVal.push_back(v);
158 } break;
159 case JSONValue::ValueType::TypeInteger: {
160 int v;
161 val->getInteger(entry.first, &v);
162 property.intVal.push_back(v);
163 } break;
164 case JSONValue::ValueType::TypeDouble: {
165 double v;
166 val->getDouble(entry.first, &v);
167 property.doubleVal.push_back(v);
168 } break;
169 case JSONValue::ValueType::TypeString: {
170 String v;
171 val->getString(entry.first, &v);
172 v.truncate(kMaxStringLength);
173 property.strVal.push_back(v);
174 } break;
175 case JSONValue::ValueType::TypeObject: {
176 property.entityVal.push_back(Entity());
177 extractEntity(val->getObject(entry.first), &(property.entityVal.at(0)));
178 } break;
179 case JSONValue::ValueType::TypeArray: {
180 JSONArray* arr = val->getArray(entry.first);
181 if (arr->size() < 1) {
182 addProperty = false;
183 break;
184 }
185
186 property.type = arr->at(0)->getType();
187 if (property.type == JSONArray::ValueType::TypeArray) {
188 // App Indexing doesn't support nested arrays.
189 addProperty = false;
190 break;
191 }
192 for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) {
193 JSONValue* innerVal = arr->at(j);
194 if (innerVal->getType() != property.type) {
195 addProperty = false;
196 break;
197 }
198 switch (innerVal->getType()) {
199 case JSONValue::ValueType::TypeBoolean: {
200 bool v;
201 innerVal->asBoolean(&v);
202 property.boolVal.push_back(v);
203 } break;
204 case JSONValue::ValueType::TypeInteger: {
205 int v;
206 innerVal->asInteger(&v);
207 property.intVal.push_back(v);
208 } break;
209 case JSONValue::ValueType::TypeDouble: {
210 double v;
211 innerVal->asDouble(&v);
212 property.doubleVal.push_back(v);
213 } break;
214 case JSONValue::ValueType::TypeString: {
215 String v;
216 innerVal->asString(&v);
217 property.strVal.push_back(v);
218 } break;
219 case JSONValue::ValueType::TypeObject:
220 property.entityVal.push_back(Entity());
221 extractEntity(JSONObject::cast(innerVal),
222 &(property.entityVal.at(j)));
223 break;
224 default:
225 break;
226 }
227 }
228 } break;
229 default:
230 break;
231 }
232 if (addProperty)
233 entity->properties.push_back(property);
234 }
235 }
236
237 bool isWhitelistedType(String type) {
238 DEFINE_STATIC_LOCAL(HashSet<String>, elements,
239 ({// Common types that include addresses.
240 "AutoDealer", "Hotel", "LocalBusiness", "Organization",
241 "Person", "Place", "PostalAddress", "Product",
242 "Residence", "Restaurant", "SingleFamilyResidence",
243 // Common types including phone numbers
244 "Store", "ContactPoint", "LodgingBusiness"}));
245 return type && elements.contains(type);
246 }
247
248 void extractTopLevelEntity(JSONObject* val, Vector<Entity>* entities) {
249 // Now we have a JSONObject which corresponds to a single (possibly nested)
250 // entity.
251 Entity entity;
252 String type;
253 val->getString(kJSONLDKeyType, &type);
254 if (!isWhitelistedType(type)) {
255 return;
256 }
257 extractEntity(val, &entity);
258 entities->push_back(entity);
259 }
260
261 void extractEntitiesFromArray(JSONArray* arr, Vector<Entity>* entities) {
262 for (size_t i = 0; i < arr->size(); ++i) {
263 JSONValue* val = arr->at(i);
264 switch (val->getType()) {
265 case JSONValue::ValueType::TypeObject:
266 extractTopLevelEntity(JSONObject::cast(val), entities);
267 break;
268 default:
269 // TODO(dproctor): :(
270 return;
271 }
272 }
273 }
274
275 void extractEntityFromTopLevelObject(JSONObject* val,
276 Vector<Entity>* entities) {
277 JSONArray* graph = val->getArray(kJSONLDKeyGraph);
278 if (graph) {
279 extractEntitiesFromArray(graph, entities);
280 }
281 extractTopLevelEntity(val, entities);
282 }
283
// New revision: walks all descendants of |root|, parses the text of every
// <script type="application/ld+json"> element with parseJSON() (nesting
// limited to kMaxDepth) and appends the entities found to |entities|.
// Returns true when |entities| is non-empty afterwards.
// NOTE(review): a script that fails to parse, or whose root is neither an
// array nor an object, makes the function return false immediately, even if
// earlier scripts already contributed entities.
// NOTE(review): every successfully parsed payload is logged through
// LOG(ERROR) ("PARSED JSON: ..."); this looks like leftover debugging output.
284 bool extractMetadata(const Element& root, Vector<Entity>* entities) {
24 for (Element& element : ElementTraversal::descendantsOf(root)) { 285 for (Element& element : ElementTraversal::descendantsOf(root)) {
25 if (element.hasTagName(HTMLNames::scriptTag) && 286 if (element.hasTagName(HTMLNames::scriptTag) &&
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { 287 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
27 if (multiple) { 288 std::unique_ptr<JSONValue> json =
28 result.append(","); 289 parseJSON(element.textContent(), kMaxDepth);
290 if (!json.get()) {
291 LOG(ERROR) << "Failed to parse json.";
292 return false;
29 } 293 }
30 result.append(element.textContent()); 294 LOG(ERROR) << "PARSED JSON: " << json->toPrettyJSONString();
31 multiple = true; 295 switch (json->getType()) {
296 case JSONValue::ValueType::TypeArray:
297 extractEntitiesFromArray(JSONArray::cast(json.get()), entities);
298 break;
299 case JSONValue::ValueType::TypeObject:
300 extractEntityFromTopLevelObject(JSONObject::cast(json.get()),
301 entities);
302 break;
303 default:
304 return false;
305 }
32 } 306 }
33 } 307 }
34 result.append("]"); 308 return !entities->isEmpty();
35 return result.toString();
36 } 309 }
37 310
38 } // namespace 311 } // namespace
39 312
// New revision: fills |page| with the entities extracted from |document|
// plus the document's URL and title, and returns true on success. Returns
// false when the document has no frame, is not in the main frame, has no
// document element, or when extractMetadata() reports failure.
// NOTE(review): the "CopylessPaste.ExtractionUs" histogram is only recorded
// on the success path; the early return after extractMetadata() skips it.
// NOTE(review): |page->entities| is written by extractMetadata() before its
// result is known, so it may have been modified even when false is returned.
40 String CopylessPasteExtractor::extract(Document& document) { 313 bool CopylessPasteExtractor::extract(const Document& document, WebPage* page) {
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); 314 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
42 315
43 if (!document.frame() || !document.frame()->isMainFrame()) 316 if (!document.frame() || !document.frame()->isMainFrame())
44 return emptyString; 317 return false;
45 318
46 DCHECK(document.hasFinishedParsing()); 319 DCHECK(document.hasFinishedParsing());
47 320
48 Element* html = document.documentElement(); 321 Element* html = document.documentElement();
49 if (!html) 322 if (!html)
50 return emptyString; 323 return false;
51 324
52 double startTime = monotonicallyIncreasingTime(); 325 double startTime = monotonicallyIncreasingTime();
53 326
54 // Traverse the DOM tree and extract the metadata. 327 // Traverse the DOM tree and extract the metadata.
55 String result = extractMetadata(*html); 328 if (!extractMetadata(*html, &(page->entities)))
329 return false;
330 page->url = document.url().getString();
331 page->title = document.title();
56 332
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; 333 double elapsedTime = monotonicallyIncreasingTime() - startTime;
58 334
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, 335 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); 336 ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); 337 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
62 return result; 338 return true;
63 } 339 }
64 340
65 } // namespace blink 341 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698