Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(487)

Side by Side Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2793103002: Parse JSON in Blink for CopylessPaste. (Closed)
Patch Set: parse object best-effort up to max nesting depth Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "modules/document_metadata/CopylessPasteExtractor.h" 5 #include "modules/document_metadata/CopylessPasteExtractor.h"
6 6
7 #include <algorithm>
8 #include <memory>
9 #include <utility>
7 #include "core/HTMLNames.h" 10 #include "core/HTMLNames.h"
8 #include "core/dom/Document.h" 11 #include "core/dom/Document.h"
9 #include "core/dom/ElementTraversal.h" 12 #include "core/dom/ElementTraversal.h"
10 #include "core/frame/LocalFrame.h" 13 #include "core/frame/LocalFrame.h"
11 #include "core/html/HTMLElement.h" 14 #include "core/html/HTMLElement.h"
12 #include "platform/Histogram.h" 15 #include "platform/Histogram.h"
13 #include "platform/instrumentation/tracing/TraceEvent.h" 16 #include "platform/instrumentation/tracing/TraceEvent.h"
17 #include "platform/json/JSONParser.h"
18 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h"
19 #include "wtf/Vector.h"
20 #include "wtf/text/AtomicString.h"
14 #include "wtf/text/StringBuilder.h" 21 #include "wtf/text/StringBuilder.h"
15 22
16 namespace blink { 23 namespace blink {
17 24
18 namespace { 25 namespace {
19 26
20 String extractMetadata(Element& root) { 27 using mojom::blink::Entity;
21 StringBuilder result; 28 using mojom::blink::EntityPtr;
22 result.append("["); 29 using mojom::blink::Property;
23 bool multiple = false; 30 using mojom::blink::PropertyPtr;
31 using mojom::blink::Values;
32 using mojom::blink::ValuesPtr;
33 using mojom::blink::WebPage;
34 using mojom::blink::WebPagePtr;
35
36 // App Indexing enforces a max nesting depth of 5. Our top level message
37 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse
38 // entities up to this depth, and ignore any further nesting. If an object at the
39 // max nesting depth has a property corresponding to an entity, that property
40 // will be dropped. Note that we will still parse json-ld blocks deeper than
41 // this, but it won't be passed to App Indexing.
42 constexpr int kMaxDepth = 4;
43 // Some strings are very long, and we don't currently use those, so limit string
44 // length to something reasonable to avoid undue pressure on Icing. Note that
45 // App Indexing supports strings up to length 20k.
46 constexpr int kMaxStringLength = 200;
47 // Enforced by App Indexing, so stop processing early if possible.
48 constexpr size_t kMaxNumFields = 20;
49 // Enforced by App Indexing, so stop processing early if possible.
50 constexpr size_t kMaxRepeatedSize = 100;
51
52 constexpr char kJSONLDKeyType[] = "@type";
53 constexpr char kJSONLDKeyGraph[] = "@graph";
54
55 void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) {
wychen 2017/04/04 18:40:19 This function feels a bit long.
dproctor 2017/04/04 20:44:05 Split out the nested switch statement.
56 if (recursionLevel >= kMaxDepth) {
57 return;
58 }
59
60 String type;
61 val.getString(kJSONLDKeyType, &type);
62 if (!type) {
63 type = "Thing";
64 }
65 entity.type = type;
66 for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
67 PropertyPtr property = Property::New();
68 const JSONObject::Entry entry = val.at(i);
wychen 2017/04/04 18:40:19 Would Entry& work?
dproctor 2017/04/04 20:44:05 Done.
69 property->name = entry.first;
70 if (property->name == kJSONLDKeyType) {
71 continue;
72 }
73 property->values = Values::New();
74 JSONValue::ValueType type = entry.second->getType();
wychen 2017/04/04 18:40:19 inline in the switch to avoid confusion from Strin
dproctor 2017/04/04 20:44:04 Done.
75
76 bool addProperty = true;
77
78 switch (type) {
79 case JSONValue::ValueType::TypeBoolean: {
80 bool v;
81 val.getBoolean(entry.first, &v);
82 property->values->set_bool_values(Vector<bool>(1, v));
83 } break;
84 case JSONValue::ValueType::TypeInteger: {
85 int v;
86 val.getInteger(entry.first, &v);
87 property->values->set_long_values(Vector<int64_t>(1, v));
88 } break;
89 case JSONValue::ValueType::TypeDouble: {
90 double v;
91 val.getDouble(entry.first, &v);
92 String s = String::number(v);
93 s.truncate(kMaxStringLength);
94 property->values->set_string_values(Vector<String>(1, s));
95 } break;
96 case JSONValue::ValueType::TypeString: {
97 String v;
98 val.getString(entry.first, &v);
99 v.truncate(kMaxStringLength);
100 property->values->set_string_values(Vector<String>(1, v));
101 } break;
102 case JSONValue::ValueType::TypeObject: {
103 if (recursionLevel + 1 >= kMaxDepth) {
104 addProperty = false;
105 break;
106 }
107 property->values->set_entity_values(Vector<EntityPtr>());
108 property->values->get_entity_values().push_back(Entity::New());
109
110 extractEntity(*(val.getObject(entry.first)),
111 *(property->values->get_entity_values().at(0)),
112 recursionLevel + 1);
113 } break;
114 case JSONValue::ValueType::TypeArray: {
wychen 2017/04/04 18:40:19 Probably split this part out. The nested switch an
dproctor 2017/04/04 20:44:04 Done.
115 JSONArray* arr = val.getArray(entry.first);
wychen 2017/04/04 18:40:19 const
dproctor 2017/04/04 20:44:04 JSONArray::at isn't const.
116 if (arr->size() < 1) {
117 addProperty = false;
118 break;
119 }
120
121 type = arr->at(0)->getType();
122 if (type == JSONArray::ValueType::TypeArray) {
123 // App Indexing doesn't support nested arrays.
124 addProperty = false;
125 break;
126 }
127 for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) {
128 JSONValue* innerVal = arr->at(j);
129 if (innerVal->getType() != type) {
130 // App Indexing doesn't support mixed types. If there are mixed
131 // types in the parsed object, we will drop the property.
132 addProperty = false;
133 break;
134 }
135 switch (innerVal->getType()) {
136 case JSONValue::ValueType::TypeBoolean: {
137 if (!property->values->is_bool_values()) {
138 property->values->set_bool_values(Vector<bool>());
139 }
140 bool v;
141 innerVal->asBoolean(&v);
142 property->values->get_bool_values().push_back(v);
143 } break;
144 case JSONValue::ValueType::TypeInteger: {
145 if (!property->values->is_long_values()) {
146 property->values->set_long_values(Vector<int64_t>());
147 }
148 int v;
149 innerVal->asInteger(&v);
150 property->values->get_long_values().push_back(v);
151 } break;
152 case JSONValue::ValueType::TypeDouble: {
153 if (!property->values->is_string_values()) {
154 property->values->set_string_values(Vector<String>());
155 }
156 double v;
157 val.getDouble(entry.first, &v);
158 String s = String::number(v);
159 s.truncate(kMaxStringLength);
160 property->values->get_string_values().push_back(s);
161 } break;
162 case JSONValue::ValueType::TypeString: {
163 if (!property->values->is_string_values()) {
164 property->values->set_string_values(Vector<String>());
165 }
166 String v;
167 innerVal->asString(&v);
168 v.truncate(kMaxStringLength);
169 property->values->get_string_values().push_back(v);
170 } break;
171 case JSONValue::ValueType::TypeObject:
172 if (recursionLevel + 1 >= kMaxDepth) {
173 addProperty = false;
174 break;
175 }
176 if (!property->values->is_entity_values()) {
177 property->values->set_entity_values(Vector<EntityPtr>());
178 }
179 property->values->get_entity_values().push_back(Entity::New());
180 extractEntity(*(JSONObject::cast(innerVal)),
181 *(property->values->get_entity_values().at(j)),
182 recursionLevel + 1);
183 break;
184 default:
185 break;
186 }
187 }
188 } break;
189 default:
190 break;
191 }
192 if (addProperty)
193 entity.properties.push_back(std::move(property));
194 }
195 }
196
197 bool isWhitelistedType(AtomicString type) {
198 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements,
199 ({// Common types that include addresses.
200 "AutoDealer", "Hotel", "LocalBusiness", "Organization",
201 "Person", "Place", "PostalAddress", "Product",
202 "Residence", "Restaurant", "SingleFamilyResidence",
203 // Common types including phone numbers
204 "Store", "ContactPoint", "LodgingBusiness"}));
205 return type && elements.contains(type);
206 }
207
208 void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
209 // Now we have a JSONObject which corresponds to a single (possibly nested)
210 // entity.
211 EntityPtr entity = Entity::New();
212 String type;
213 val.getString(kJSONLDKeyType, &type);
214 if (!isWhitelistedType(AtomicString(type))) {
215 return;
216 }
217 extractEntity(val, *(entity.get()), 0);
218 entities.push_back(std::move(entity));
219 }
220
221 void extractEntitiesFromArray(JSONArray& arr, Vector<EntityPtr>& entities) {
wychen 2017/04/04 18:40:19 const arr
dproctor 2017/04/04 20:44:04 JSONArray::at isn't const.
wychen 2017/04/04 23:15:32 Hmm. This is unexpected, but OK. Let's see how fas
wychen 2017/04/05 16:29:41 https://codereview.chromium.org/2795393002/ has la
dproctor 2017/04/05 17:41:44 Done.
222 for (size_t i = 0; i < arr.size(); ++i) {
223 JSONValue* val = arr.at(i);
wychen 2017/04/04 18:40:19 const
dproctor 2017/04/04 20:44:04 Done.
224 if (val->getType() == JSONValue::ValueType::TypeObject) {
225 extractTopLevelEntity(*(JSONObject::cast(val)), entities);
226 }
227 }
228 }
229
230 void extractEntityFromTopLevelObject(const JSONObject& val,
231 Vector<EntityPtr>& entities) {
232 JSONArray* graph = val.getArray(kJSONLDKeyGraph);
wychen 2017/04/04 18:40:19 const
dproctor 2017/04/04 20:44:05 Done.
233 if (graph) {
234 extractEntitiesFromArray(*graph, entities);
235 }
236 extractTopLevelEntity(val, entities);
237 }
238
239 bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) {
24 for (Element& element : ElementTraversal::descendantsOf(root)) { 240 for (Element& element : ElementTraversal::descendantsOf(root)) {
25 if (element.hasTagName(HTMLNames::scriptTag) && 241 if (element.hasTagName(HTMLNames::scriptTag) &&
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { 242 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
27 if (multiple) { 243 std::unique_ptr<JSONValue> json = parseJSON(element.textContent());
wychen 2017/04/04 18:40:19 We could use the maxDepth version.
dproctor 2017/04/04 20:44:04 So, there are actually two different notions of ma
wychen 2017/04/04 23:15:32 My bad. JSONParser returns nullptr when exceeding
28 result.append(","); 244 if (!json.get()) {
245 LOG(ERROR) << "Failed to parse json.";
246 return false;
29 } 247 }
30 result.append(element.textContent()); 248 switch (json->getType()) {
31 multiple = true; 249 case JSONValue::ValueType::TypeArray:
32 } 250 extractEntitiesFromArray(*(JSONArray::cast(json.get())), entities);
33 } 251 break;
34 result.append("]"); 252 case JSONValue::ValueType::TypeObject:
35 return result.toString(); 253 extractEntityFromTopLevelObject(*(JSONObject::cast(json.get())),
254 entities);
255 break;
256 default:
257 return false;
258 }
259 }
260 }
261 return !entities.isEmpty();
36 } 262 }
37 263
38 } // namespace 264 } // namespace
39 265
40 String CopylessPasteExtractor::extract(Document& document) { 266 bool CopylessPasteExtractor::extract(const Document& document,
267 mojom::blink::WebPage& page) {
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); 268 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
42 269
43 if (!document.frame() || !document.frame()->isMainFrame()) 270 if (!document.frame() || !document.frame()->isMainFrame())
44 return emptyString; 271 return false;
45 272
46 DCHECK(document.hasFinishedParsing()); 273 DCHECK(document.hasFinishedParsing());
47 274
48 Element* html = document.documentElement(); 275 Element* html = document.documentElement();
49 if (!html) 276 if (!html)
50 return emptyString; 277 return false;
51 278
52 double startTime = monotonicallyIncreasingTime(); 279 double startTime = monotonicallyIncreasingTime();
53 280
54 // Traverse the DOM tree and extract the metadata. 281 // Traverse the DOM tree and extract the metadata.
55 String result = extractMetadata(*html); 282 if (!extractMetadata(*html, page.entities))
283 return false;
284 page.url = document.url().getString();
wychen 2017/04/04 18:40:19 The mojo type will change from string to GURL.
dproctor 2017/04/04 20:44:04 Acknowledged.
285 page.title = document.title();
56 286
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; 287 double elapsedTime = monotonicallyIncreasingTime() - startTime;
58 288
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, 289 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); 290 ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); 291 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
62 return result; 292 return true;
63 } 293 }
64 294
65 } // namespace blink 295 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698