Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(111)

Side by Side Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2793103002: Parse JSON in Blink for CopylessPaste. (Closed)
Patch Set: Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "modules/document_metadata/CopylessPasteExtractor.h" 5 #include "modules/document_metadata/CopylessPasteExtractor.h"
6 6
7 #include <algorithm>
8 #include <memory>
9 #include <utility>
7 #include "core/HTMLNames.h" 10 #include "core/HTMLNames.h"
8 #include "core/dom/Document.h" 11 #include "core/dom/Document.h"
9 #include "core/dom/ElementTraversal.h" 12 #include "core/dom/ElementTraversal.h"
10 #include "core/frame/LocalFrame.h" 13 #include "core/frame/LocalFrame.h"
11 #include "core/html/HTMLElement.h" 14 #include "core/html/HTMLElement.h"
12 #include "platform/Histogram.h" 15 #include "platform/Histogram.h"
13 #include "platform/instrumentation/tracing/TraceEvent.h" 16 #include "platform/instrumentation/tracing/TraceEvent.h"
17 #include "platform/json/JSONParser.h"
18 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h"
19 #include "wtf/Vector.h"
14 #include "wtf/text/StringBuilder.h" 20 #include "wtf/text/StringBuilder.h"
15 21
16 namespace blink { 22 namespace blink {
17 23
18 namespace { 24 namespace {
19 25
20 String extractMetadata(Element& root) { 26 using mojom::blink::Entity;
21 StringBuilder result; 27 using mojom::blink::EntityPtr;
22 result.append("["); 28 using mojom::blink::Property;
23 bool multiple = false; 29 using mojom::blink::PropertyPtr;
30 using mojom::blink::Values;
31 using mojom::blink::ValuesPtr;
32 using mojom::blink::WebPage;
33 using mojom::blink::WebPagePtr;
34
35 // App Indexing enforces a max nesting depth of 5. Our top level message
36 // corresponds to the WebPage, so this only leaves 4 more levels.
37 // TODO(dproctor): Do we want to fail parsing, or (more likely) only pass the
wychen 2017/04/04 02:07:14 Best-effort parsing sounds good.
dproctor 2017/04/04 07:13:34 Done.
38 // top levels to Icing?
39 constexpr int kMaxDepth = 4;
40 // Some strings are very long, and we don't currently use those, so limit string
41 // length to something reasonable to avoid undue pressure on Icing. Note that
42 // App Indexing supports strings up to length 20k.
43 constexpr int kMaxStringLength = 200;
44 // Enforced by App Indexing, so stop processing early if possible.
45 constexpr size_t kMaxNumFields = 20;
46 // Enforced by App Indexing, so stop processing early if possible.
47 constexpr size_t kMaxRepeatedSize = 100;
48
49 constexpr char kJSONLDKeyType[] = "@type";
50 constexpr char kJSONLDKeyGraph[] = "@graph";
51
52 void extractEntity(JSONObject* val, EntityPtr* entity) {
wychen 2017/04/04 02:07:14 Sorry for the confusion. It might be better to pas…
dproctor 2017/04/04 06:21:19 Done.
53 String type;
54 val->getString(kJSONLDKeyType, &type);
55 if (!type) {
56 type = "Thing";
57 }
58 (*entity)->type = type;
59 for (size_t i = 0; i < std::min(val->size(), kMaxNumFields); ++i) {
60 PropertyPtr property = Property::New();
61 JSONObject::Entry entry = val->at(i);
62 property->name = entry.first;
63 if (property->name == kJSONLDKeyType) {
64 continue;
65 }
66 property->values = Values::New();
67 JSONValue::ValueType type = entry.second->getType();
68
69 bool addProperty = true;
70
71 switch (type) {
72 case JSONValue::ValueType::TypeBoolean: {
73 bool v;
74 val->getBoolean(entry.first, &v);
75 property->values->get_bool_values().push_back(v);
wychen 2017/04/04 02:07:14 Why is this different from the rest?
dproctor 2017/04/04 06:21:19 Done.
76 } break;
77 case JSONValue::ValueType::TypeInteger: {
78 int v;
79 val->getInteger(entry.first, &v);
80 property->values->set_long_values(Vector<int64_t>(1, v));
81 } break;
82 case JSONValue::ValueType::TypeDouble: {
83 double v;
84 val->getDouble(entry.first, &v);
85 String s = String::number(v);
86 s.truncate(kMaxStringLength);
87 property->values->set_string_values(Vector<String>(1, s));
88 } break;
89 case JSONValue::ValueType::TypeString: {
90 String v;
91 val->getString(entry.first, &v);
92 v.truncate(kMaxStringLength);
93 property->values->set_string_values(Vector<String>(1, v));
94 } break;
95 case JSONValue::ValueType::TypeObject: {
96 property->values->set_entity_values(Vector<EntityPtr>());
97 property->values->get_entity_values().push_back(Entity::New());
98
99 extractEntity(val->getObject(entry.first),
100 &(property->values->get_entity_values().at(0)));
101 } break;
102 case JSONValue::ValueType::TypeArray: {
103 JSONArray* arr = val->getArray(entry.first);
104 if (arr->size() < 1) {
105 addProperty = false;
106 break;
107 }
108
109 type = arr->at(0)->getType();
110 if (type == JSONArray::ValueType::TypeArray) {
111 // App Indexing doesn't support nested arrays.
112 addProperty = false;
113 break;
114 }
115 for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) {
116 JSONValue* innerVal = arr->at(j);
117 if (innerVal->getType() != type) {
118 addProperty = false;
wychen 2017/04/04 02:07:14 Add a comment about mixed types.
dproctor 2017/04/04 06:21:19 Done.
119 break;
120 }
121 switch (innerVal->getType()) {
122 case JSONValue::ValueType::TypeBoolean: {
123 if (!property->values->is_bool_values()) {
124 property->values->set_bool_values(Vector<bool>());
125 }
126 bool v;
127 innerVal->asBoolean(&v);
128 property->values->get_bool_values().push_back(v);
129 } break;
130 case JSONValue::ValueType::TypeInteger: {
131 if (!property->values->is_long_values()) {
132 property->values->set_long_values(Vector<int64_t>());
133 }
134 int v;
135 innerVal->asInteger(&v);
136 property->values->get_long_values().push_back(v);
137 } break;
138 case JSONValue::ValueType::TypeDouble: {
139 if (!property->values->is_string_values()) {
140 property->values->set_string_values(Vector<String>());
141 }
142 double v;
143 val->getDouble(entry.first, &v);
144 String s = String::number(v);
145 s.truncate(kMaxStringLength);
146 property->values->get_string_values().push_back(s);
147 } break;
148 case JSONValue::ValueType::TypeString: {
149 if (!property->values->is_string_values()) {
150 property->values->set_string_values(Vector<String>());
151 }
152 String v;
153 innerVal->asString(&v);
154 v.truncate(kMaxStringLength);
155 property->values->get_string_values().push_back(v);
156 } break;
157 case JSONValue::ValueType::TypeObject:
158 if (!property->values->is_entity_values()) {
159 property->values->set_entity_values(Vector<EntityPtr>());
160 }
161 property->values->get_entity_values().push_back(Entity::New());
162 extractEntity(JSONObject::cast(innerVal),
163 &(property->values->get_entity_values().at(j)));
164 break;
165 default:
166 break;
167 }
168 }
169 } break;
170 default:
171 break;
172 }
173 if (addProperty)
174 (*entity)->properties.push_back(std::move(property));
175 }
176 }
177
178 bool isWhitelistedType(String type) {
wychen 2017/04/04 02:07:15 Would AtomicString be a premature optimization?
dproctor 2017/04/04 06:21:19 Done.
179 DEFINE_STATIC_LOCAL(HashSet<String>, elements,
180 ({// Common types that include addresses.
181 "AutoDealer", "Hotel", "LocalBusiness", "Organization",
182 "Person", "Place", "PostalAddress", "Product",
183 "Residence", "Restaurant", "SingleFamilyResidence",
184 // Common types including phone numbers
185 "Store", "ContactPoint", "LodgingBusiness"}));
186 return type && elements.contains(type);
187 }
188
189 void extractTopLevelEntity(JSONObject* val, Vector<EntityPtr>* entities) {
190 // Now we have a JSONObject which corresponds to a single (possibly nested)
191 // entity.
192 EntityPtr entity = Entity::New();
193 String type;
194 val->getString(kJSONLDKeyType, &type);
195 if (!isWhitelistedType(type)) {
196 return;
197 }
198 extractEntity(val, &entity);
199 entities->push_back(std::move(entity));
200 }
201
202 void extractEntitiesFromArray(JSONArray* arr, Vector<EntityPtr>* entities) {
203 for (size_t i = 0; i < arr->size(); ++i) {
204 JSONValue* val = arr->at(i);
205 switch (val->getType()) {
206 case JSONValue::ValueType::TypeObject:
207 extractTopLevelEntity(JSONObject::cast(val), entities);
208 break;
209 default:
210 // TODO(dproctor): :(
211 return;
212 }
213 }
214 }
215
216 void extractEntityFromTopLevelObject(JSONObject* val,
217 Vector<EntityPtr>* entities) {
218 JSONArray* graph = val->getArray(kJSONLDKeyGraph);
219 if (graph) {
220 extractEntitiesFromArray(graph, entities);
221 }
222 extractTopLevelEntity(val, entities);
223 }
224
225 bool extractMetadata(const Element& root, Vector<EntityPtr>* entities) {
wychen 2017/04/04 02:07:15 Similarly, Vector<>& might be better.
dproctor 2017/04/04 06:21:19 Done.
24 for (Element& element : ElementTraversal::descendantsOf(root)) { 226 for (Element& element : ElementTraversal::descendantsOf(root)) {
25 if (element.hasTagName(HTMLNames::scriptTag) && 227 if (element.hasTagName(HTMLNames::scriptTag) &&
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { 228 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
27 if (multiple) { 229 std::unique_ptr<JSONValue> json =
28 result.append(","); 230 parseJSON(element.textContent(), kMaxDepth);
231 if (!json.get()) {
232 LOG(ERROR) << "Failed to parse json.";
wychen 2017/04/04 02:07:14 Needs to be more specific.
dproctor 2017/04/04 06:21:19 What did you have in mind? I don't believe that th…
233 return false;
29 } 234 }
30 result.append(element.textContent()); 235 switch (json->getType()) {
31 multiple = true; 236 case JSONValue::ValueType::TypeArray:
32 } 237 extractEntitiesFromArray(JSONArray::cast(json.get()), entities);
33 } 238 break;
34 result.append("]"); 239 case JSONValue::ValueType::TypeObject:
35 return result.toString(); 240 extractEntityFromTopLevelObject(JSONObject::cast(json.get()),
241 entities);
242 break;
243 default:
244 return false;
245 }
246 }
247 }
248 return !entities->isEmpty();
36 } 249 }
37 250
38 } // namespace 251 } // namespace
39 252
40 String CopylessPasteExtractor::extract(Document& document) { 253 bool CopylessPasteExtractor::extract(const Document& document,
254 mojom::blink::WebPagePtr* page) {
wychen 2017/04/04 02:07:14 Similarly, using WebPage& here might be better.
dproctor 2017/04/04 06:21:19 Done.
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); 255 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
42 256
43 if (!document.frame() || !document.frame()->isMainFrame()) 257 if (!document.frame() || !document.frame()->isMainFrame())
44 return emptyString; 258 return false;
45 259
46 DCHECK(document.hasFinishedParsing()); 260 DCHECK(document.hasFinishedParsing());
47 261
48 Element* html = document.documentElement(); 262 Element* html = document.documentElement();
49 if (!html) 263 if (!html)
50 return emptyString; 264 return false;
51 265
52 double startTime = monotonicallyIncreasingTime(); 266 double startTime = monotonicallyIncreasingTime();
53 267
54 // Traverse the DOM tree and extract the metadata. 268 // Traverse the DOM tree and extract the metadata.
55 String result = extractMetadata(*html); 269 if (!extractMetadata(*html, &(page->get()->entities)))
270 return false;
271 page->get()->url = document.url().getString();
272 page->get()->title = document.title();
56 273
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; 274 double elapsedTime = monotonicallyIncreasingTime() - startTime;
58 275
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, 276 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); 277 ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); 278 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
62 return result; 279 return true;
63 } 280 }
64 281
65 } // namespace blink 282 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698