Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(149)

Side by Side Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2793103002: Parse JSON in Blink for CopylessPaste. (Closed)
Patch Set: move private member functions to local functions Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "modules/document_metadata/CopylessPasteExtractor.h" 5 #include "modules/document_metadata/CopylessPasteExtractor.h"
6 6
7 #include <algorithm>
8 #include <memory>
9 #include <utility>
7 #include "core/HTMLNames.h" 10 #include "core/HTMLNames.h"
8 #include "core/dom/Document.h" 11 #include "core/dom/Document.h"
9 #include "core/dom/ElementTraversal.h" 12 #include "core/dom/ElementTraversal.h"
10 #include "core/frame/LocalFrame.h" 13 #include "core/frame/LocalFrame.h"
11 #include "core/html/HTMLElement.h" 14 #include "core/html/HTMLElement.h"
12 #include "platform/Histogram.h" 15 #include "platform/Histogram.h"
13 #include "platform/instrumentation/tracing/TraceEvent.h" 16 #include "platform/instrumentation/tracing/TraceEvent.h"
17 #include "platform/json/JSONParser.h"
18 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h "
19 #include "wtf/Vector.h"
20 #include "wtf/text/AtomicString.h"
14 #include "wtf/text/StringBuilder.h" 21 #include "wtf/text/StringBuilder.h"
15 22
16 namespace blink { 23 namespace blink {
17 24
18 namespace { 25 namespace {
19 26
20 String extractMetadata(Element& root) { 27 using mojom::blink::Entity;
21 StringBuilder result; 28 using mojom::blink::EntityPtr;
22 result.append("["); 29 using mojom::blink::Property;
23 bool multiple = false; 30 using mojom::blink::PropertyPtr;
31 using mojom::blink::Values;
32 using mojom::blink::ValuesPtr;
33 using mojom::blink::WebPage;
34 using mojom::blink::WebPagePtr;
35
36 // App Indexing enforces a max nesting depth of 5. Our top level message
37 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse
38 // entities up to this depth, and ignore any further nesting. If an object at the
39 // max nesting depth has a property corresponding to an entity, that property
40 // will be dropped. Note that we will still parse json-ld blocks deeper than
41 // this, but it won't be passed to App Indexing.
42 constexpr int kMaxDepth = 4;
43 // Some strings are very long, and we don't currently use those, so limit string
44 // length to something reasonable to avoid undue pressure on Icing. Note that
45 // App Indexing supports strings up to length 20k.
46 constexpr int kMaxStringLength = 200;
47 // Enforced by App Indexing, so stop processing early if possible.
48 constexpr size_t kMaxNumFields = 20;
49 // Enforced by App Indexing, so stop processing early if possible.
50 constexpr size_t kMaxRepeatedSize = 100;
51
52 constexpr char kJSONLDKeyType[] = "@type";
53 constexpr char kJSONLDKeyGraph[] = "@graph";
54 bool isWhitelistedType(AtomicString type) {
55 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements,
56 ({// Common types that include addresses.
57 "AutoDealer", "Hotel", "LocalBusiness", "Organization",
58 "Person", "Place", "PostalAddress", "Product",
59 "Residence", "Restaurant", "SingleFamilyResidence",
60 // Common types including phone numbers
61 "Store", "ContactPoint", "LodgingBusiness"}));
62 return type && elements.contains(type);
63 }
64
65 void extractEntity(const JSONObject&, Entity&, int recursionLevel);
66
67 bool parseRepeatedValue(JSONArray* arr, Values& values, int recursionLevel) {
68 if (arr->size() < 1) {
69 return false;
70 }
71
72 JSONValue::ValueType type = arr->at(0)->getType();
73 if (type == JSONArray::ValueType::TypeArray) {
74 // App Indexing doesn't support nested arrays.
75 return false;
76 }
77 for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) {
78 JSONValue* innerVal = arr->at(j);
79 if (innerVal->getType() != type) {
80 // App Indexing doesn't support mixed types. If there are mixed
81 // types in the parsed object, we will drop the property.
82 return false;
83 }
84 switch (innerVal->getType()) {
85 case JSONValue::ValueType::TypeBoolean: {
86 if (!values.is_bool_values()) {
87 values.set_bool_values(Vector<bool>());
88 }
89 bool v;
90 innerVal->asBoolean(&v);
91 values.get_bool_values().push_back(v);
92 } break;
93 case JSONValue::ValueType::TypeInteger: {
94 if (!values.is_long_values()) {
95 values.set_long_values(Vector<int64_t>());
96 }
97 int v;
98 innerVal->asInteger(&v);
99 values.get_long_values().push_back(v);
100 } break;
101 case JSONValue::ValueType::TypeDouble: {
102 // App Indexing doesn't support double type, so just encode its decimal
103 // value as a string instead.
104 if (!values.is_string_values()) {
105 values.set_string_values(Vector<String>());
106 }
107 double v;
108 innerVal->asDouble(&v);
109 String s = String::number(v);
110 s.truncate(kMaxStringLength);
111 values.get_string_values().push_back(s);
112 } break;
113 case JSONValue::ValueType::TypeString: {
114 if (!values.is_string_values()) {
115 values.set_string_values(Vector<String>());
116 }
117 String v;
118 innerVal->asString(&v);
119 v.truncate(kMaxStringLength);
120 values.get_string_values().push_back(v);
121 } break;
122 case JSONValue::ValueType::TypeObject:
123 if (recursionLevel + 1 >= kMaxDepth) {
124 return false;
125 }
126 if (!values.is_entity_values()) {
127 values.set_entity_values(Vector<EntityPtr>());
128 }
129 values.get_entity_values().push_back(Entity::New());
130 extractEntity(*(JSONObject::cast(innerVal)),
131 *(values.get_entity_values().at(j)), recursionLevel + 1);
132 break;
133 default:
134 break;
135 }
136 }
137 return true;
138 }
139
140 void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) {
141 if (recursionLevel >= kMaxDepth) {
142 return;
143 }
144
145 String type;
146 val.getString(kJSONLDKeyType, &type);
147 if (!type) {
148 type = "Thing";
149 }
150 entity.type = type;
151 for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
152 PropertyPtr property = Property::New();
153 const JSONObject::Entry& entry = val.at(i);
154 property->name = entry.first;
155 if (property->name == kJSONLDKeyType) {
156 continue;
157 }
158 property->values = Values::New();
159
160 bool addProperty = true;
161
162 switch (entry.second->getType()) {
163 case JSONValue::ValueType::TypeBoolean: {
164 bool v;
165 val.getBoolean(entry.first, &v);
166 property->values->set_bool_values(Vector<bool>(1, v));
167 } break;
168 case JSONValue::ValueType::TypeInteger: {
169 int v;
170 val.getInteger(entry.first, &v);
171 property->values->set_long_values(Vector<int64_t>(1, v));
172 } break;
173 case JSONValue::ValueType::TypeDouble: {
174 double v;
175 val.getDouble(entry.first, &v);
176 String s = String::number(v);
177 s.truncate(kMaxStringLength);
178 property->values->set_string_values(Vector<String>(1, s));
179 } break;
180 case JSONValue::ValueType::TypeString: {
181 String v;
182 val.getString(entry.first, &v);
183 v.truncate(kMaxStringLength);
184 property->values->set_string_values(Vector<String>(1, v));
185 } break;
186 case JSONValue::ValueType::TypeObject: {
187 if (recursionLevel + 1 >= kMaxDepth) {
188 addProperty = false;
189 break;
190 }
191 property->values->set_entity_values(Vector<EntityPtr>());
192 property->values->get_entity_values().push_back(Entity::New());
193
194 extractEntity(*(val.getObject(entry.first)),
195 *(property->values->get_entity_values().at(0)),
196 recursionLevel + 1);
197 } break;
198 case JSONValue::ValueType::TypeArray:
199 addProperty = parseRepeatedValue(val.getArray(entry.first),
200 *(property->values), recursionLevel);
201 break;
202 default:
203 break;
204 }
205 if (addProperty)
206 entity.properties.push_back(std::move(property));
207 }
208 }
209
210 void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
211 // Now we have a JSONObject which corresponds to a single (possibly nested)
212 // entity.
213 EntityPtr entity = Entity::New();
214 String type;
215 val.getString(kJSONLDKeyType, &type);
216 if (!isWhitelistedType(AtomicString(type))) {
217 return;
218 }
219 extractEntity(val, *(entity.get()), 0);
220 entities.push_back(std::move(entity));
221 }
222
223 void extractEntitiesFromArray(JSONArray& arr, Vector<EntityPtr>& entities) {
224 for (size_t i = 0; i < arr.size(); ++i) {
225 JSONValue* val = arr.at(i);
226 if (val->getType() == JSONValue::ValueType::TypeObject) {
227 extractTopLevelEntity(*(JSONObject::cast(val)), entities);
228 }
229 }
230 }
231
232 void extractEntityFromTopLevelObject(const JSONObject& val,
233 Vector<EntityPtr>& entities) {
234 JSONArray* graph = val.getArray(kJSONLDKeyGraph);
235 if (graph) {
236 extractEntitiesFromArray(*graph, entities);
237 }
238 extractTopLevelEntity(val, entities);
239 }
240
241 bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) {
24 for (Element& element : ElementTraversal::descendantsOf(root)) { 242 for (Element& element : ElementTraversal::descendantsOf(root)) {
25 if (element.hasTagName(HTMLNames::scriptTag) && 243 if (element.hasTagName(HTMLNames::scriptTag) &&
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { 244 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
27 if (multiple) { 245 std::unique_ptr<JSONValue> json = parseJSON(element.textContent());
28 result.append(","); 246 if (!json.get()) {
247 LOG(ERROR) << "Failed to parse json.";
248 return false;
29 } 249 }
30 result.append(element.textContent()); 250 switch (json->getType()) {
31 multiple = true; 251 case JSONValue::ValueType::TypeArray:
32 } 252 extractEntitiesFromArray(*(JSONArray::cast(json.get())), entities);
33 } 253 break;
34 result.append("]"); 254 case JSONValue::ValueType::TypeObject:
35 return result.toString(); 255 extractEntityFromTopLevelObject(*(JSONObject::cast(json.get())),
256 entities);
257 break;
258 default:
259 return false;
260 }
261 }
262 }
263 return !entities.isEmpty();
36 } 264 }
37 265
38 } // namespace 266 } // namespace
39 267
40 String CopylessPasteExtractor::extract(Document& document) { 268 bool CopylessPasteExtractor::extract(const Document& document,
269 mojom::blink::WebPage& page) {
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); 270 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
42 271
43 if (!document.frame() || !document.frame()->isMainFrame()) 272 if (!document.frame() || !document.frame()->isMainFrame())
44 return emptyString; 273 return false;
45 274
46 DCHECK(document.hasFinishedParsing()); 275 DCHECK(document.hasFinishedParsing());
47 276
48 Element* html = document.documentElement(); 277 Element* html = document.documentElement();
49 if (!html) 278 if (!html)
50 return emptyString; 279 return false;
51 280
52 double startTime = monotonicallyIncreasingTime(); 281 double startTime = monotonicallyIncreasingTime();
53 282
54 // Traverse the DOM tree and extract the metadata. 283 // Traverse the DOM tree and extract the metadata.
55 String result = extractMetadata(*html); 284 if (!extractMetadata(*html, page.entities))
285 return false;
286 page.url = document.url().getString();
wychen 2017/04/05 00:59:11 After https://codereview.chromium.org/2789313002/#
dproctor 2017/04/05 01:48:52 Done.
287 page.title = document.title();
56 288
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; 289 double elapsedTime = monotonicallyIncreasingTime() - startTime;
58 290
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, 291 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); 292 ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); 293 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
62 return result; 294 return true;
63 } 295 }
64 296
65 } // namespace blink 297 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698