Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(203)

Side by Side Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2793103002: Parse JSON in Blink for CopylessPaste. (Closed)
Patch Set: more const Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "modules/document_metadata/CopylessPasteExtractor.h" 5 #include "modules/document_metadata/CopylessPasteExtractor.h"
6 6
7 #include <algorithm>
8 #include <memory>
9 #include <utility>
7 #include "core/HTMLNames.h" 10 #include "core/HTMLNames.h"
8 #include "core/dom/Document.h" 11 #include "core/dom/Document.h"
9 #include "core/dom/ElementTraversal.h" 12 #include "core/dom/ElementTraversal.h"
10 #include "core/frame/LocalFrame.h" 13 #include "core/frame/LocalFrame.h"
11 #include "core/html/HTMLElement.h" 14 #include "core/html/HTMLElement.h"
12 #include "platform/Histogram.h" 15 #include "platform/Histogram.h"
13 #include "platform/instrumentation/tracing/TraceEvent.h" 16 #include "platform/instrumentation/tracing/TraceEvent.h"
17 #include "platform/json/JSONParser.h"
18 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h "
19 #include "wtf/Vector.h"
20 #include "wtf/text/AtomicString.h"
14 #include "wtf/text/StringBuilder.h" 21 #include "wtf/text/StringBuilder.h"
15 22
16 namespace blink { 23 namespace blink {
17 24
18 namespace { 25 namespace {
19 26
20 String extractMetadata(Element& root) { 27 using mojom::blink::Entity;
21 StringBuilder result; 28 using mojom::blink::EntityPtr;
22 result.append("["); 29 using mojom::blink::Property;
23 bool multiple = false; 30 using mojom::blink::PropertyPtr;
31 using mojom::blink::Values;
32 using mojom::blink::ValuesPtr;
33 using mojom::blink::WebPage;
34 using mojom::blink::WebPagePtr;
35
36 // App Indexing enforces a max nesting depth of 5. Our top level message
37 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse
38 // entities up to this depth, and ignore any further nesting. If an object at the
39 // max nesting depth has a property corresponding to an entity, that property
40 // will be dropped. Note that we will still parse json-ld blocks deeper than
41 // this, but they won't be passed to App Indexing.
42 constexpr int kMaxDepth = 4;
43 // Some strings are very long, and we don't currently use those, so limit string
44 // length to something reasonable to avoid undue pressure on Icing. Note that
45 // App Indexing supports strings up to length 20k.
46 constexpr int kMaxStringLength = 200;
47 // Enforced by App Indexing, so stop processing early if possible.
48 constexpr size_t kMaxNumFields = 20;
49 // Enforced by App Indexing, so stop processing early if possible.
50 constexpr size_t kMaxRepeatedSize = 100;
51
52 constexpr char kJSONLDKeyType[] = "@type";
53 constexpr char kJSONLDKeyGraph[] = "@graph";
54 bool isWhitelistedType(AtomicString type) {
55 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements,
56 ({// Common types that include addresses.
57 "AutoDealer", "Hotel", "LocalBusiness", "Organization",
58 "Person", "Place", "PostalAddress", "Product",
59 "Residence", "Restaurant", "SingleFamilyResidence",
60 // Common types including phone numbers
61 "Store", "ContactPoint", "LodgingBusiness"}));
62 return type && elements.contains(type);
63 }
64
65 void extractEntity(const JSONObject&, Entity&, int recursionLevel);
66
67 bool parseRepeatedValue(const JSONArray& arr,
68 Values& values,
69 int recursionLevel) {
70 if (arr.size() < 1) {
71 return false;
72 }
73
74 const JSONValue::ValueType type = arr.at(0)->getType();
75 if (type == JSONArray::ValueType::TypeArray) {
76 // App Indexing doesn't support nested arrays.
77 return false;
78 }
79 for (size_t j = 0; j < std::min(arr.size(), kMaxRepeatedSize); ++j) {
80 const JSONValue* innerVal = arr.at(j);
81 if (innerVal->getType() != type) {
82 // App Indexing doesn't support mixed types. If there are mixed
83 // types in the parsed object, we will drop the property.
84 return false;
85 }
86 switch (innerVal->getType()) {
87 case JSONValue::ValueType::TypeBoolean: {
88 if (!values.is_bool_values()) {
89 values.set_bool_values(Vector<bool>());
90 }
91 bool v;
92 innerVal->asBoolean(&v);
93 values.get_bool_values().push_back(v);
94 } break;
95 case JSONValue::ValueType::TypeInteger: {
96 if (!values.is_long_values()) {
97 values.set_long_values(Vector<int64_t>());
98 }
99 int v;
100 innerVal->asInteger(&v);
101 values.get_long_values().push_back(v);
102 } break;
103 case JSONValue::ValueType::TypeDouble: {
104 // App Indexing doesn't support double type, so just encode its decimal
105 // value as a string instead.
106 if (!values.is_string_values()) {
107 values.set_string_values(Vector<String>());
108 }
109 double v;
110 innerVal->asDouble(&v);
111 String s = String::number(v);
112 s.truncate(kMaxStringLength);
113 values.get_string_values().push_back(s);
114 } break;
115 case JSONValue::ValueType::TypeString: {
116 if (!values.is_string_values()) {
117 values.set_string_values(Vector<String>());
118 }
119 String v;
120 innerVal->asString(&v);
121 v.truncate(kMaxStringLength);
122 values.get_string_values().push_back(v);
123 } break;
124 case JSONValue::ValueType::TypeObject:
125 if (recursionLevel + 1 >= kMaxDepth) {
126 return false;
127 }
128 if (!values.is_entity_values()) {
129 values.set_entity_values(Vector<EntityPtr>());
130 }
131 values.get_entity_values().push_back(Entity::New());
132 extractEntity(*(JSONObject::cast(innerVal)),
133 *(values.get_entity_values().at(j)), recursionLevel + 1);
134 break;
135 default:
136 break;
137 }
138 }
139 return true;
140 }
141
142 void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) {
143 if (recursionLevel >= kMaxDepth) {
144 return;
145 }
146
147 String type;
148 val.getString(kJSONLDKeyType, &type);
149 if (!type) {
150 type = "Thing";
151 }
152 entity.type = type;
153 for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
154 PropertyPtr property = Property::New();
155 const JSONObject::Entry& entry = val.at(i);
156 property->name = entry.first;
157 if (property->name == kJSONLDKeyType) {
158 continue;
159 }
160 property->values = Values::New();
161
162 bool addProperty = true;
163
164 switch (entry.second->getType()) {
165 case JSONValue::ValueType::TypeBoolean: {
166 bool v;
167 val.getBoolean(entry.first, &v);
168 property->values->set_bool_values(Vector<bool>(1, v));
169 } break;
170 case JSONValue::ValueType::TypeInteger: {
171 int v;
172 val.getInteger(entry.first, &v);
173 property->values->set_long_values(Vector<int64_t>(1, v));
174 } break;
175 case JSONValue::ValueType::TypeDouble: {
176 double v;
177 val.getDouble(entry.first, &v);
178 String s = String::number(v);
179 s.truncate(kMaxStringLength);
180 property->values->set_string_values(Vector<String>(1, s));
181 } break;
182 case JSONValue::ValueType::TypeString: {
183 String v;
184 val.getString(entry.first, &v);
185 v.truncate(kMaxStringLength);
186 property->values->set_string_values(Vector<String>(1, v));
187 } break;
188 case JSONValue::ValueType::TypeObject: {
189 if (recursionLevel + 1 >= kMaxDepth) {
190 addProperty = false;
191 break;
192 }
193 property->values->set_entity_values(Vector<EntityPtr>());
194 property->values->get_entity_values().push_back(Entity::New());
195
196 extractEntity(*(val.getObject(entry.first)),
197 *(property->values->get_entity_values().at(0)),
198 recursionLevel + 1);
199 } break;
200 case JSONValue::ValueType::TypeArray:
201 addProperty = parseRepeatedValue(*(val.getArray(entry.first)),
202 *(property->values), recursionLevel);
203 break;
204 default:
205 break;
206 }
207 if (addProperty)
208 entity.properties.push_back(std::move(property));
209 }
210 }
211
212 void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
213 // Now we have a JSONObject which corresponds to a single (possibly nested)
214 // entity.
215 EntityPtr entity = Entity::New();
216 String type;
217 val.getString(kJSONLDKeyType, &type);
218 if (!isWhitelistedType(AtomicString(type))) {
219 return;
220 }
221 extractEntity(val, *(entity.get()), 0);
222 entities.push_back(std::move(entity));
223 }
224
225 void extractEntitiesFromArray(const JSONArray& arr,
226 Vector<EntityPtr>& entities) {
227 for (size_t i = 0; i < arr.size(); ++i) {
228 const JSONValue* val = arr.at(i);
229 if (val->getType() == JSONValue::ValueType::TypeObject) {
230 extractTopLevelEntity(*(JSONObject::cast(val)), entities);
231 }
232 }
233 }
234
235 void extractEntityFromTopLevelObject(const JSONObject& val,
236 Vector<EntityPtr>& entities) {
237 const JSONArray* graph = val.getArray(kJSONLDKeyGraph);
238 if (graph) {
239 extractEntitiesFromArray(*graph, entities);
240 }
241 extractTopLevelEntity(val, entities);
242 }
243
244 bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) {
24 for (Element& element : ElementTraversal::descendantsOf(root)) { 245 for (Element& element : ElementTraversal::descendantsOf(root)) {
25 if (element.hasTagName(HTMLNames::scriptTag) && 246 if (element.hasTagName(HTMLNames::scriptTag) &&
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { 247 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
27 if (multiple) { 248 std::unique_ptr<JSONValue> json = parseJSON(element.textContent());
28 result.append(","); 249 if (!json.get()) {
250 LOG(ERROR) << "Failed to parse json.";
251 return false;
29 } 252 }
30 result.append(element.textContent()); 253 switch (json->getType()) {
31 multiple = true; 254 case JSONValue::ValueType::TypeArray:
32 } 255 extractEntitiesFromArray(*(JSONArray::cast(json.get())), entities);
33 } 256 break;
34 result.append("]"); 257 case JSONValue::ValueType::TypeObject:
35 return result.toString(); 258 extractEntityFromTopLevelObject(*(JSONObject::cast(json.get())),
259 entities);
260 break;
261 default:
262 return false;
263 }
264 }
265 }
266 return !entities.isEmpty();
36 } 267 }
37 268
38 } // namespace 269 } // namespace
39 270
40 String CopylessPasteExtractor::extract(Document& document) { 271 bool CopylessPasteExtractor::extract(const Document& document,
272 mojom::blink::WebPage& page) {
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); 273 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
42 274
43 if (!document.frame() || !document.frame()->isMainFrame()) 275 if (!document.frame() || !document.frame()->isMainFrame())
44 return emptyString; 276 return false;
45 277
46 DCHECK(document.hasFinishedParsing()); 278 DCHECK(document.hasFinishedParsing());
wychen 2017/04/05 23:07:48 Oops. This doesn't hold anymore, since it is no lo
dproctor 2017/04/06 00:39:17 Done.
47 279
48 Element* html = document.documentElement(); 280 Element* html = document.documentElement();
49 if (!html) 281 if (!html)
50 return emptyString; 282 return false;
51 283
52 double startTime = monotonicallyIncreasingTime(); 284 double startTime = monotonicallyIncreasingTime();
53 285
54 // Traverse the DOM tree and extract the metadata. 286 // Traverse the DOM tree and extract the metadata.
55 String result = extractMetadata(*html); 287 if (!extractMetadata(*html, page.entities))
288 return false;
289 page.url = document.url();
290 page.title = document.title();
56 291
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; 292 double elapsedTime = monotonicallyIncreasingTime() - startTime;
58 293
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, 294 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); 295 ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); 296 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
62 return result; 297 return true;
63 } 298 }
64 299
65 } // namespace blink 300 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698