Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(296)

Side by Side Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2793103002: Parse JSON in Blink for CopylessPaste. (Closed)
Patch Set: address wychen comments Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "modules/document_metadata/CopylessPasteExtractor.h" 5 #include "modules/document_metadata/CopylessPasteExtractor.h"
6 6
7 #include <algorithm>
8 #include <memory>
9 #include <utility>
7 #include "core/HTMLNames.h" 10 #include "core/HTMLNames.h"
8 #include "core/dom/Document.h" 11 #include "core/dom/Document.h"
9 #include "core/dom/ElementTraversal.h" 12 #include "core/dom/ElementTraversal.h"
10 #include "core/frame/LocalFrame.h" 13 #include "core/frame/LocalFrame.h"
11 #include "core/html/HTMLElement.h" 14 #include "core/html/HTMLElement.h"
12 #include "platform/Histogram.h" 15 #include "platform/Histogram.h"
13 #include "platform/instrumentation/tracing/TraceEvent.h" 16 #include "platform/instrumentation/tracing/TraceEvent.h"
17 #include "platform/json/JSONParser.h"
18 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h"
19 #include "wtf/Vector.h"
20 #include "wtf/text/AtomicString.h"
14 #include "wtf/text/StringBuilder.h" 21 #include "wtf/text/StringBuilder.h"
15 22
16 namespace blink { 23 namespace blink {
17 24
18 namespace { 25 namespace {
19 26
20 String extractMetadata(Element& root) { 27 using mojom::blink::Entity;
21 StringBuilder result; 28 using mojom::blink::EntityPtr;
22 result.append("["); 29 using mojom::blink::Property;
23 bool multiple = false; 30 using mojom::blink::PropertyPtr;
31 using mojom::blink::Values;
32 using mojom::blink::ValuesPtr;
33 using mojom::blink::WebPage;
34 using mojom::blink::WebPagePtr;
35
36 // App Indexing enforces a max nesting depth of 5. Our top level message
37 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse
38 // entities up to this depth, and ignore any further nesting. If an object at the
39 // max nesting depth has a property corresponding to an entity, that property
40 // will be dropped. Note that we will still parse json-ld blocks deeper than
41 // this, but they won't be passed to App Indexing.
42 constexpr int kMaxDepth = 4;
43 // Some strings are very long, and we don't currently use those, so limit string
44 // length to something reasonable to avoid undue pressure on Icing. Note that
45 // App Indexing supports strings up to length 20k.
46 constexpr int kMaxStringLength = 200;
47 // Enforced by App Indexing, so stop processing early if possible.
48 constexpr size_t kMaxNumFields = 20;
49 // Enforced by App Indexing, so stop processing early if possible.
50 constexpr size_t kMaxRepeatedSize = 100;
51
52 constexpr char kJSONLDKeyType[] = "@type";
53 constexpr char kJSONLDKeyGraph[] = "@graph";
54 bool isWhitelistedType(AtomicString type) {
55 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements,
56 ({// Common types that include addresses.
57 "AutoDealer", "Hotel", "LocalBusiness", "Organization",
58 "Person", "Place", "PostalAddress", "Product",
59 "Residence", "Restaurant", "SingleFamilyResidence",
60 // Common types including phone numbers
61 "Store", "ContactPoint", "LodgingBusiness"}));
62 return type && elements.contains(type);
63 }
64
65 } // namespace
66
67 bool CopylessPasteExtractor::extract(const Document& document,
68 mojom::blink::WebPage& page) {
69 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
70
71 if (!document.frame() || !document.frame()->isMainFrame())
72 return false;
73
74 DCHECK(document.hasFinishedParsing());
75
76 Element* html = document.documentElement();
77 if (!html)
78 return false;
79
80 double startTime = monotonicallyIncreasingTime();
81
82 // Traverse the DOM tree and extract the metadata.
83 if (!extractMetadata(*html, page.entities))
84 return false;
85 page.url = document.url().getString();
86 page.title = document.title();
87
88 double elapsedTime = monotonicallyIncreasingTime() - startTime;
89
90 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
91 ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
92 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
93 return true;
94 }
95
96 bool CopylessPasteExtractor::parseRepeatedValue(JSONArray* arr,
97 Values& values,
98 int recursionLevel) {
99 if (arr->size() < 1) {
100 return false;
101 }
102
103 JSONValue::ValueType type = arr->at(0)->getType();
104 if (type == JSONArray::ValueType::TypeArray) {
105 // App Indexing doesn't support nested arrays.
106 return false;
107 }
108 for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) {
109 JSONValue* innerVal = arr->at(j);
110 if (innerVal->getType() != type) {
111 // App Indexing doesn't support mixed types. If there are mixed
112 // types in the parsed object, we will drop the property.
113 return false;
114 }
115 switch (innerVal->getType()) {
116 case JSONValue::ValueType::TypeBoolean: {
117 if (!values.is_bool_values()) {
118 values.set_bool_values(Vector<bool>());
119 }
120 bool v;
121 innerVal->asBoolean(&v);
122 values.get_bool_values().push_back(v);
123 } break;
124 case JSONValue::ValueType::TypeInteger: {
125 if (!values.is_long_values()) {
126 values.set_long_values(Vector<int64_t>());
127 }
128 int v;
129 innerVal->asInteger(&v);
130 values.get_long_values().push_back(v);
131 } break;
132 case JSONValue::ValueType::TypeDouble: {
133 // App Indexing doesn't support double type, so just encode its decimal
134 // value as a string instead.
135 if (!values.is_string_values()) {
136 values.set_string_values(Vector<String>());
137 }
138 double v;
139 innerVal->asDouble(&v);
140 String s = String::number(v);
141 s.truncate(kMaxStringLength);
142 values.get_string_values().push_back(s);
143 } break;
144 case JSONValue::ValueType::TypeString: {
145 if (!values.is_string_values()) {
146 values.set_string_values(Vector<String>());
147 }
148 String v;
149 innerVal->asString(&v);
150 v.truncate(kMaxStringLength);
151 values.get_string_values().push_back(v);
152 } break;
153 case JSONValue::ValueType::TypeObject:
154 if (recursionLevel + 1 >= kMaxDepth) {
155 return false;
156 }
157 if (!values.is_entity_values()) {
158 values.set_entity_values(Vector<EntityPtr>());
159 }
160 values.get_entity_values().push_back(Entity::New());
161 extractEntity(*(JSONObject::cast(innerVal)),
162 *(values.get_entity_values().at(j)), recursionLevel + 1);
163 break;
164 default:
165 break;
166 }
167 }
168 return true;
169 }
170
171 void CopylessPasteExtractor::extractEntity(const JSONObject& val,
172 Entity& entity,
173 int recursionLevel) {
174 if (recursionLevel >= kMaxDepth) {
175 return;
176 }
177
178 String type;
179 val.getString(kJSONLDKeyType, &type);
180 if (!type) {
181 type = "Thing";
182 }
183 entity.type = type;
184 for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
185 PropertyPtr property = Property::New();
186 const JSONObject::Entry& entry = val.at(i);
187 property->name = entry.first;
188 if (property->name == kJSONLDKeyType) {
189 continue;
190 }
191 property->values = Values::New();
192
193 bool addProperty = true;
194
195 switch (entry.second->getType()) {
196 case JSONValue::ValueType::TypeBoolean: {
197 bool v;
198 val.getBoolean(entry.first, &v);
199 property->values->set_bool_values(Vector<bool>(1, v));
200 } break;
201 case JSONValue::ValueType::TypeInteger: {
202 int v;
203 val.getInteger(entry.first, &v);
204 property->values->set_long_values(Vector<int64_t>(1, v));
205 } break;
206 case JSONValue::ValueType::TypeDouble: {
207 double v;
208 val.getDouble(entry.first, &v);
209 String s = String::number(v);
210 s.truncate(kMaxStringLength);
211 property->values->set_string_values(Vector<String>(1, s));
212 } break;
213 case JSONValue::ValueType::TypeString: {
214 String v;
215 val.getString(entry.first, &v);
216 v.truncate(kMaxStringLength);
217 property->values->set_string_values(Vector<String>(1, v));
218 } break;
219 case JSONValue::ValueType::TypeObject: {
220 if (recursionLevel + 1 >= kMaxDepth) {
221 addProperty = false;
222 break;
223 }
224 property->values->set_entity_values(Vector<EntityPtr>());
225 property->values->get_entity_values().push_back(Entity::New());
226
227 extractEntity(*(val.getObject(entry.first)),
228 *(property->values->get_entity_values().at(0)),
229 recursionLevel + 1);
230 } break;
231 case JSONValue::ValueType::TypeArray:
232 addProperty = parseRepeatedValue(val.getArray(entry.first),
233 *(property->values), recursionLevel);
234 break;
235 default:
236 break;
237 }
238 if (addProperty)
239 entity.properties.push_back(std::move(property));
240 }
241 }
242
243 void CopylessPasteExtractor::extractTopLevelEntity(
244 const JSONObject& val,
245 Vector<EntityPtr>& entities) {
246 // Now we have a JSONObject which corresponds to a single (possibly nested)
247 // entity.
248 EntityPtr entity = Entity::New();
249 String type;
250 val.getString(kJSONLDKeyType, &type);
251 if (!isWhitelistedType(AtomicString(type))) {
252 return;
253 }
254 extractEntity(val, *(entity.get()), 0);
255 entities.push_back(std::move(entity));
256 }
257
258 bool CopylessPasteExtractor::extractMetadata(const Element& root,
259 Vector<EntityPtr>& entities) {
24 for (Element& element : ElementTraversal::descendantsOf(root)) { 260 for (Element& element : ElementTraversal::descendantsOf(root)) {
25 if (element.hasTagName(HTMLNames::scriptTag) && 261 if (element.hasTagName(HTMLNames::scriptTag) &&
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { 262 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
27 if (multiple) { 263 std::unique_ptr<JSONValue> json = parseJSON(element.textContent());
28 result.append(","); 264 if (!json.get()) {
265 LOG(ERROR) << "Failed to parse json.";
266 return false;
29 } 267 }
30 result.append(element.textContent()); 268 switch (json->getType()) {
31 multiple = true; 269 case JSONValue::ValueType::TypeArray:
32 } 270 extractEntitiesFromArray(*(JSONArray::cast(json.get())), entities);
33 } 271 break;
34 result.append("]"); 272 case JSONValue::ValueType::TypeObject:
35 return result.toString(); 273 extractEntityFromTopLevelObject(*(JSONObject::cast(json.get())),
36 } 274 entities);
37 275 break;
38 } // namespace 276 default:
39 277 return false;
40 String CopylessPasteExtractor::extract(Document& document) { 278 }
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); 279 }
42 280 }
43 if (!document.frame() || !document.frame()->isMainFrame()) 281 return !entities.isEmpty();
44 return emptyString; 282 }
45 283
46 DCHECK(document.hasFinishedParsing()); 284 void CopylessPasteExtractor::extractEntityFromTopLevelObject(
47 285 const JSONObject& val,
48 Element* html = document.documentElement(); 286 Vector<EntityPtr>& entities) {
49 if (!html) 287 JSONArray* graph = val.getArray(kJSONLDKeyGraph);
50 return emptyString; 288 if (graph) {
51 289 extractEntitiesFromArray(*graph, entities);
52 double startTime = monotonicallyIncreasingTime(); 290 }
53 291 extractTopLevelEntity(val, entities);
54 // Traverse the DOM tree and extract the metadata. 292 }
55 String result = extractMetadata(*html); 293
56 294 void CopylessPasteExtractor::extractEntitiesFromArray(
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; 295 JSONArray& arr,
58 296 Vector<EntityPtr>& entities) {
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, 297 for (size_t i = 0; i < arr.size(); ++i) {
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); 298 JSONValue* val = arr.at(i);
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); 299 if (val->getType() == JSONValue::ValueType::TypeObject) {
62 return result; 300 extractTopLevelEntity(*(JSONObject::cast(val)), entities);
301 }
302 }
63 } 303 }
64 304
65 } // namespace blink 305 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698