Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(231)

Side by Side Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2793103002: Parse JSON in Blink for CopylessPaste. (Closed)
Patch Set: address dcheng comments Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2017 The Chromium Authors. All rights reserved. 1 // Copyright 2017 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "modules/document_metadata/CopylessPasteExtractor.h" 5 #include "modules/document_metadata/CopylessPasteExtractor.h"
6 6
7 #include <algorithm>
8 #include <memory>
9 #include <utility>
10
7 #include "core/HTMLNames.h" 11 #include "core/HTMLNames.h"
8 #include "core/dom/Document.h" 12 #include "core/dom/Document.h"
9 #include "core/dom/ElementTraversal.h" 13 #include "core/dom/ElementTraversal.h"
10 #include "core/frame/LocalFrame.h" 14 #include "core/frame/LocalFrame.h"
11 #include "core/html/HTMLElement.h" 15 #include "core/html/HTMLElement.h"
12 #include "platform/Histogram.h" 16 #include "platform/Histogram.h"
13 #include "platform/instrumentation/tracing/TraceEvent.h" 17 #include "platform/instrumentation/tracing/TraceEvent.h"
18 #include "platform/json/JSONParser.h"
19 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h"
20 #include "wtf/Vector.h"
21 #include "wtf/text/AtomicString.h"
14 #include "wtf/text/StringBuilder.h" 22 #include "wtf/text/StringBuilder.h"
15 23
16 namespace blink { 24 namespace blink {
17 25
18 namespace { 26 namespace {
19 27
20 String ExtractMetadata(Element& root) { 28 using mojom::document_metadata::blink::Entity;
21 StringBuilder result; 29 using mojom::document_metadata::blink::EntityPtr;
22 result.Append("["); 30 using mojom::document_metadata::blink::Property;
23 bool multiple = false; 31 using mojom::document_metadata::blink::PropertyPtr;
32 using mojom::document_metadata::blink::Values;
33 using mojom::document_metadata::blink::ValuesPtr;
34 using mojom::document_metadata::blink::WebPage;
35 using mojom::document_metadata::blink::WebPagePtr;
36
37 // App Indexing enforces a max nesting depth of 5. Our top level message
38 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse
39 // entities up to this depth, and ignore any further nesting. If an object at the
40 // max nesting depth has a property corresponding to an entity, that property
41 // will be dropped. Note that we will still parse json-ld blocks deeper than
42 // this, but they won't be passed to App Indexing.
43 constexpr int kMaxDepth = 4;
44 // Some strings are very long, and we don't currently use those, so limit string
45 // length to something reasonable to avoid undue pressure on Icing. Note that
46 // App Indexing supports strings up to length 20k.
47 constexpr int kMaxStringLength = 200;
48 // Enforced by App Indexing, so stop processing early if possible.
49 constexpr size_t kMaxNumFields = 20;
50 // Enforced by App Indexing, so stop processing early if possible.
51 constexpr size_t kMaxRepeatedSize = 100;
52
53 constexpr char kJSONLDKeyType[] = "@type";
54 constexpr char kJSONLDKeyGraph[] = "@graph";
55 bool isWhitelistedType(AtomicString type) {
56 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements,
57 ({// Common types that include addresses.
58 "AutoDealer", "Hotel", "LocalBusiness", "Organization",
59 "Person", "Place", "PostalAddress", "Product",
60 "Residence", "Restaurant", "SingleFamilyResidence",
61 // Common types including phone numbers
62 "Store", "ContactPoint", "LodgingBusiness"}));
63 return type && elements.Contains(type);
64 }
65
66 void extractEntity(const JSONObject&, Entity&, int recursionLevel);
67
68 bool parseRepeatedValue(const JSONArray& arr,
69 Values& values,
70 int recursionLevel) {
71 if (arr.size() < 1) {
72 return false;
73 }
74
75 const JSONValue::ValueType type = arr.at(0)->GetType();
76 switch (type) {
77 case JSONValue::ValueType::kTypeBoolean:
78 values.set_bool_values(Vector<bool>());
79 break;
80 case JSONValue::ValueType::kTypeInteger:
81 values.set_long_values(Vector<int64_t>());
82 break;
83 case JSONValue::ValueType::kTypeDouble:
84 // App Indexing doesn't support double type, so just encode its decimal
85 // value as a string instead.
86 values.set_string_values(Vector<String>());
87 break;
88 case JSONValue::ValueType::kTypeString:
89 values.set_string_values(Vector<String>());
90 break;
91 case JSONValue::ValueType::kTypeObject:
92 if (recursionLevel + 1 >= kMaxDepth) {
93 return false;
94 }
95 values.set_entity_values(Vector<EntityPtr>());
96 break;
97 case JSONArray::ValueType::kTypeArray:
98 // App Indexing doesn't support nested arrays.
99 return false;
100 default:
101 break;
102 }
103 for (size_t j = 0; j < std::min(arr.size(), kMaxRepeatedSize); ++j) {
104 const JSONValue* innerVal = arr.at(j);
105 if (innerVal->GetType() != type) {
106 // App Indexing doesn't support mixed types. If there are mixed
107 // types in the parsed object, we will drop the property.
108 return false;
109 }
110 switch (innerVal->GetType()) {
111 case JSONValue::ValueType::kTypeBoolean: {
112 bool v;
113 innerVal->AsBoolean(&v);
114 values.get_bool_values().push_back(v);
115 } break;
116 case JSONValue::ValueType::kTypeInteger: {
117 int v;
118 innerVal->AsInteger(&v);
119 values.get_long_values().push_back(v);
120 } break;
121 case JSONValue::ValueType::kTypeDouble: {
122 // App Indexing doesn't support double type, so just encode its decimal
123 // value as a string instead.
124 double v;
125 innerVal->AsDouble(&v);
126 String s = String::Number(v);
127 s.Truncate(kMaxStringLength);
128 values.get_string_values().push_back(s);
129 } break;
130 case JSONValue::ValueType::kTypeString: {
131 String v;
132 innerVal->AsString(&v);
133 v.Truncate(kMaxStringLength);
134 values.get_string_values().push_back(v);
135 } break;
136 case JSONValue::ValueType::kTypeObject:
137 values.get_entity_values().push_back(Entity::New());
138 extractEntity(*(JSONObject::Cast(innerVal)),
139 *(values.get_entity_values().at(j)), recursionLevel + 1);
140 break;
141 default:
142 break;
143 }
144 }
145 return true;
146 }
147
148 void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) {
149 if (recursionLevel >= kMaxDepth) {
150 return;
151 }
152
153 String type;
154 val.GetString(kJSONLDKeyType, &type);
155 if (!type) {
156 type = "Thing";
157 }
158 entity.type = type;
159 for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
160 PropertyPtr property = Property::New();
161 const JSONObject::Entry& entry = val.at(i);
162 property->name = entry.first;
163 if (property->name == kJSONLDKeyType) {
164 continue;
165 }
166 property->values = Values::New();
167
168 bool addProperty = true;
169
170 switch (entry.second->GetType()) {
171 case JSONValue::ValueType::kTypeBoolean: {
172 bool v;
173 val.GetBoolean(entry.first, &v);
174 property->values->set_bool_values({v});
175 } break;
176 case JSONValue::ValueType::kTypeInteger: {
177 int v;
178 val.GetInteger(entry.first, &v);
179 property->values->set_long_values({v});
180 } break;
181 case JSONValue::ValueType::kTypeDouble: {
182 double v;
183 val.GetDouble(entry.first, &v);
184 String s = String::Number(v);
185 s.Truncate(kMaxStringLength);
186 property->values->set_string_values({s});
187 } break;
188 case JSONValue::ValueType::kTypeString: {
189 String v;
190 val.GetString(entry.first, &v);
191 v.Truncate(kMaxStringLength);
192 property->values->set_string_values({v});
193 } break;
194 case JSONValue::ValueType::kTypeObject: {
195 if (recursionLevel + 1 >= kMaxDepth) {
196 addProperty = false;
197 break;
198 }
199 property->values->set_entity_values(Vector<EntityPtr>());
200 property->values->get_entity_values().push_back(Entity::New());
201
202 extractEntity(*(val.GetObject(entry.first)),
203 *(property->values->get_entity_values().at(0)),
204 recursionLevel + 1);
205 } break;
206 case JSONValue::ValueType::kTypeArray:
207 addProperty = parseRepeatedValue(*(val.GetArray(entry.first)),
208 *(property->values), recursionLevel);
209 break;
210 default:
211 break;
212 }
213 if (addProperty)
214 entity.properties.push_back(std::move(property));
215 }
216 }
217
218 void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
219 // Now we have a JSONObject which corresponds to a single (possibly nested)
220 // entity.
221 EntityPtr entity = Entity::New();
222 String type;
223 val.GetString(kJSONLDKeyType, &type);
224 if (!isWhitelistedType(AtomicString(type))) {
225 return;
226 }
227 extractEntity(val, *entity, 0);
228 entities.push_back(std::move(entity));
229 }
230
231 void extractEntitiesFromArray(const JSONArray& arr,
232 Vector<EntityPtr>& entities) {
233 for (size_t i = 0; i < arr.size(); ++i) {
234 const JSONValue* val = arr.at(i);
235 if (val->GetType() == JSONValue::ValueType::kTypeObject) {
236 extractTopLevelEntity(*(JSONObject::Cast(val)), entities);
237 }
238 }
239 }
240
241 void extractEntityFromTopLevelObject(const JSONObject& val,
242 Vector<EntityPtr>& entities) {
243 const JSONArray* graph = val.GetArray(kJSONLDKeyGraph);
244 if (graph) {
245 extractEntitiesFromArray(*graph, entities);
246 }
247 extractTopLevelEntity(val, entities);
248 }
249
250 bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) {
24 for (Element& element : ElementTraversal::DescendantsOf(root)) { 251 for (Element& element : ElementTraversal::DescendantsOf(root)) {
25 if (element.HasTagName(HTMLNames::scriptTag) && 252 if (element.HasTagName(HTMLNames::scriptTag) &&
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { 253 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
27 if (multiple) { 254 std::unique_ptr<JSONValue> json = ParseJSON(element.textContent());
28 result.Append(","); 255 if (!json) {
256 LOG(ERROR) << "Failed to parse json.";
257 return false;
29 } 258 }
30 result.Append(element.textContent()); 259 switch (json->GetType()) {
31 multiple = true; 260 case JSONValue::ValueType::kTypeArray:
32 } 261 extractEntitiesFromArray(*(JSONArray::Cast(json.get())), entities);
33 } 262 break;
34 result.Append("]"); 263 case JSONValue::ValueType::kTypeObject:
35 return result.ToString(); 264 extractEntityFromTopLevelObject(*(JSONObject::Cast(json.get())),
265 entities);
266 break;
267 default:
268 return false;
269 }
270 }
271 }
272 return !entities.IsEmpty();
36 } 273 }
37 274
38 } // namespace 275 } // namespace
39 276
40 String CopylessPasteExtractor::Extract(Document& document) { 277 WebPagePtr CopylessPasteExtractor::extract(const Document& document) {
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); 278 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
42 279
43 if (!document.GetFrame() || !document.GetFrame()->IsMainFrame()) 280 if (!document.GetFrame() || !document.GetFrame()->IsMainFrame())
44 return g_empty_string; 281 return nullptr;
45
46 DCHECK(document.HasFinishedParsing());
47 282
48 Element* html = document.documentElement(); 283 Element* html = document.documentElement();
49 if (!html) 284 if (!html)
50 return g_empty_string; 285 return nullptr;
51 286
52 double start_time = MonotonicallyIncreasingTime(); 287 double start_time = MonotonicallyIncreasingTime();
53 288
289 WebPagePtr page = WebPage::New();
290
54 // Traverse the DOM tree and extract the metadata. 291 // Traverse the DOM tree and extract the metadata.
55 String result = ExtractMetadata(*html); 292 if (!extractMetadata(*html, page->entities))
293 return nullptr;
294 page->url = document.Url();
295 page->title = document.title();
56 296
57 double elapsed_time = MonotonicallyIncreasingTime() - start_time; 297 double elapsed_time = MonotonicallyIncreasingTime() - start_time;
58 298
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extraction_histogram, 299 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); 300 ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
61 extraction_histogram.Count(static_cast<int>(1e6 * elapsed_time)); 301 extractionHistogram.Count(static_cast<int>(1e6 * elapsed_time));
62 return result; 302 return page;
63 } 303 }
64 304
65 } // namespace blink 305 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698