OLD | NEW |
---|---|
1 // Copyright 2017 The Chromium Authors. All rights reserved. | 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "modules/document_metadata/CopylessPasteExtractor.h" | 5 #include "modules/document_metadata/CopylessPasteExtractor.h" |
6 | 6 |
7 #include <algorithm> | |
8 #include <memory> | |
9 #include <utility> | |
7 #include "core/HTMLNames.h" | 10 #include "core/HTMLNames.h" |
8 #include "core/dom/Document.h" | 11 #include "core/dom/Document.h" |
9 #include "core/dom/ElementTraversal.h" | 12 #include "core/dom/ElementTraversal.h" |
10 #include "core/frame/LocalFrame.h" | 13 #include "core/frame/LocalFrame.h" |
11 #include "core/html/HTMLElement.h" | 14 #include "core/html/HTMLElement.h" |
12 #include "platform/Histogram.h" | 15 #include "platform/Histogram.h" |
13 #include "platform/instrumentation/tracing/TraceEvent.h" | 16 #include "platform/instrumentation/tracing/TraceEvent.h" |
17 #include "platform/json/JSONParser.h" | |
18 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h " | |
19 #include "wtf/Vector.h" | |
14 #include "wtf/text/StringBuilder.h" | 20 #include "wtf/text/StringBuilder.h" |
15 | 21 |
16 namespace blink { | 22 namespace blink { |
17 | 23 |
18 namespace { | 24 namespace { |
19 | 25 |
20 String extractMetadata(Element& root) { | 26 using mojom::blink::Entity; |
21 StringBuilder result; | 27 using mojom::blink::EntityPtr; |
22 result.append("["); | 28 using mojom::blink::Property; |
23 bool multiple = false; | 29 using mojom::blink::PropertyPtr; |
30 using mojom::blink::Values; | |
31 using mojom::blink::ValuesPtr; | |
32 using mojom::blink::WebPage; | |
33 using mojom::blink::WebPagePtr; | |
34 | |
// Nesting limit for parsed JSON. App Indexing enforces a max nesting depth of
// 5; the top level message corresponds to the WebPage, which leaves 4 more
// levels.
// TODO(dproctor): Do we want to fail parsing, or (more likely) only pass the
// top levels to Icing?
constexpr int kMaxDepth = 4;

// Upper bound on the length of any extracted string. Some strings are very
// long and are not currently used, so they are cut to something reasonable to
// avoid undue pressure on Icing. App Indexing itself supports strings up to
// length 20k.
constexpr int kMaxStringLength = 200;

// Maximum number of fields read from one object. Enforced by App Indexing, so
// processing stops early when possible.
constexpr size_t kMaxNumFields = 20;

// Maximum number of elements read from one array. Enforced by App Indexing, so
// processing stops early when possible.
constexpr size_t kMaxRepeatedSize = 100;

// JSON-LD keys that get special treatment during extraction.
constexpr char kJSONLDKeyType[] = "@type";
constexpr char kJSONLDKeyGraph[] = "@graph";
51 | |
52 void extractEntity(JSONObject* val, EntityPtr* entity) { | |
wychen
2017/04/04 02:07:14
Sorry for the confusion. It might be better to pas
dproctor
2017/04/04 06:21:19
Done.
| |
53 String type; | |
54 val->getString(kJSONLDKeyType, &type); | |
55 if (!type) { | |
56 type = "Thing"; | |
57 } | |
58 (*entity)->type = type; | |
59 for (size_t i = 0; i < std::min(val->size(), kMaxNumFields); ++i) { | |
60 PropertyPtr property = Property::New(); | |
61 JSONObject::Entry entry = val->at(i); | |
62 property->name = entry.first; | |
63 if (property->name == kJSONLDKeyType) { | |
64 continue; | |
65 } | |
66 property->values = Values::New(); | |
67 JSONValue::ValueType type = entry.second->getType(); | |
68 | |
69 bool addProperty = true; | |
70 | |
71 switch (type) { | |
72 case JSONValue::ValueType::TypeBoolean: { | |
73 bool v; | |
74 val->getBoolean(entry.first, &v); | |
75 property->values->get_bool_values().push_back(v); | |
wychen
2017/04/04 02:07:14
Why is this different from the rest?
dproctor
2017/04/04 06:21:19
Done.
| |
76 } break; | |
77 case JSONValue::ValueType::TypeInteger: { | |
78 int v; | |
79 val->getInteger(entry.first, &v); | |
80 property->values->set_long_values(Vector<int64_t>(1, v)); | |
81 } break; | |
82 case JSONValue::ValueType::TypeDouble: { | |
83 double v; | |
84 val->getDouble(entry.first, &v); | |
85 String s = String::number(v); | |
86 s.truncate(kMaxStringLength); | |
87 property->values->set_string_values(Vector<String>(1, s)); | |
88 } break; | |
89 case JSONValue::ValueType::TypeString: { | |
90 String v; | |
91 val->getString(entry.first, &v); | |
92 v.truncate(kMaxStringLength); | |
93 property->values->set_string_values(Vector<String>(1, v)); | |
94 } break; | |
95 case JSONValue::ValueType::TypeObject: { | |
96 property->values->set_entity_values(Vector<EntityPtr>()); | |
97 property->values->get_entity_values().push_back(Entity::New()); | |
98 | |
99 extractEntity(val->getObject(entry.first), | |
100 &(property->values->get_entity_values().at(0))); | |
101 } break; | |
102 case JSONValue::ValueType::TypeArray: { | |
103 JSONArray* arr = val->getArray(entry.first); | |
104 if (arr->size() < 1) { | |
105 addProperty = false; | |
106 break; | |
107 } | |
108 | |
109 type = arr->at(0)->getType(); | |
110 if (type == JSONArray::ValueType::TypeArray) { | |
111 // App Indexing doesn't support nested arrays. | |
112 addProperty = false; | |
113 break; | |
114 } | |
115 for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) { | |
116 JSONValue* innerVal = arr->at(j); | |
117 if (innerVal->getType() != type) { | |
118 addProperty = false; | |
wychen
2017/04/04 02:07:14
Add a comment about mixed types.
dproctor
2017/04/04 06:21:19
Done.
| |
119 break; | |
120 } | |
121 switch (innerVal->getType()) { | |
122 case JSONValue::ValueType::TypeBoolean: { | |
123 if (!property->values->is_bool_values()) { | |
124 property->values->set_bool_values(Vector<bool>()); | |
125 } | |
126 bool v; | |
127 innerVal->asBoolean(&v); | |
128 property->values->get_bool_values().push_back(v); | |
129 } break; | |
130 case JSONValue::ValueType::TypeInteger: { | |
131 if (!property->values->is_long_values()) { | |
132 property->values->set_long_values(Vector<int64_t>()); | |
133 } | |
134 int v; | |
135 innerVal->asInteger(&v); | |
136 property->values->get_long_values().push_back(v); | |
137 } break; | |
138 case JSONValue::ValueType::TypeDouble: { | |
139 if (!property->values->is_string_values()) { | |
140 property->values->set_string_values(Vector<String>()); | |
141 } | |
142 double v; | |
143 val->getDouble(entry.first, &v); | |
144 String s = String::number(v); | |
145 s.truncate(kMaxStringLength); | |
146 property->values->get_string_values().push_back(s); | |
147 } break; | |
148 case JSONValue::ValueType::TypeString: { | |
149 if (!property->values->is_string_values()) { | |
150 property->values->set_string_values(Vector<String>()); | |
151 } | |
152 String v; | |
153 innerVal->asString(&v); | |
154 v.truncate(kMaxStringLength); | |
155 property->values->get_string_values().push_back(v); | |
156 } break; | |
157 case JSONValue::ValueType::TypeObject: | |
158 if (!property->values->is_entity_values()) { | |
159 property->values->set_entity_values(Vector<EntityPtr>()); | |
160 } | |
161 property->values->get_entity_values().push_back(Entity::New()); | |
162 extractEntity(JSONObject::cast(innerVal), | |
163 &(property->values->get_entity_values().at(j))); | |
164 break; | |
165 default: | |
166 break; | |
167 } | |
168 } | |
169 } break; | |
170 default: | |
171 break; | |
172 } | |
173 if (addProperty) | |
174 (*entity)->properties.push_back(std::move(property)); | |
175 } | |
176 } | |
177 | |
178 bool isWhitelistedType(String type) { | |
wychen
2017/04/04 02:07:15
Would AtomicString be a premature optimization?
dproctor
2017/04/04 06:21:19
Done.
| |
179 DEFINE_STATIC_LOCAL(HashSet<String>, elements, | |
180 ({// Common types that include addresses. | |
181 "AutoDealer", "Hotel", "LocalBusiness", "Organization", | |
182 "Person", "Place", "PostalAddress", "Product", | |
183 "Residence", "Restaurant", "SingleFamilyResidence", | |
184 // Common types including phone numbers | |
185 "Store", "ContactPoint", "LodgingBusiness"})); | |
186 return type && elements.contains(type); | |
187 } | |
188 | |
189 void extractTopLevelEntity(JSONObject* val, Vector<EntityPtr>* entities) { | |
190 // Now we have a JSONObject which corresponds to a single (possibly nested) | |
191 // entity. | |
192 EntityPtr entity = Entity::New(); | |
193 String type; | |
194 val->getString(kJSONLDKeyType, &type); | |
195 if (!isWhitelistedType(type)) { | |
196 return; | |
197 } | |
198 extractEntity(val, &entity); | |
199 entities->push_back(std::move(entity)); | |
200 } | |
201 | |
202 void extractEntitiesFromArray(JSONArray* arr, Vector<EntityPtr>* entities) { | |
203 for (size_t i = 0; i < arr->size(); ++i) { | |
204 JSONValue* val = arr->at(i); | |
205 switch (val->getType()) { | |
206 case JSONValue::ValueType::TypeObject: | |
207 extractTopLevelEntity(JSONObject::cast(val), entities); | |
208 break; | |
209 default: | |
210 // TODO(dproctor): :( | |
211 return; | |
212 } | |
213 } | |
214 } | |
215 | |
216 void extractEntityFromTopLevelObject(JSONObject* val, | |
217 Vector<EntityPtr>* entities) { | |
218 JSONArray* graph = val->getArray(kJSONLDKeyGraph); | |
219 if (graph) { | |
220 extractEntitiesFromArray(graph, entities); | |
221 } | |
222 extractTopLevelEntity(val, entities); | |
223 } | |
224 | |
225 bool extractMetadata(const Element& root, Vector<EntityPtr>* entities) { | |
wychen
2017/04/04 02:07:15
Similarly, Vector<>& might be better.
dproctor
2017/04/04 06:21:19
Done.
| |
24 for (Element& element : ElementTraversal::descendantsOf(root)) { | 226 for (Element& element : ElementTraversal::descendantsOf(root)) { |
25 if (element.hasTagName(HTMLNames::scriptTag) && | 227 if (element.hasTagName(HTMLNames::scriptTag) && |
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { | 228 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
27 if (multiple) { | 229 std::unique_ptr<JSONValue> json = |
28 result.append(","); | 230 parseJSON(element.textContent(), kMaxDepth); |
231 if (!json.get()) { | |
232 LOG(ERROR) << "Failed to parse json."; | |
wychen
2017/04/04 02:07:14
Needs to be more specific.
dproctor
2017/04/04 06:21:19
What did you have in mind? I don't believe that th
| |
233 return false; | |
29 } | 234 } |
30 result.append(element.textContent()); | 235 switch (json->getType()) { |
31 multiple = true; | 236 case JSONValue::ValueType::TypeArray: |
32 } | 237 extractEntitiesFromArray(JSONArray::cast(json.get()), entities); |
33 } | 238 break; |
34 result.append("]"); | 239 case JSONValue::ValueType::TypeObject: |
35 return result.toString(); | 240 extractEntityFromTopLevelObject(JSONObject::cast(json.get()), |
241 entities); | |
242 break; | |
243 default: | |
244 return false; | |
245 } | |
246 } | |
247 } | |
248 return !entities->isEmpty(); | |
36 } | 249 } |
37 | 250 |
38 } // namespace | 251 } // namespace |
39 | 252 |
40 String CopylessPasteExtractor::extract(Document& document) { | 253 bool CopylessPasteExtractor::extract(const Document& document, |
254 mojom::blink::WebPagePtr* page) { | |
wychen
2017/04/04 02:07:14
Similarly, using WebPage& here might be better.
dproctor
2017/04/04 06:21:19
Done.
| |
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); | 255 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
42 | 256 |
43 if (!document.frame() || !document.frame()->isMainFrame()) | 257 if (!document.frame() || !document.frame()->isMainFrame()) |
44 return emptyString; | 258 return false; |
45 | 259 |
46 DCHECK(document.hasFinishedParsing()); | 260 DCHECK(document.hasFinishedParsing()); |
47 | 261 |
48 Element* html = document.documentElement(); | 262 Element* html = document.documentElement(); |
49 if (!html) | 263 if (!html) |
50 return emptyString; | 264 return false; |
51 | 265 |
52 double startTime = monotonicallyIncreasingTime(); | 266 double startTime = monotonicallyIncreasingTime(); |
53 | 267 |
54 // Traverse the DOM tree and extract the metadata. | 268 // Traverse the DOM tree and extract the metadata. |
55 String result = extractMetadata(*html); | 269 if (!extractMetadata(*html, &(page->get()->entities))) |
270 return false; | |
271 page->get()->url = document.url().getString(); | |
272 page->get()->title = document.title(); | |
56 | 273 |
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; | 274 double elapsedTime = monotonicallyIncreasingTime() - startTime; |
58 | 275 |
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, | 276 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); | 277 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); |
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); | 278 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); |
62 return result; | 279 return true; |
63 } | 280 } |
64 | 281 |
65 } // namespace blink | 282 } // namespace blink |
OLD | NEW |