OLD | NEW |
---|---|
1 // Copyright 2017 The Chromium Authors. All rights reserved. | 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "modules/document_metadata/CopylessPasteExtractor.h" | 5 #include "modules/document_metadata/CopylessPasteExtractor.h" |
6 | 6 |
7 #include <algorithm> | |
8 #include <memory> | |
9 #include <utility> | |
7 #include "core/HTMLNames.h" | 10 #include "core/HTMLNames.h" |
8 #include "core/dom/Document.h" | 11 #include "core/dom/Document.h" |
9 #include "core/dom/ElementTraversal.h" | 12 #include "core/dom/ElementTraversal.h" |
10 #include "core/frame/LocalFrame.h" | 13 #include "core/frame/LocalFrame.h" |
11 #include "core/html/HTMLElement.h" | 14 #include "core/html/HTMLElement.h" |
12 #include "platform/Histogram.h" | 15 #include "platform/Histogram.h" |
13 #include "platform/instrumentation/tracing/TraceEvent.h" | 16 #include "platform/instrumentation/tracing/TraceEvent.h" |
17 #include "platform/json/JSONParser.h" | |
18 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h" | |
19 #include "wtf/Vector.h" | |
20 #include "wtf/text/AtomicString.h" | |
14 #include "wtf/text/StringBuilder.h" | 21 #include "wtf/text/StringBuilder.h" |
15 | 22 |
16 namespace blink { | 23 namespace blink { |
17 | 24 |
18 namespace { | 25 namespace { |
19 | 26 |
20 String extractMetadata(Element& root) { | 27 using mojom::blink::Entity; |
21 StringBuilder result; | 28 using mojom::blink::EntityPtr; |
22 result.append("["); | 29 using mojom::blink::Property; |
23 bool multiple = false; | 30 using mojom::blink::PropertyPtr; |
31 using mojom::blink::Values; | |
32 using mojom::blink::ValuesPtr; | |
33 using mojom::blink::WebPage; | |
34 using mojom::blink::WebPagePtr; | |
35 | |
36 // App Indexing enforces a max nesting depth of 5. Our top-level message | |
37 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse | |
38 // entities up to this depth, and ignore any further nesting. If an object at the | |
39 // max nesting depth has a property corresponding to an entity, that property | |
40 // will be dropped. Note that we will still parse JSON-LD blocks deeper than | |
41 // this, but they won't be passed to App Indexing. | |
42 constexpr int kMaxDepth = 4; | |
43 // Some strings are very long, and we don't currently use those, so limit string | |
44 // length to something reasonable to avoid undue pressure on Icing. Note that | |
45 // App Indexing supports strings up to length 20k. | |
46 constexpr int kMaxStringLength = 200; | |
47 // Enforced by App Indexing, so stop processing early if possible. | |
48 constexpr size_t kMaxNumFields = 20; | |
49 // Enforced by App Indexing, so stop processing early if possible. | |
50 constexpr size_t kMaxRepeatedSize = 100; | |
51 | |
52 constexpr char kJSONLDKeyType[] = "@type"; | |
53 constexpr char kJSONLDKeyGraph[] = "@graph"; | |
54 bool isWhitelistedType(AtomicString type) { | |
55 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements, | |
56 ({// Common types that include addresses. | |
57 "AutoDealer", "Hotel", "LocalBusiness", "Organization", | |
58 "Person", "Place", "PostalAddress", "Product", | |
59 "Residence", "Restaurant", "SingleFamilyResidence", | |
60 // Common types including phone numbers | |
61 "Store", "ContactPoint", "LodgingBusiness"})); | |
62 return type && elements.contains(type); | |
63 } | |
64 | |
65 void extractEntity(const JSONObject&, Entity&, int recursionLevel); | |
66 | |
67 bool parseRepeatedValue(const JSONArray& arr, | |
68 Values& values, | |
69 int recursionLevel) { | |
70 if (arr.size() < 1) { | |
71 return false; | |
72 } | |
73 | |
74 const JSONValue::ValueType type = arr.at(0)->getType(); | |
75 if (type == JSONArray::ValueType::TypeArray) { | |
76 // App Indexing doesn't support nested arrays. | |
77 return false; | |
78 } | |
79 for (size_t j = 0; j < std::min(arr.size(), kMaxRepeatedSize); ++j) { | |
80 const JSONValue* innerVal = arr.at(j); | |
81 if (innerVal->getType() != type) { | |
82 // App Indexing doesn't support mixed types. If there are mixed | |
83 // types in the parsed object, we will drop the property. | |
84 return false; | |
85 } | |
86 switch (innerVal->getType()) { | |
87 case JSONValue::ValueType::TypeBoolean: { | |
88 if (!values.is_bool_values()) { | |
89 values.set_bool_values(Vector<bool>()); | |
90 } | |
91 bool v; | |
92 innerVal->asBoolean(&v); | |
93 values.get_bool_values().push_back(v); | |
94 } break; | |
95 case JSONValue::ValueType::TypeInteger: { | |
96 if (!values.is_long_values()) { | |
97 values.set_long_values(Vector<int64_t>()); | |
98 } | |
99 int v; | |
100 innerVal->asInteger(&v); | |
101 values.get_long_values().push_back(v); | |
102 } break; | |
103 case JSONValue::ValueType::TypeDouble: { | |
104 // App Indexing doesn't support double type, so just encode its decimal | |
105 // value as a string instead. | |
106 if (!values.is_string_values()) { | |
107 values.set_string_values(Vector<String>()); | |
108 } | |
109 double v; | |
110 innerVal->asDouble(&v); | |
111 String s = String::number(v); | |
112 s.truncate(kMaxStringLength); | |
113 values.get_string_values().push_back(s); | |
114 } break; | |
115 case JSONValue::ValueType::TypeString: { | |
116 if (!values.is_string_values()) { | |
117 values.set_string_values(Vector<String>()); | |
118 } | |
119 String v; | |
120 innerVal->asString(&v); | |
121 v.truncate(kMaxStringLength); | |
122 values.get_string_values().push_back(v); | |
123 } break; | |
124 case JSONValue::ValueType::TypeObject: | |
125 if (recursionLevel + 1 >= kMaxDepth) { | |
126 return false; | |
127 } | |
128 if (!values.is_entity_values()) { | |
129 values.set_entity_values(Vector<EntityPtr>()); | |
130 } | |
131 values.get_entity_values().push_back(Entity::New()); | |
132 extractEntity(*(JSONObject::cast(innerVal)), | |
133 *(values.get_entity_values().at(j)), recursionLevel + 1); | |
134 break; | |
135 default: | |
136 break; | |
137 } | |
138 } | |
139 return true; | |
140 } | |
141 | |
142 void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) { | |
143 if (recursionLevel >= kMaxDepth) { | |
144 return; | |
145 } | |
146 | |
147 String type; | |
148 val.getString(kJSONLDKeyType, &type); | |
149 if (!type) { | |
150 type = "Thing"; | |
151 } | |
152 entity.type = type; | |
153 for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) { | |
154 PropertyPtr property = Property::New(); | |
155 const JSONObject::Entry& entry = val.at(i); | |
156 property->name = entry.first; | |
157 if (property->name == kJSONLDKeyType) { | |
158 continue; | |
159 } | |
160 property->values = Values::New(); | |
161 | |
162 bool addProperty = true; | |
163 | |
164 switch (entry.second->getType()) { | |
165 case JSONValue::ValueType::TypeBoolean: { | |
166 bool v; | |
167 val.getBoolean(entry.first, &v); | |
168 property->values->set_bool_values(Vector<bool>(1, v)); | |
169 } break; | |
170 case JSONValue::ValueType::TypeInteger: { | |
171 int v; | |
172 val.getInteger(entry.first, &v); | |
173 property->values->set_long_values(Vector<int64_t>(1, v)); | |
174 } break; | |
175 case JSONValue::ValueType::TypeDouble: { | |
176 double v; | |
177 val.getDouble(entry.first, &v); | |
178 String s = String::number(v); | |
179 s.truncate(kMaxStringLength); | |
180 property->values->set_string_values(Vector<String>(1, s)); | |
181 } break; | |
182 case JSONValue::ValueType::TypeString: { | |
183 String v; | |
184 val.getString(entry.first, &v); | |
185 v.truncate(kMaxStringLength); | |
186 property->values->set_string_values(Vector<String>(1, v)); | |
187 } break; | |
188 case JSONValue::ValueType::TypeObject: { | |
189 if (recursionLevel + 1 >= kMaxDepth) { | |
190 addProperty = false; | |
191 break; | |
192 } | |
193 property->values->set_entity_values(Vector<EntityPtr>()); | |
194 property->values->get_entity_values().push_back(Entity::New()); | |
195 | |
196 extractEntity(*(val.getObject(entry.first)), | |
197 *(property->values->get_entity_values().at(0)), | |
198 recursionLevel + 1); | |
199 } break; | |
200 case JSONValue::ValueType::TypeArray: | |
201 addProperty = parseRepeatedValue(*(val.getArray(entry.first)), | |
202 *(property->values), recursionLevel); | |
203 break; | |
204 default: | |
205 break; | |
206 } | |
207 if (addProperty) | |
208 entity.properties.push_back(std::move(property)); | |
209 } | |
210 } | |
211 | |
212 void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) { | |
213 // Now we have a JSONObject which corresponds to a single (possibly nested) | |
214 // entity. | |
215 EntityPtr entity = Entity::New(); | |
216 String type; | |
217 val.getString(kJSONLDKeyType, &type); | |
218 if (!isWhitelistedType(AtomicString(type))) { | |
219 return; | |
220 } | |
221 extractEntity(val, *(entity.get()), 0); | |
222 entities.push_back(std::move(entity)); | |
223 } | |
224 | |
225 void extractEntitiesFromArray(const JSONArray& arr, | |
226 Vector<EntityPtr>& entities) { | |
227 for (size_t i = 0; i < arr.size(); ++i) { | |
228 const JSONValue* val = arr.at(i); | |
229 if (val->getType() == JSONValue::ValueType::TypeObject) { | |
230 extractTopLevelEntity(*(JSONObject::cast(val)), entities); | |
231 } | |
232 } | |
233 } | |
234 | |
235 void extractEntityFromTopLevelObject(const JSONObject& val, | |
236 Vector<EntityPtr>& entities) { | |
237 const JSONArray* graph = val.getArray(kJSONLDKeyGraph); | |
238 if (graph) { | |
239 extractEntitiesFromArray(*graph, entities); | |
240 } | |
241 extractTopLevelEntity(val, entities); | |
242 } | |
243 | |
244 bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) { | |
24 for (Element& element : ElementTraversal::descendantsOf(root)) { | 245 for (Element& element : ElementTraversal::descendantsOf(root)) { |
25 if (element.hasTagName(HTMLNames::scriptTag) && | 246 if (element.hasTagName(HTMLNames::scriptTag) && |
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { | 247 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
27 if (multiple) { | 248 std::unique_ptr<JSONValue> json = parseJSON(element.textContent()); |
28 result.append(","); | 249 if (!json.get()) { |
250 LOG(ERROR) << "Failed to parse json."; | |
251 return false; | |
29 } | 252 } |
30 result.append(element.textContent()); | 253 switch (json->getType()) { |
31 multiple = true; | 254 case JSONValue::ValueType::TypeArray: |
32 } | 255 extractEntitiesFromArray(*(JSONArray::cast(json.get())), entities); |
33 } | 256 break; |
34 result.append("]"); | 257 case JSONValue::ValueType::TypeObject: |
35 return result.toString(); | 258 extractEntityFromTopLevelObject(*(JSONObject::cast(json.get())), |
259 entities); | |
260 break; | |
261 default: | |
262 return false; | |
263 } | |
264 } | |
265 } | |
266 return !entities.isEmpty(); | |
36 } | 267 } |
37 | 268 |
38 } // namespace | 269 } // namespace |
39 | 270 |
40 String CopylessPasteExtractor::extract(Document& document) { | 271 bool CopylessPasteExtractor::extract(const Document& document, |
272 mojom::blink::WebPage& page) { | |
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); | 273 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
42 | 274 |
43 if (!document.frame() || !document.frame()->isMainFrame()) | 275 if (!document.frame() || !document.frame()->isMainFrame()) |
44 return emptyString; | 276 return false; |
45 | 277 |
46 DCHECK(document.hasFinishedParsing()); | 278 DCHECK(document.hasFinishedParsing()); |
wychen
2017/04/05 23:07:48
Oops. This doesn't hold anymore, since it is no lo
dproctor
2017/04/06 00:39:17
Done.
| |
47 | 279 |
48 Element* html = document.documentElement(); | 280 Element* html = document.documentElement(); |
49 if (!html) | 281 if (!html) |
50 return emptyString; | 282 return false; |
51 | 283 |
52 double startTime = monotonicallyIncreasingTime(); | 284 double startTime = monotonicallyIncreasingTime(); |
53 | 285 |
54 // Traverse the DOM tree and extract the metadata. | 286 // Traverse the DOM tree and extract the metadata. |
55 String result = extractMetadata(*html); | 287 if (!extractMetadata(*html, page.entities)) |
288 return false; | |
289 page.url = document.url(); | |
290 page.title = document.title(); | |
56 | 291 |
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; | 292 double elapsedTime = monotonicallyIncreasingTime() - startTime; |
58 | 293 |
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, | 294 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); | 295 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); |
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); | 296 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); |
62 return result; | 297 return true; |
63 } | 298 } |
64 | 299 |
65 } // namespace blink | 300 } // namespace blink |
OLD | NEW |