OLD | NEW |
1 // Copyright 2017 The Chromium Authors. All rights reserved. | 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "modules/document_metadata/CopylessPasteExtractor.h" | 5 #include "modules/document_metadata/CopylessPasteExtractor.h" |
6 | 6 |
| 7 #include <memory> |
| 8 #include <algorithm> |
7 #include "core/HTMLNames.h" | 9 #include "core/HTMLNames.h" |
8 #include "core/dom/Document.h" | 10 #include "core/dom/Document.h" |
9 #include "core/dom/ElementTraversal.h" | 11 #include "core/dom/ElementTraversal.h" |
10 #include "core/frame/LocalFrame.h" | 12 #include "core/frame/LocalFrame.h" |
11 #include "core/html/HTMLElement.h" | 13 #include "core/html/HTMLElement.h" |
12 #include "platform/Histogram.h" | 14 #include "platform/Histogram.h" |
13 #include "platform/instrumentation/tracing/TraceEvent.h" | 15 #include "platform/instrumentation/tracing/TraceEvent.h" |
| 16 #include "platform/json/JSONParser.h" |
| 17 #include "wtf/Vector.h" |
14 #include "wtf/text/StringBuilder.h" | 18 #include "wtf/text/StringBuilder.h" |
15 | 19 |
16 namespace blink { | 20 namespace blink { |
17 | 21 |
| 22 // TODO(dproctor): Temporary structs to hold entity return value. Replace with |
| 23 // whatever the mojo service uses, once that is ready. |
| 24 struct Property; |
| 25 |
| 26 struct Entity { |
| 27 Vector<Property> properties; |
| 28 }; |
| 29 bool operator==(const Entity&, const Entity&); |
| 30 std::ostream& operator<<(std::ostream&, const Entity&); |
| 31 |
| 32 struct Property { |
| 33 String name; |
| 34 JSONValue::ValueType type; |
| 35 |
| 36 Vector<bool> boolVal; |
| 37 Vector<int> intVal; |
| 38 Vector<double> doubleVal; |
| 39 Vector<String> strVal; |
| 40 Vector<Entity> entityVal; |
| 41 }; |
| 42 bool operator==(const Property&, const Property&); |
| 43 std::ostream& operator<<(std::ostream&, const Property&); |
| 44 |
| 45 struct WebPage { |
| 46 String url; |
| 47 String title; |
| 48 Vector<Entity> entities; |
| 49 }; |
| 50 bool operator==(const WebPage&, const WebPage&); |
| 51 std::ostream& operator<<(std::ostream&, const WebPage&); |
| 52 |
| 53 bool operator==(const Entity& lhs, const Entity& rhs) { |
| 54 return lhs.properties == rhs.properties; |
| 55 } |
| 56 bool operator==(const Property& lhs, const Property& rhs) { |
| 57 if (!(lhs.name == rhs.name && lhs.type == rhs.type)) { |
| 58 return false; |
| 59 } |
| 60 switch (lhs.type) { |
| 61 case JSONValue::ValueType::TypeBoolean: |
| 62 return lhs.boolVal == rhs.boolVal; |
| 63 case JSONValue::ValueType::TypeInteger: |
| 64 return lhs.intVal == rhs.intVal; |
| 65 case JSONValue::ValueType::TypeDouble: |
| 66 return lhs.doubleVal == rhs.doubleVal; |
| 67 case JSONValue::ValueType::TypeString: |
| 68 return lhs.strVal == rhs.strVal; |
| 69 case JSONValue::ValueType::TypeObject: |
| 70 return lhs.entityVal == rhs.entityVal; |
| 71 default: |
| 72 return false; |
| 73 } |
| 74 } |
| 75 bool operator==(const WebPage& lhs, const WebPage& rhs) { |
| 76 return lhs.url == rhs.url && lhs.title == rhs.title && |
| 77 lhs.entities == rhs.entities; |
| 78 } |
| 79 std::ostream& operator<<(std::ostream& os, const Entity& v) { |
| 80 os << "ENTITY: ["; |
| 81 for (auto p : v.properties) { |
| 82 os << p; |
| 83 } |
| 84 os << "]"; |
| 85 return os; |
| 86 } |
| 87 std::ostream& operator<<(std::ostream& os, const Property& v) { |
| 88 os << "Name: " << v.name << " TYPE : " << v.type << " VALUE: [ "; |
| 89 switch (v.type) { |
| 90 case JSONValue::ValueType::TypeBoolean: |
| 91 for (auto b : v.boolVal) |
| 92 os << b; |
| 93 case JSONValue::ValueType::TypeInteger: |
| 94 for (auto i : v.intVal) |
| 95 os << i; |
| 96 break; |
| 97 case JSONValue::ValueType::TypeDouble: |
| 98 for (auto d : v.doubleVal) |
| 99 os << d; |
| 100 break; |
| 101 case JSONValue::ValueType::TypeString: |
| 102 for (auto s : v.strVal) |
| 103 os << s; |
| 104 break; |
| 105 case JSONValue::ValueType::TypeObject: |
| 106 for (auto e : v.entityVal) |
| 107 os << e; |
| 108 break; |
| 109 default: |
| 110 break; |
| 111 } |
| 112 os << " ]"; |
| 113 return os; |
| 114 } |
| 115 std::ostream& operator<<(std::ostream& os, const WebPage& v) { |
| 116 os << "URL: " << v.url << "TITLE: " << v.title << "ENTITIES: ["; |
| 117 for (auto e : v.entities) { |
| 118 os << e; |
| 119 } |
| 120 os << "]"; |
| 121 return os; |
| 122 } |
| 123 |
18 namespace { | 124 namespace { |
19 | 125 |
// Nesting limit handed to the JSON parser. App Indexing allows a nesting depth
// of 5, and the top-level message is the WebPage itself, which leaves 4 levels
// for the extracted data.
// TODO(dproctor): Do we want to fail parsing, or (more likely) only pass the
// top levels to Icing?
constexpr int kMaxDepth = 4;

// Upper bound on the length of extracted strings. App Indexing accepts strings
// of up to 20k characters, but very long strings are not currently used, so
// they are cut short to avoid undue pressure on Icing.
constexpr int kMaxStringLength = 200;

// Maximum number of fields read from a single JSON object. App Indexing
// enforces this limit, so processing stops early.
constexpr size_t kMaxNumFields = 20;

// Maximum number of elements read from a single JSON array. App Indexing
// enforces this limit, so processing stops early.
constexpr size_t kMaxRepeatedSize = 100;

// JSON-LD member names.
constexpr char kJSONLDKeyName[] = "name";
constexpr char kJSONLDKeyType[] = "@type";
constexpr char kJSONLDKeyGraph[] = "@graph";
constexpr char kJSONLDKeyContext[] = "@context";
| 145 void extractEntity(JSONObject* val, Entity* entity) { |
| 146 for (size_t i = 0; i < std::min(val->size(), kMaxNumFields); ++i) { |
| 147 Property property; |
| 148 JSONObject::Entry entry = val->at(i); |
| 149 property.name = entry.first; |
| 150 property.type = entry.second->getType(); |
| 151 bool addProperty = true; |
| 152 |
| 153 switch (property.type) { |
| 154 case JSONValue::ValueType::TypeBoolean: { |
| 155 bool v; |
| 156 val->getBoolean(entry.first, &v); |
| 157 property.boolVal.push_back(v); |
| 158 } break; |
| 159 case JSONValue::ValueType::TypeInteger: { |
| 160 int v; |
| 161 val->getInteger(entry.first, &v); |
| 162 property.intVal.push_back(v); |
| 163 } break; |
| 164 case JSONValue::ValueType::TypeDouble: { |
| 165 double v; |
| 166 val->getDouble(entry.first, &v); |
| 167 property.doubleVal.push_back(v); |
| 168 } break; |
| 169 case JSONValue::ValueType::TypeString: { |
| 170 String v; |
| 171 val->getString(entry.first, &v); |
| 172 v.truncate(kMaxStringLength); |
| 173 property.strVal.push_back(v); |
| 174 } break; |
| 175 case JSONValue::ValueType::TypeObject: { |
| 176 property.entityVal.push_back(Entity()); |
| 177 extractEntity(val->getObject(entry.first), &(property.entityVal.at(0))); |
| 178 } break; |
| 179 case JSONValue::ValueType::TypeArray: { |
| 180 JSONArray* arr = val->getArray(entry.first); |
| 181 if (arr->size() < 1) { |
| 182 addProperty = false; |
| 183 break; |
| 184 } |
| 185 |
| 186 property.type = arr->at(0)->getType(); |
| 187 if (property.type == JSONArray::ValueType::TypeArray) { |
| 188 // App Indexing doesn't support nested arrays. |
| 189 addProperty = false; |
| 190 break; |
| 191 } |
| 192 for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) { |
| 193 JSONValue* innerVal = arr->at(j); |
| 194 if (innerVal->getType() != property.type) { |
| 195 addProperty = false; |
| 196 break; |
| 197 } |
| 198 switch (innerVal->getType()) { |
| 199 case JSONValue::ValueType::TypeBoolean: { |
| 200 bool v; |
| 201 innerVal->asBoolean(&v); |
| 202 property.boolVal.push_back(v); |
| 203 } break; |
| 204 case JSONValue::ValueType::TypeInteger: { |
| 205 int v; |
| 206 innerVal->asInteger(&v); |
| 207 property.intVal.push_back(v); |
| 208 } break; |
| 209 case JSONValue::ValueType::TypeDouble: { |
| 210 double v; |
| 211 innerVal->asDouble(&v); |
| 212 property.doubleVal.push_back(v); |
| 213 } break; |
| 214 case JSONValue::ValueType::TypeString: { |
| 215 String v; |
| 216 innerVal->asString(&v); |
| 217 property.strVal.push_back(v); |
| 218 } break; |
| 219 case JSONValue::ValueType::TypeObject: |
| 220 property.entityVal.push_back(Entity()); |
| 221 extractEntity(JSONObject::cast(innerVal), |
| 222 &(property.entityVal.at(j))); |
| 223 break; |
| 224 default: |
| 225 break; |
| 226 } |
| 227 } |
| 228 } break; |
| 229 default: |
| 230 break; |
| 231 } |
| 232 if (addProperty) |
| 233 entity->properties.push_back(property); |
| 234 } |
| 235 } |
| 236 |
| 237 bool isWhitelistedType(String type) { |
| 238 DEFINE_STATIC_LOCAL(HashSet<String>, elements, |
| 239 ({// Common types that include addresses. |
| 240 "AutoDealer", "Hotel", "LocalBusiness", "Organization", |
| 241 "Person", "Place", "PostalAddress", "Product", |
| 242 "Residence", "Restaurant", "SingleFamilyResidence", |
| 243 // Common types including phone numbers |
| 244 "Store", "ContactPoint", "LodgingBusiness"})); |
| 245 return type && elements.contains(type); |
| 246 } |
| 247 |
| 248 void extractTopLevelEntity(JSONObject* val, Vector<Entity>* entities) { |
| 249 // Now we have a JSONObject which corresponds to a single (possibly nested) |
| 250 // entity. |
| 251 Entity entity; |
| 252 String type; |
| 253 val->getString(kJSONLDKeyType, &type); |
| 254 if (!isWhitelistedType(type)) { |
| 255 return; |
| 256 } |
| 257 extractEntity(val, &entity); |
| 258 entities->push_back(entity); |
| 259 } |
| 260 |
| 261 void extractEntitiesFromArray(JSONArray* arr, Vector<Entity>* entities) { |
| 262 for (size_t i = 0; i < arr->size(); ++i) { |
| 263 JSONValue* val = arr->at(i); |
| 264 switch (val->getType()) { |
| 265 case JSONValue::ValueType::TypeObject: |
| 266 extractTopLevelEntity(JSONObject::cast(val), entities); |
| 267 break; |
| 268 default: |
| 269 // TODO(dproctor): :( |
| 270 return; |
| 271 } |
| 272 } |
| 273 } |
| 274 |
| 275 void extractEntityFromTopLevelObject(JSONObject* val, |
| 276 Vector<Entity>* entities) { |
| 277 JSONArray* graph = val->getArray(kJSONLDKeyGraph); |
| 278 if (graph) { |
| 279 extractEntitiesFromArray(graph, entities); |
| 280 } |
| 281 extractTopLevelEntity(val, entities); |
| 282 } |
| 283 |
| 284 bool extractMetadata(const Element& root, Vector<Entity>* entities) { |
24 for (Element& element : ElementTraversal::descendantsOf(root)) { | 285 for (Element& element : ElementTraversal::descendantsOf(root)) { |
25 if (element.hasTagName(HTMLNames::scriptTag) && | 286 if (element.hasTagName(HTMLNames::scriptTag) && |
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { | 287 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
27 if (multiple) { | 288 std::unique_ptr<JSONValue> json = |
28 result.append(","); | 289 parseJSON(element.textContent(), kMaxDepth); |
| 290 if (!json.get()) { |
| 291 LOG(ERROR) << "Failed to parse json."; |
| 292 return false; |
29 } | 293 } |
30 result.append(element.textContent()); | 294 LOG(ERROR) << "PARSED JSON: " << json->toPrettyJSONString(); |
31 multiple = true; | 295 switch (json->getType()) { |
| 296 case JSONValue::ValueType::TypeArray: |
| 297 extractEntitiesFromArray(JSONArray::cast(json.get()), entities); |
| 298 break; |
| 299 case JSONValue::ValueType::TypeObject: |
| 300 extractEntityFromTopLevelObject(JSONObject::cast(json.get()), |
| 301 entities); |
| 302 break; |
| 303 default: |
| 304 return false; |
| 305 } |
32 } | 306 } |
33 } | 307 } |
34 result.append("]"); | 308 return !entities->isEmpty(); |
35 return result.toString(); | |
36 } | 309 } |
37 | 310 |
38 } // namespace | 311 } // namespace |
39 | 312 |
40 String CopylessPasteExtractor::extract(Document& document) { | 313 bool CopylessPasteExtractor::extract(const Document& document, WebPage* page) { |
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); | 314 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
42 | 315 |
43 if (!document.frame() || !document.frame()->isMainFrame()) | 316 if (!document.frame() || !document.frame()->isMainFrame()) |
44 return emptyString; | 317 return false; |
45 | 318 |
46 DCHECK(document.hasFinishedParsing()); | 319 DCHECK(document.hasFinishedParsing()); |
47 | 320 |
48 Element* html = document.documentElement(); | 321 Element* html = document.documentElement(); |
49 if (!html) | 322 if (!html) |
50 return emptyString; | 323 return false; |
51 | 324 |
52 double startTime = monotonicallyIncreasingTime(); | 325 double startTime = monotonicallyIncreasingTime(); |
53 | 326 |
54 // Traverse the DOM tree and extract the metadata. | 327 // Traverse the DOM tree and extract the metadata. |
55 String result = extractMetadata(*html); | 328 if (!extractMetadata(*html, &(page->entities))) |
| 329 return false; |
| 330 page->url = document.url().getString(); |
| 331 page->title = document.title(); |
56 | 332 |
57 double elapsedTime = monotonicallyIncreasingTime() - startTime; | 333 double elapsedTime = monotonicallyIncreasingTime() - startTime; |
58 | 334 |
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, | 335 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); | 336 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); |
61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); | 337 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); |
62 return result; | 338 return true; |
63 } | 339 } |
64 | 340 |
65 } // namespace blink | 341 } // namespace blink |
OLD | NEW |