Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2017 The Chromium Authors. All rights reserved. | 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "modules/document_metadata/CopylessPasteExtractor.h" | 5 #include "modules/document_metadata/CopylessPasteExtractor.h" |
| 6 | 6 |
| 7 #include <algorithm> | |
| 8 #include <memory> | |
| 9 #include <utility> | |
| 7 #include "core/HTMLNames.h" | 10 #include "core/HTMLNames.h" |
| 8 #include "core/dom/Document.h" | 11 #include "core/dom/Document.h" |
| 9 #include "core/dom/ElementTraversal.h" | 12 #include "core/dom/ElementTraversal.h" |
| 10 #include "core/frame/LocalFrame.h" | 13 #include "core/frame/LocalFrame.h" |
| 11 #include "core/html/HTMLElement.h" | 14 #include "core/html/HTMLElement.h" |
| 12 #include "platform/Histogram.h" | 15 #include "platform/Histogram.h" |
| 13 #include "platform/instrumentation/tracing/TraceEvent.h" | 16 #include "platform/instrumentation/tracing/TraceEvent.h" |
| 17 #include "platform/json/JSONParser.h" | |
| 18 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h " | |
| 19 #include "wtf/Vector.h" | |
| 20 #include "wtf/text/AtomicString.h" | |
| 14 #include "wtf/text/StringBuilder.h" | 21 #include "wtf/text/StringBuilder.h" |
| 15 | 22 |
| 16 namespace blink { | 23 namespace blink { |
| 17 | 24 |
| 18 namespace { | 25 namespace { |
| 19 | 26 |
| 20 String extractMetadata(Element& root) { | 27 using mojom::blink::Entity; |
| 21 StringBuilder result; | 28 using mojom::blink::EntityPtr; |
| 22 result.append("["); | 29 using mojom::blink::Property; |
| 23 bool multiple = false; | 30 using mojom::blink::PropertyPtr; |
| 31 using mojom::blink::Values; | |
| 32 using mojom::blink::ValuesPtr; | |
| 33 using mojom::blink::WebPage; | |
| 34 using mojom::blink::WebPagePtr; | |
| 35 | |
| 36 // App Indexing enforces a max nesting depth of 5. Our top level message | |
| 37 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse | |
| 38 // entities up to this depth, and ignore any further nesting. If an object at the | |
| 39 // max nesting depth has a property corresponding to an entity, that property | |
| 40 // will be dropped. Note that we will still parse json-ld blocks deeper than | |
| 41 // this, but it won't be passed to App Indexing. | |
| 42 constexpr int kMaxDepth = 4; | |
| 43 // Some strings are very long, and we don't currently use those, so limit string | |
| 44 // length to something reasonable to avoid undue pressure on Icing. Note that | |
| 45 // App Indexing supports strings up to length 20k. | |
| 46 constexpr int kMaxStringLength = 200; | |
| 47 // Enforced by App Indexing, so stop processing early if possible. | |
| 48 constexpr size_t kMaxNumFields = 20; | |
| 49 // Enforced by App Indexing, so stop processing early if possible. | |
| 50 constexpr size_t kMaxRepeatedSize = 100; | |
| 51 | |
| 52 constexpr char kJSONLDKeyType[] = "@type"; | |
| 53 constexpr char kJSONLDKeyGraph[] = "@graph"; | |
| 54 | |
| 55 void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) { | |
|
wychen
2017/04/04 18:40:19
This function feels a bit long.
dproctor
2017/04/04 20:44:05
Split out the nested switch statement.
| |
| 56 if (recursionLevel >= kMaxDepth) { | |
| 57 return; | |
| 58 } | |
| 59 | |
| 60 String type; | |
| 61 val.getString(kJSONLDKeyType, &type); | |
| 62 if (!type) { | |
| 63 type = "Thing"; | |
| 64 } | |
| 65 entity.type = type; | |
| 66 for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) { | |
| 67 PropertyPtr property = Property::New(); | |
| 68 const JSONObject::Entry entry = val.at(i); | |
|
wychen
2017/04/04 18:40:19
Would Entry& work?
dproctor
2017/04/04 20:44:05
Done.
| |
| 69 property->name = entry.first; | |
| 70 if (property->name == kJSONLDKeyType) { | |
| 71 continue; | |
| 72 } | |
| 73 property->values = Values::New(); | |
| 74 JSONValue::ValueType type = entry.second->getType(); | |
|
wychen
2017/04/04 18:40:19
inline in the switch to avoid confusion from Strin
dproctor
2017/04/04 20:44:04
Done.
| |
| 75 | |
| 76 bool addProperty = true; | |
| 77 | |
| 78 switch (type) { | |
| 79 case JSONValue::ValueType::TypeBoolean: { | |
| 80 bool v; | |
| 81 val.getBoolean(entry.first, &v); | |
| 82 property->values->set_bool_values(Vector<bool>(1, v)); | |
| 83 } break; | |
| 84 case JSONValue::ValueType::TypeInteger: { | |
| 85 int v; | |
| 86 val.getInteger(entry.first, &v); | |
| 87 property->values->set_long_values(Vector<int64_t>(1, v)); | |
| 88 } break; | |
| 89 case JSONValue::ValueType::TypeDouble: { | |
| 90 double v; | |
| 91 val.getDouble(entry.first, &v); | |
| 92 String s = String::number(v); | |
| 93 s.truncate(kMaxStringLength); | |
| 94 property->values->set_string_values(Vector<String>(1, s)); | |
| 95 } break; | |
| 96 case JSONValue::ValueType::TypeString: { | |
| 97 String v; | |
| 98 val.getString(entry.first, &v); | |
| 99 v.truncate(kMaxStringLength); | |
| 100 property->values->set_string_values(Vector<String>(1, v)); | |
| 101 } break; | |
| 102 case JSONValue::ValueType::TypeObject: { | |
| 103 if (recursionLevel + 1 >= kMaxDepth) { | |
| 104 addProperty = false; | |
| 105 break; | |
| 106 } | |
| 107 property->values->set_entity_values(Vector<EntityPtr>()); | |
| 108 property->values->get_entity_values().push_back(Entity::New()); | |
| 109 | |
| 110 extractEntity(*(val.getObject(entry.first)), | |
| 111 *(property->values->get_entity_values().at(0)), | |
| 112 recursionLevel + 1); | |
| 113 } break; | |
| 114 case JSONValue::ValueType::TypeArray: { | |
|
wychen
2017/04/04 18:40:19
Probably split this part out. The nested switch an
dproctor
2017/04/04 20:44:04
Done.
| |
| 115 JSONArray* arr = val.getArray(entry.first); | |
|
wychen
2017/04/04 18:40:19
const
dproctor
2017/04/04 20:44:04
JSONArray::at isn't const.
| |
| 116 if (arr->size() < 1) { | |
| 117 addProperty = false; | |
| 118 break; | |
| 119 } | |
| 120 | |
| 121 type = arr->at(0)->getType(); | |
| 122 if (type == JSONArray::ValueType::TypeArray) { | |
| 123 // App Indexing doesn't support nested arrays. | |
| 124 addProperty = false; | |
| 125 break; | |
| 126 } | |
| 127 for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) { | |
| 128 JSONValue* innerVal = arr->at(j); | |
| 129 if (innerVal->getType() != type) { | |
| 130 // App Indexing doesn't support mixed types. If there are mixed | |
| 131 // types in the parsed object, we will drop the property. | |
| 132 addProperty = false; | |
| 133 break; | |
| 134 } | |
| 135 switch (innerVal->getType()) { | |
| 136 case JSONValue::ValueType::TypeBoolean: { | |
| 137 if (!property->values->is_bool_values()) { | |
| 138 property->values->set_bool_values(Vector<bool>()); | |
| 139 } | |
| 140 bool v; | |
| 141 innerVal->asBoolean(&v); | |
| 142 property->values->get_bool_values().push_back(v); | |
| 143 } break; | |
| 144 case JSONValue::ValueType::TypeInteger: { | |
| 145 if (!property->values->is_long_values()) { | |
| 146 property->values->set_long_values(Vector<int64_t>()); | |
| 147 } | |
| 148 int v; | |
| 149 innerVal->asInteger(&v); | |
| 150 property->values->get_long_values().push_back(v); | |
| 151 } break; | |
| 152 case JSONValue::ValueType::TypeDouble: { | |
| 153 if (!property->values->is_string_values()) { | |
| 154 property->values->set_string_values(Vector<String>()); | |
| 155 } | |
| 156 double v; | |
| 157 val.getDouble(entry.first, &v); | |
| 158 String s = String::number(v); | |
| 159 s.truncate(kMaxStringLength); | |
| 160 property->values->get_string_values().push_back(s); | |
| 161 } break; | |
| 162 case JSONValue::ValueType::TypeString: { | |
| 163 if (!property->values->is_string_values()) { | |
| 164 property->values->set_string_values(Vector<String>()); | |
| 165 } | |
| 166 String v; | |
| 167 innerVal->asString(&v); | |
| 168 v.truncate(kMaxStringLength); | |
| 169 property->values->get_string_values().push_back(v); | |
| 170 } break; | |
| 171 case JSONValue::ValueType::TypeObject: | |
| 172 if (recursionLevel + 1 >= kMaxDepth) { | |
| 173 addProperty = false; | |
| 174 break; | |
| 175 } | |
| 176 if (!property->values->is_entity_values()) { | |
| 177 property->values->set_entity_values(Vector<EntityPtr>()); | |
| 178 } | |
| 179 property->values->get_entity_values().push_back(Entity::New()); | |
| 180 extractEntity(*(JSONObject::cast(innerVal)), | |
| 181 *(property->values->get_entity_values().at(j)), | |
| 182 recursionLevel + 1); | |
| 183 break; | |
| 184 default: | |
| 185 break; | |
| 186 } | |
| 187 } | |
| 188 } break; | |
| 189 default: | |
| 190 break; | |
| 191 } | |
| 192 if (addProperty) | |
| 193 entity.properties.push_back(std::move(property)); | |
| 194 } | |
| 195 } | |
| 196 | |
| 197 bool isWhitelistedType(AtomicString type) { | |
| 198 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements, | |
| 199 ({// Common types that include addresses. | |
| 200 "AutoDealer", "Hotel", "LocalBusiness", "Organization", | |
| 201 "Person", "Place", "PostalAddress", "Product", | |
| 202 "Residence", "Restaurant", "SingleFamilyResidence", | |
| 203 // Common types including phone numbers | |
| 204 "Store", "ContactPoint", "LodgingBusiness"})); | |
| 205 return type && elements.contains(type); | |
| 206 } | |
| 207 | |
| 208 void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) { | |
| 209 // Now we have a JSONObject which corresponds to a single (possibly nested) | |
| 210 // entity. | |
| 211 EntityPtr entity = Entity::New(); | |
| 212 String type; | |
| 213 val.getString(kJSONLDKeyType, &type); | |
| 214 if (!isWhitelistedType(AtomicString(type))) { | |
| 215 return; | |
| 216 } | |
| 217 extractEntity(val, *(entity.get()), 0); | |
| 218 entities.push_back(std::move(entity)); | |
| 219 } | |
| 220 | |
| 221 void extractEntitiesFromArray(JSONArray& arr, Vector<EntityPtr>& entities) { | |
|
wychen
2017/04/04 18:40:19
const arr
dproctor
2017/04/04 20:44:04
JSONArray::at isn't const.
wychen
2017/04/04 23:15:32
Hmm. This is unexpected, but OK.
Let's see how fas
wychen
2017/04/05 16:29:41
https://codereview.chromium.org/2795393002/ has la
dproctor
2017/04/05 17:41:44
Done.
| |
| 222 for (size_t i = 0; i < arr.size(); ++i) { | |
| 223 JSONValue* val = arr.at(i); | |
|
wychen
2017/04/04 18:40:19
const
dproctor
2017/04/04 20:44:04
Done.
| |
| 224 if (val->getType() == JSONValue::ValueType::TypeObject) { | |
| 225 extractTopLevelEntity(*(JSONObject::cast(val)), entities); | |
| 226 } | |
| 227 } | |
| 228 } | |
| 229 | |
| 230 void extractEntityFromTopLevelObject(const JSONObject& val, | |
| 231 Vector<EntityPtr>& entities) { | |
| 232 JSONArray* graph = val.getArray(kJSONLDKeyGraph); | |
|
wychen
2017/04/04 18:40:19
const
dproctor
2017/04/04 20:44:05
Done.
| |
| 233 if (graph) { | |
| 234 extractEntitiesFromArray(*graph, entities); | |
| 235 } | |
| 236 extractTopLevelEntity(val, entities); | |
| 237 } | |
| 238 | |
| 239 bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) { | |
| 24 for (Element& element : ElementTraversal::descendantsOf(root)) { | 240 for (Element& element : ElementTraversal::descendantsOf(root)) { |
| 25 if (element.hasTagName(HTMLNames::scriptTag) && | 241 if (element.hasTagName(HTMLNames::scriptTag) && |
| 26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { | 242 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
| 27 if (multiple) { | 243 std::unique_ptr<JSONValue> json = parseJSON(element.textContent()); |
|
wychen
2017/04/04 18:40:19
We could use the maxDepth version.
dproctor
2017/04/04 20:44:04
So, there are actually two different notions of ma
wychen
2017/04/04 23:15:32
My bad. JSONParser returns nullptr when exceeding
| |
| 28 result.append(","); | 244 if (!json.get()) { |
| 245 LOG(ERROR) << "Failed to parse json."; | |
| 246 return false; | |
| 29 } | 247 } |
| 30 result.append(element.textContent()); | 248 switch (json->getType()) { |
| 31 multiple = true; | 249 case JSONValue::ValueType::TypeArray: |
| 32 } | 250 extractEntitiesFromArray(*(JSONArray::cast(json.get())), entities); |
| 33 } | 251 break; |
| 34 result.append("]"); | 252 case JSONValue::ValueType::TypeObject: |
| 35 return result.toString(); | 253 extractEntityFromTopLevelObject(*(JSONObject::cast(json.get())), |
| 254 entities); | |
| 255 break; | |
| 256 default: | |
| 257 return false; | |
| 258 } | |
| 259 } | |
| 260 } | |
| 261 return !entities.isEmpty(); | |
| 36 } | 262 } |
| 37 | 263 |
| 38 } // namespace | 264 } // namespace |
| 39 | 265 |
| 40 String CopylessPasteExtractor::extract(Document& document) { | 266 bool CopylessPasteExtractor::extract(const Document& document, |
| 267 mojom::blink::WebPage& page) { | |
| 41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); | 268 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
| 42 | 269 |
| 43 if (!document.frame() || !document.frame()->isMainFrame()) | 270 if (!document.frame() || !document.frame()->isMainFrame()) |
| 44 return emptyString; | 271 return false; |
| 45 | 272 |
| 46 DCHECK(document.hasFinishedParsing()); | 273 DCHECK(document.hasFinishedParsing()); |
| 47 | 274 |
| 48 Element* html = document.documentElement(); | 275 Element* html = document.documentElement(); |
| 49 if (!html) | 276 if (!html) |
| 50 return emptyString; | 277 return false; |
| 51 | 278 |
| 52 double startTime = monotonicallyIncreasingTime(); | 279 double startTime = monotonicallyIncreasingTime(); |
| 53 | 280 |
| 54 // Traverse the DOM tree and extract the metadata. | 281 // Traverse the DOM tree and extract the metadata. |
| 55 String result = extractMetadata(*html); | 282 if (!extractMetadata(*html, page.entities)) |
| 283 return false; | |
| 284 page.url = document.url().getString(); | |
|
wychen
2017/04/04 18:40:19
The mojo type will change from string to GURL.
dproctor
2017/04/04 20:44:04
Acknowledged.
| |
| 285 page.title = document.title(); | |
| 56 | 286 |
| 57 double elapsedTime = monotonicallyIncreasingTime() - startTime; | 287 double elapsedTime = monotonicallyIncreasingTime() - startTime; |
| 58 | 288 |
| 59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, | 289 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
| 60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); | 290 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); |
| 61 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); | 291 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); |
| 62 return result; | 292 return true; |
| 63 } | 293 } |
| 64 | 294 |
| 65 } // namespace blink | 295 } // namespace blink |
| OLD | NEW |