OLD | NEW |
1 // Copyright 2017 The Chromium Authors. All rights reserved. | 1 // Copyright 2017 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "modules/document_metadata/CopylessPasteExtractor.h" | 5 #include "modules/document_metadata/CopylessPasteExtractor.h" |
6 | 6 |
| 7 #include <algorithm> |
| 8 #include <memory> |
| 9 #include <utility> |
| 10 |
7 #include "core/HTMLNames.h" | 11 #include "core/HTMLNames.h" |
8 #include "core/dom/Document.h" | 12 #include "core/dom/Document.h" |
9 #include "core/dom/ElementTraversal.h" | 13 #include "core/dom/ElementTraversal.h" |
10 #include "core/frame/LocalFrame.h" | 14 #include "core/frame/LocalFrame.h" |
11 #include "core/html/HTMLElement.h" | 15 #include "core/html/HTMLElement.h" |
12 #include "platform/Histogram.h" | 16 #include "platform/Histogram.h" |
13 #include "platform/instrumentation/tracing/TraceEvent.h" | 17 #include "platform/instrumentation/tracing/TraceEvent.h" |
| 18 #include "platform/json/JSONParser.h" |
| 19 #include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h" |
| 20 #include "wtf/Vector.h" |
| 21 #include "wtf/text/AtomicString.h" |
14 #include "wtf/text/StringBuilder.h" | 22 #include "wtf/text/StringBuilder.h" |
15 | 23 |
16 namespace blink { | 24 namespace blink { |
17 | 25 |
18 namespace { | 26 namespace { |
19 | 27 |
20 String ExtractMetadata(Element& root) { | 28 using mojom::document_metadata::blink::Entity; |
21 StringBuilder result; | 29 using mojom::document_metadata::blink::EntityPtr; |
22 result.Append("["); | 30 using mojom::document_metadata::blink::Property; |
23 bool multiple = false; | 31 using mojom::document_metadata::blink::PropertyPtr; |
| 32 using mojom::document_metadata::blink::Values; |
| 33 using mojom::document_metadata::blink::ValuesPtr; |
| 34 using mojom::document_metadata::blink::WebPage; |
| 35 using mojom::document_metadata::blink::WebPagePtr; |
| 36 |
| 37 // App Indexing enforces a max nesting depth of 5. Our top level message |
| 38 // corresponds to the WebPage, so this only leaves 4 more levels. We will parse |
| 39 // entities up to this depth, and ignore any further nesting. If an object at the |
| 40 // max nesting depth has a property corresponding to an entity, that property |
| 41 // will be dropped. Note that we will still parse json-ld blocks deeper than |
| 42 // this, but they won't be passed to App Indexing. |
| 43 constexpr int kMaxDepth = 4; |
| 44 // Some strings are very long, and we don't currently use those, so limit string |
| 45 // length to something reasonable to avoid undue pressure on Icing. Note that |
| 46 // App Indexing supports strings up to length 20k. |
| 47 constexpr int kMaxStringLength = 200; |
| 48 // Enforced by App Indexing, so stop processing early if possible. |
| 49 constexpr size_t kMaxNumFields = 20; |
| 50 // Enforced by App Indexing, so stop processing early if possible. |
| 51 constexpr size_t kMaxRepeatedSize = 100; |
| 52 |
| 53 constexpr char kJSONLDKeyType[] = "@type"; |
| 54 constexpr char kJSONLDKeyGraph[] = "@graph"; |
| 55 bool isWhitelistedType(AtomicString type) { |
| 56 DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements, |
| 57 ({// Common types that include addresses. |
| 58 "AutoDealer", "Hotel", "LocalBusiness", "Organization", |
| 59 "Person", "Place", "PostalAddress", "Product", |
| 60 "Residence", "Restaurant", "SingleFamilyResidence", |
| 61 // Common types including phone numbers |
| 62 "Store", "ContactPoint", "LodgingBusiness"})); |
| 63 return type && elements.Contains(type); |
| 64 } |
| 65 |
| 66 void extractEntity(const JSONObject&, Entity&, int recursionLevel); |
| 67 |
| 68 bool parseRepeatedValue(const JSONArray& arr, |
| 69 Values& values, |
| 70 int recursionLevel) { |
| 71 if (arr.size() < 1) { |
| 72 return false; |
| 73 } |
| 74 |
| 75 const JSONValue::ValueType type = arr.at(0)->GetType(); |
| 76 switch (type) { |
| 77 case JSONValue::ValueType::kTypeBoolean: |
| 78 values.set_bool_values(Vector<bool>()); |
| 79 break; |
| 80 case JSONValue::ValueType::kTypeInteger: |
| 81 values.set_long_values(Vector<int64_t>()); |
| 82 break; |
| 83 case JSONValue::ValueType::kTypeDouble: |
| 84 // App Indexing doesn't support double type, so just encode its decimal |
| 85 // value as a string instead. |
| 86 values.set_string_values(Vector<String>()); |
| 87 break; |
| 88 case JSONValue::ValueType::kTypeString: |
| 89 values.set_string_values(Vector<String>()); |
| 90 break; |
| 91 case JSONValue::ValueType::kTypeObject: |
| 92 if (recursionLevel + 1 >= kMaxDepth) { |
| 93 return false; |
| 94 } |
| 95 values.set_entity_values(Vector<EntityPtr>()); |
| 96 break; |
| 97 case JSONArray::ValueType::kTypeArray: |
| 98 // App Indexing doesn't support nested arrays. |
| 99 return false; |
| 100 default: |
| 101 break; |
| 102 } |
| 103 for (size_t j = 0; j < std::min(arr.size(), kMaxRepeatedSize); ++j) { |
| 104 const JSONValue* innerVal = arr.at(j); |
| 105 if (innerVal->GetType() != type) { |
| 106 // App Indexing doesn't support mixed types. If there are mixed |
| 107 // types in the parsed object, we will drop the property. |
| 108 return false; |
| 109 } |
| 110 switch (innerVal->GetType()) { |
| 111 case JSONValue::ValueType::kTypeBoolean: { |
| 112 bool v; |
| 113 innerVal->AsBoolean(&v); |
| 114 values.get_bool_values().push_back(v); |
| 115 } break; |
| 116 case JSONValue::ValueType::kTypeInteger: { |
| 117 int v; |
| 118 innerVal->AsInteger(&v); |
| 119 values.get_long_values().push_back(v); |
| 120 } break; |
| 121 case JSONValue::ValueType::kTypeDouble: { |
| 122 // App Indexing doesn't support double type, so just encode its decimal |
| 123 // value as a string instead. |
| 124 double v; |
| 125 innerVal->AsDouble(&v); |
| 126 String s = String::Number(v); |
| 127 s.Truncate(kMaxStringLength); |
| 128 values.get_string_values().push_back(s); |
| 129 } break; |
| 130 case JSONValue::ValueType::kTypeString: { |
| 131 String v; |
| 132 innerVal->AsString(&v); |
| 133 v.Truncate(kMaxStringLength); |
| 134 values.get_string_values().push_back(v); |
| 135 } break; |
| 136 case JSONValue::ValueType::kTypeObject: |
| 137 values.get_entity_values().push_back(Entity::New()); |
| 138 extractEntity(*(JSONObject::Cast(innerVal)), |
| 139 *(values.get_entity_values().at(j)), recursionLevel + 1); |
| 140 break; |
| 141 default: |
| 142 break; |
| 143 } |
| 144 } |
| 145 return true; |
| 146 } |
| 147 |
| 148 void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) { |
| 149 if (recursionLevel >= kMaxDepth) { |
| 150 return; |
| 151 } |
| 152 |
| 153 String type; |
| 154 val.GetString(kJSONLDKeyType, &type); |
| 155 if (!type) { |
| 156 type = "Thing"; |
| 157 } |
| 158 entity.type = type; |
| 159 for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) { |
| 160 PropertyPtr property = Property::New(); |
| 161 const JSONObject::Entry& entry = val.at(i); |
| 162 property->name = entry.first; |
| 163 if (property->name == kJSONLDKeyType) { |
| 164 continue; |
| 165 } |
| 166 property->values = Values::New(); |
| 167 |
| 168 bool addProperty = true; |
| 169 |
| 170 switch (entry.second->GetType()) { |
| 171 case JSONValue::ValueType::kTypeBoolean: { |
| 172 bool v; |
| 173 val.GetBoolean(entry.first, &v); |
| 174 property->values->set_bool_values({v}); |
| 175 } break; |
| 176 case JSONValue::ValueType::kTypeInteger: { |
| 177 int v; |
| 178 val.GetInteger(entry.first, &v); |
| 179 property->values->set_long_values({v}); |
| 180 } break; |
| 181 case JSONValue::ValueType::kTypeDouble: { |
| 182 double v; |
| 183 val.GetDouble(entry.first, &v); |
| 184 String s = String::Number(v); |
| 185 s.Truncate(kMaxStringLength); |
| 186 property->values->set_string_values({s}); |
| 187 } break; |
| 188 case JSONValue::ValueType::kTypeString: { |
| 189 String v; |
| 190 val.GetString(entry.first, &v); |
| 191 v.Truncate(kMaxStringLength); |
| 192 property->values->set_string_values({v}); |
| 193 } break; |
| 194 case JSONValue::ValueType::kTypeObject: { |
| 195 if (recursionLevel + 1 >= kMaxDepth) { |
| 196 addProperty = false; |
| 197 break; |
| 198 } |
| 199 property->values->set_entity_values(Vector<EntityPtr>()); |
| 200 property->values->get_entity_values().push_back(Entity::New()); |
| 201 |
| 202 extractEntity(*(val.GetObject(entry.first)), |
| 203 *(property->values->get_entity_values().at(0)), |
| 204 recursionLevel + 1); |
| 205 } break; |
| 206 case JSONValue::ValueType::kTypeArray: |
| 207 addProperty = parseRepeatedValue(*(val.GetArray(entry.first)), |
| 208 *(property->values), recursionLevel); |
| 209 break; |
| 210 default: |
| 211 break; |
| 212 } |
| 213 if (addProperty) |
| 214 entity.properties.push_back(std::move(property)); |
| 215 } |
| 216 } |
| 217 |
| 218 void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) { |
| 219 // Now we have a JSONObject which corresponds to a single (possibly nested) |
| 220 // entity. |
| 221 EntityPtr entity = Entity::New(); |
| 222 String type; |
| 223 val.GetString(kJSONLDKeyType, &type); |
| 224 if (!isWhitelistedType(AtomicString(type))) { |
| 225 return; |
| 226 } |
| 227 extractEntity(val, *entity, 0); |
| 228 entities.push_back(std::move(entity)); |
| 229 } |
| 230 |
| 231 void extractEntitiesFromArray(const JSONArray& arr, |
| 232 Vector<EntityPtr>& entities) { |
| 233 for (size_t i = 0; i < arr.size(); ++i) { |
| 234 const JSONValue* val = arr.at(i); |
| 235 if (val->GetType() == JSONValue::ValueType::kTypeObject) { |
| 236 extractTopLevelEntity(*(JSONObject::Cast(val)), entities); |
| 237 } |
| 238 } |
| 239 } |
| 240 |
| 241 void extractEntityFromTopLevelObject(const JSONObject& val, |
| 242 Vector<EntityPtr>& entities) { |
| 243 const JSONArray* graph = val.GetArray(kJSONLDKeyGraph); |
| 244 if (graph) { |
| 245 extractEntitiesFromArray(*graph, entities); |
| 246 } |
| 247 extractTopLevelEntity(val, entities); |
| 248 } |
| 249 |
| 250 bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) { |
24 for (Element& element : ElementTraversal::DescendantsOf(root)) { | 251 for (Element& element : ElementTraversal::DescendantsOf(root)) { |
25 if (element.HasTagName(HTMLNames::scriptTag) && | 252 if (element.HasTagName(HTMLNames::scriptTag) && |
26 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { | 253 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
27 if (multiple) { | 254 std::unique_ptr<JSONValue> json = ParseJSON(element.textContent()); |
28 result.Append(","); | 255 if (!json) { |
| 256 LOG(ERROR) << "Failed to parse json."; |
| 257 return false; |
29 } | 258 } |
30 result.Append(element.textContent()); | 259 switch (json->GetType()) { |
31 multiple = true; | 260 case JSONValue::ValueType::kTypeArray: |
32 } | 261 extractEntitiesFromArray(*(JSONArray::Cast(json.get())), entities); |
33 } | 262 break; |
34 result.Append("]"); | 263 case JSONValue::ValueType::kTypeObject: |
35 return result.ToString(); | 264 extractEntityFromTopLevelObject(*(JSONObject::Cast(json.get())), |
| 265 entities); |
| 266 break; |
| 267 default: |
| 268 return false; |
| 269 } |
| 270 } |
| 271 } |
| 272 return !entities.IsEmpty(); |
36 } | 273 } |
37 | 274 |
38 } // namespace | 275 } // namespace |
39 | 276 |
40 String CopylessPasteExtractor::Extract(Document& document) { | 277 WebPagePtr CopylessPasteExtractor::extract(const Document& document) { |
41 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); | 278 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
42 | 279 |
43 if (!document.GetFrame() || !document.GetFrame()->IsMainFrame()) | 280 if (!document.GetFrame() || !document.GetFrame()->IsMainFrame()) |
44 return g_empty_string; | 281 return nullptr; |
45 | |
46 DCHECK(document.HasFinishedParsing()); | |
47 | 282 |
48 Element* html = document.documentElement(); | 283 Element* html = document.documentElement(); |
49 if (!html) | 284 if (!html) |
50 return g_empty_string; | 285 return nullptr; |
51 | 286 |
52 double start_time = MonotonicallyIncreasingTime(); | 287 double start_time = MonotonicallyIncreasingTime(); |
53 | 288 |
| 289 WebPagePtr page = WebPage::New(); |
| 290 |
54 // Traverse the DOM tree and extract the metadata. | 291 // Traverse the DOM tree and extract the metadata. |
55 String result = ExtractMetadata(*html); | 292 if (!extractMetadata(*html, page->entities)) |
| 293 return nullptr; |
| 294 page->url = document.Url(); |
| 295 page->title = document.title(); |
56 | 296 |
57 double elapsed_time = MonotonicallyIncreasingTime() - start_time; | 297 double elapsed_time = MonotonicallyIncreasingTime() - start_time; |
58 | 298 |
59 DEFINE_STATIC_LOCAL(CustomCountHistogram, extraction_histogram, | 299 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
60 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); | 300 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); |
61 extraction_histogram.Count(static_cast<int>(1e6 * elapsed_time)); | 301 extractionHistogram.Count(static_cast<int>(1e6 * elapsed_time)); |
62 return result; | 302 return page; |
63 } | 303 } |
64 | 304 |
65 } // namespace blink | 305 } // namespace blink |
OLD | NEW |