Index: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
diff --git a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
index ed11bf5f3ced4de45468ba1ee7912cbd227070ff..c29520b6a53a516f3c0e5b1cae545ce773167aa5 100644 |
--- a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
+++ b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
@@ -4,6 +4,10 @@ |
#include "modules/document_metadata/CopylessPasteExtractor.h" |
+#include <algorithm> |
+#include <memory> |
+#include <utility> |
+ |
#include "core/HTMLNames.h" |
#include "core/dom/Document.h" |
#include "core/dom/ElementTraversal.h" |
@@ -11,55 +15,291 @@ |
#include "core/html/HTMLElement.h" |
#include "platform/Histogram.h" |
#include "platform/instrumentation/tracing/TraceEvent.h" |
+#include "platform/json/JSONParser.h" |
+#include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h" |
+#include "wtf/Vector.h" |
+#include "wtf/text/AtomicString.h" |
#include "wtf/text/StringBuilder.h" |
namespace blink { |
namespace { |
-String ExtractMetadata(Element& root) { |
- StringBuilder result; |
- result.Append("["); |
- bool multiple = false; |
+using mojom::document_metadata::blink::Entity; |
+using mojom::document_metadata::blink::EntityPtr; |
+using mojom::document_metadata::blink::Property; |
+using mojom::document_metadata::blink::PropertyPtr; |
+using mojom::document_metadata::blink::Values; |
+using mojom::document_metadata::blink::ValuesPtr; |
+using mojom::document_metadata::blink::WebPage; |
+using mojom::document_metadata::blink::WebPagePtr; |
+ |
+// App Indexing enforces a max nesting depth of 5. Our top-level message |
+// corresponds to the WebPage, so this only leaves 4 more levels. We will parse |
+// entities up to this depth and ignore any further nesting. If an object at the |
+// max nesting depth has a property corresponding to an entity, that property |
+// will be dropped. Note that we will still parse JSON-LD blocks deeper than |
+// this, but the deeper levels won't be passed to App Indexing. |
+constexpr int kMaxDepth = 4; |
+// Some strings are very long, and we don't currently use those, so limit string |
+// length to something reasonable to avoid undue pressure on Icing. Note that |
+// App Indexing supports strings up to length 20k. |
+constexpr int kMaxStringLength = 200; |
+// Enforced by App Indexing, so stop processing early if possible. |
+constexpr size_t kMaxNumFields = 20; |
+// Enforced by App Indexing, so stop processing early if possible. |
+constexpr size_t kMaxRepeatedSize = 100; |
+ |
+constexpr char kJSONLDKeyType[] = "@type"; |
+constexpr char kJSONLDKeyGraph[] = "@graph"; |
+bool isWhitelistedType(AtomicString type) { |
+ DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements, |
+ ({// Common types that include addresses. |
+ "AutoDealer", "Hotel", "LocalBusiness", "Organization", |
+ "Person", "Place", "PostalAddress", "Product", |
+ "Residence", "Restaurant", "SingleFamilyResidence", |
+ // Common types that include phone numbers. |
+ "Store", "ContactPoint", "LodgingBusiness"})); |
+ return type && elements.Contains(type); |
+} |
+ |
+void extractEntity(const JSONObject&, Entity&, int recursionLevel); |
+ |
+bool parseRepeatedValue(const JSONArray& arr, |
+ Values& values, |
+ int recursionLevel) { |
+ if (arr.size() < 1) { |
+ return false; |
+ } |
+ |
+ const JSONValue::ValueType type = arr.at(0)->GetType(); |
+ switch (type) { |
+ case JSONValue::ValueType::kTypeBoolean: |
+ values.set_bool_values(Vector<bool>()); |
+ break; |
+ case JSONValue::ValueType::kTypeInteger: |
+ values.set_long_values(Vector<int64_t>()); |
+ break; |
+ case JSONValue::ValueType::kTypeDouble: |
+ // App Indexing doesn't support double type, so just encode its decimal |
+ // value as a string instead. |
+ values.set_string_values(Vector<String>()); |
+ break; |
+ case JSONValue::ValueType::kTypeString: |
+ values.set_string_values(Vector<String>()); |
+ break; |
+ case JSONValue::ValueType::kTypeObject: |
+ if (recursionLevel + 1 >= kMaxDepth) { |
+ return false; |
+ } |
+ values.set_entity_values(Vector<EntityPtr>()); |
+ break; |
+ case JSONArray::ValueType::kTypeArray: |
+ // App Indexing doesn't support nested arrays. |
+ return false; |
+ default: |
+ break; |
+ } |
+ for (size_t j = 0; j < std::min(arr.size(), kMaxRepeatedSize); ++j) { |
+ const JSONValue* innerVal = arr.at(j); |
+ if (innerVal->GetType() != type) { |
+ // App Indexing doesn't support mixed types. If there are mixed |
+ // types in the parsed object, we will drop the property. |
+ return false; |
+ } |
+ switch (innerVal->GetType()) { |
+ case JSONValue::ValueType::kTypeBoolean: { |
+ bool v; |
+ innerVal->AsBoolean(&v); |
+ values.get_bool_values().push_back(v); |
+ } break; |
+ case JSONValue::ValueType::kTypeInteger: { |
+ int v; |
+ innerVal->AsInteger(&v); |
+ values.get_long_values().push_back(v); |
+ } break; |
+ case JSONValue::ValueType::kTypeDouble: { |
+ // App Indexing doesn't support double type, so just encode its decimal |
+ // value as a string instead. |
+ double v; |
+ innerVal->AsDouble(&v); |
+ String s = String::Number(v); |
+ s.Truncate(kMaxStringLength); |
+ values.get_string_values().push_back(s); |
+ } break; |
+ case JSONValue::ValueType::kTypeString: { |
+ String v; |
+ innerVal->AsString(&v); |
+ v.Truncate(kMaxStringLength); |
+ values.get_string_values().push_back(v); |
+ } break; |
+ case JSONValue::ValueType::kTypeObject: |
+ values.get_entity_values().push_back(Entity::New()); |
+ extractEntity(*(JSONObject::Cast(innerVal)), |
+ *(values.get_entity_values().at(j)), recursionLevel + 1); |
+ break; |
+ default: |
+ break; |
+ } |
+ } |
+ return true; |
+} |
+ |
+void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) { |
+ if (recursionLevel >= kMaxDepth) { |
+ return; |
+ } |
+ |
+ String type; |
+ val.GetString(kJSONLDKeyType, &type); |
+ if (!type) { |
+ type = "Thing"; |
+ } |
+ entity.type = type; |
+ for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) { |
+ PropertyPtr property = Property::New(); |
+ const JSONObject::Entry& entry = val.at(i); |
+ property->name = entry.first; |
+ if (property->name == kJSONLDKeyType) { |
+ continue; |
+ } |
+ property->values = Values::New(); |
+ |
+ bool addProperty = true; |
+ |
+ switch (entry.second->GetType()) { |
+ case JSONValue::ValueType::kTypeBoolean: { |
+ bool v; |
+ val.GetBoolean(entry.first, &v); |
+ property->values->set_bool_values({v}); |
+ } break; |
+ case JSONValue::ValueType::kTypeInteger: { |
+ int v; |
+ val.GetInteger(entry.first, &v); |
+ property->values->set_long_values({v}); |
+ } break; |
+ case JSONValue::ValueType::kTypeDouble: { |
+ double v; |
+ val.GetDouble(entry.first, &v); |
+ String s = String::Number(v); |
+ s.Truncate(kMaxStringLength); |
+ property->values->set_string_values({s}); |
+ } break; |
+ case JSONValue::ValueType::kTypeString: { |
+ String v; |
+ val.GetString(entry.first, &v); |
+ v.Truncate(kMaxStringLength); |
+ property->values->set_string_values({v}); |
+ } break; |
+ case JSONValue::ValueType::kTypeObject: { |
+ if (recursionLevel + 1 >= kMaxDepth) { |
+ addProperty = false; |
+ break; |
+ } |
+ property->values->set_entity_values(Vector<EntityPtr>()); |
+ property->values->get_entity_values().push_back(Entity::New()); |
+ |
+ extractEntity(*(val.GetObject(entry.first)), |
+ *(property->values->get_entity_values().at(0)), |
+ recursionLevel + 1); |
+ } break; |
+ case JSONValue::ValueType::kTypeArray: |
+ addProperty = parseRepeatedValue(*(val.GetArray(entry.first)), |
+ *(property->values), recursionLevel); |
+ break; |
+ default: |
+ break; |
+ } |
+ if (addProperty) |
+ entity.properties.push_back(std::move(property)); |
+ } |
+} |
+ |
+void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) { |
+ // Now we have a JSONObject which corresponds to a single (possibly nested) |
+ // entity. |
+ EntityPtr entity = Entity::New(); |
+ String type; |
+ val.GetString(kJSONLDKeyType, &type); |
+ if (!isWhitelistedType(AtomicString(type))) { |
+ return; |
+ } |
+ extractEntity(val, *entity, 0); |
+ entities.push_back(std::move(entity)); |
+} |
+ |
+void extractEntitiesFromArray(const JSONArray& arr, |
+ Vector<EntityPtr>& entities) { |
+ for (size_t i = 0; i < arr.size(); ++i) { |
+ const JSONValue* val = arr.at(i); |
+ if (val->GetType() == JSONValue::ValueType::kTypeObject) { |
+ extractTopLevelEntity(*(JSONObject::Cast(val)), entities); |
+ } |
+ } |
+} |
+ |
+void extractEntityFromTopLevelObject(const JSONObject& val, |
+ Vector<EntityPtr>& entities) { |
+ const JSONArray* graph = val.GetArray(kJSONLDKeyGraph); |
+ if (graph) { |
+ extractEntitiesFromArray(*graph, entities); |
+ } |
+ extractTopLevelEntity(val, entities); |
+} |
+ |
+bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) { |
for (Element& element : ElementTraversal::DescendantsOf(root)) { |
if (element.HasTagName(HTMLNames::scriptTag) && |
element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
- if (multiple) { |
- result.Append(","); |
+ std::unique_ptr<JSONValue> json = ParseJSON(element.textContent()); |
+ if (!json) { |
+ LOG(ERROR) << "Failed to parse json."; |
+ return false; |
+ } |
+ switch (json->GetType()) { |
+ case JSONValue::ValueType::kTypeArray: |
+ extractEntitiesFromArray(*(JSONArray::Cast(json.get())), entities); |
+ break; |
+ case JSONValue::ValueType::kTypeObject: |
+ extractEntityFromTopLevelObject(*(JSONObject::Cast(json.get())), |
+ entities); |
+ break; |
+ default: |
+ return false; |
} |
- result.Append(element.textContent()); |
- multiple = true; |
} |
} |
- result.Append("]"); |
- return result.ToString(); |
+ return !entities.IsEmpty(); |
} |
} // namespace |
-String CopylessPasteExtractor::Extract(Document& document) { |
+WebPagePtr CopylessPasteExtractor::extract(const Document& document) { |
TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
if (!document.GetFrame() || !document.GetFrame()->IsMainFrame()) |
- return g_empty_string; |
- |
- DCHECK(document.HasFinishedParsing()); |
+ return nullptr; |
Element* html = document.documentElement(); |
if (!html) |
- return g_empty_string; |
+ return nullptr; |
double start_time = MonotonicallyIncreasingTime(); |
+ WebPagePtr page = WebPage::New(); |
+ |
// Traverse the DOM tree and extract the metadata. |
- String result = ExtractMetadata(*html); |
+ if (!extractMetadata(*html, page->entities)) |
+ return nullptr; |
+ page->url = document.Url(); |
+ page->title = document.title(); |
double elapsed_time = MonotonicallyIncreasingTime() - start_time; |
- DEFINE_STATIC_LOCAL(CustomCountHistogram, extraction_histogram, |
+ DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
("CopylessPaste.ExtractionUs", 1, 1000000, 50)); |
- extraction_histogram.Count(static_cast<int>(1e6 * elapsed_time)); |
- return result; |
+ extractionHistogram.Count(static_cast<int>(1e6 * elapsed_time)); |
+ return page; |
} |
} // namespace blink |