| Index: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
|
| diff --git a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
|
| index ed11bf5f3ced4de45468ba1ee7912cbd227070ff..c29520b6a53a516f3c0e5b1cae545ce773167aa5 100644
|
| --- a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
|
| +++ b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
|
| @@ -4,6 +4,10 @@
|
|
|
| #include "modules/document_metadata/CopylessPasteExtractor.h"
|
|
|
| +#include <algorithm>
|
| +#include <memory>
|
| +#include <utility>
|
| +
|
| #include "core/HTMLNames.h"
|
| #include "core/dom/Document.h"
|
| #include "core/dom/ElementTraversal.h"
|
| @@ -11,55 +15,291 @@
|
| #include "core/html/HTMLElement.h"
|
| #include "platform/Histogram.h"
|
| #include "platform/instrumentation/tracing/TraceEvent.h"
|
| +#include "platform/json/JSONParser.h"
|
| +#include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h"
|
| +#include "wtf/Vector.h"
|
| +#include "wtf/text/AtomicString.h"
|
| #include "wtf/text/StringBuilder.h"
|
|
|
| namespace blink {
|
|
|
| namespace {
|
|
|
| -String ExtractMetadata(Element& root) {
|
| - StringBuilder result;
|
| - result.Append("[");
|
| - bool multiple = false;
|
| +using mojom::document_metadata::blink::Entity;
|
| +using mojom::document_metadata::blink::EntityPtr;
|
| +using mojom::document_metadata::blink::Property;
|
| +using mojom::document_metadata::blink::PropertyPtr;
|
| +using mojom::document_metadata::blink::Values;
|
| +using mojom::document_metadata::blink::ValuesPtr;
|
| +using mojom::document_metadata::blink::WebPage;
|
| +using mojom::document_metadata::blink::WebPagePtr;
|
| +
|
| +// App Indexing enforces a max nesting depth of 5. Our top level message
|
| +// corresponds to the WebPage, so this only leaves 4 more levels. We will parse
|
| +// entities up to this depth, and ignore any further nesting. If an object at
|
| +// the max nesting depth has a property corresponding to an entity, that
|
| +// property will be dropped. Note that we will still parse json-ld blocks deeper
|
| +// than this, but they won't be passed to App Indexing.
|
| +constexpr int kMaxDepth = 4;
|
| +// Some strings are very long, and we don't currently use those, so limit string
|
| +// length to something reasonable to avoid undue pressure on Icing. Note that
|
| +// App Indexing supports strings up to length 20k.
|
| +constexpr int kMaxStringLength = 200;
|
| +// Upper bound on the number of entries of a JSON object that are examined when
|
| +// building an entity. Enforced by App Indexing, so stop processing early if
|
| +// possible.
|
| +constexpr size_t kMaxNumFields = 20;
|
| +// Upper bound on the number of elements of a JSON array that are converted
|
| +// into a repeated value. Enforced by App Indexing, so stop processing early if
|
| +// possible.
|
| +constexpr size_t kMaxRepeatedSize = 100;
|
| +
|
| +// Key under which a JSON-LD object stores the type name of its entity.
|
| +constexpr char kJSONLDKeyType[] = "@type";
|
| +// Key under which a top-level JSON-LD object may store an array of entities.
|
| +constexpr char kJSONLDKeyGraph[] = "@graph";
|
| +// Returns true if |type| names one of the entity types that are extracted as
|
| +// top-level entities. A null |type| is never whitelisted.
|
| +bool isWhitelistedType(AtomicString type) {
|
| +  DEFINE_STATIC_LOCAL(HashSet<AtomicString>, whitelist,
|
| +                      ({// Common types that include addresses.
|
| +                        "AutoDealer", "Hotel", "LocalBusiness", "Organization",
|
| +                        "Person", "Place", "PostalAddress", "Product",
|
| +                        "Residence", "Restaurant", "SingleFamilyResidence",
|
| +                        // Common types including phone numbers
|
| +                        "Store", "ContactPoint", "LodgingBusiness"}));
|
| +  // Reject null strings up front; only non-null names are looked up.
|
| +  if (!type)
|
| +    return false;
|
| +  return whitelist.Contains(type);
|
| +}
|
| +
|
| +// Forward declaration: extractEntity() and parseRepeatedValue() call each
|
| +// other.
|
| +void extractEntity(const JSONObject&, Entity&, int recursionLevel);
|
| +
|
| +// Converts the JSON array |arr| into the repeated field of |values| selected by
|
| +// the type of its first element. Doubles are stored as strings and strings are
|
| +// truncated to kMaxStringLength; at most kMaxRepeatedSize elements are
|
| +// converted.
|
| +//
|
| +// Returns false when the property holding |arr| should be dropped: |arr| is
|
| +// empty, the converted elements are not all of one JSON type, the elements are
|
| +// arrays or of a type with no matching field (e.g. null), or they are objects
|
| +// that would exceed kMaxDepth.
|
| +bool parseRepeatedValue(const JSONArray& arr,
|
| +                        Values& values,
|
| +                        int recursionLevel) {
|
| +  if (arr.size() < 1) {
|
| +    return false;
|
| +  }
|
| +
|
| +  const JSONValue::ValueType type = arr.at(0)->GetType();
|
| +  switch (type) {
|
| +    case JSONValue::ValueType::kTypeBoolean:
|
| +      values.set_bool_values(Vector<bool>());
|
| +      break;
|
| +    case JSONValue::ValueType::kTypeInteger:
|
| +      values.set_long_values(Vector<int64_t>());
|
| +      break;
|
| +    case JSONValue::ValueType::kTypeDouble:
|
| +      // App Indexing doesn't support double type, so just encode its decimal
|
| +      // value as a string instead.
|
| +    case JSONValue::ValueType::kTypeString:
|
| +      values.set_string_values(Vector<String>());
|
| +      break;
|
| +    case JSONValue::ValueType::kTypeObject:
|
| +      if (recursionLevel + 1 >= kMaxDepth) {
|
| +        return false;
|
| +      }
|
| +      values.set_entity_values(Vector<EntityPtr>());
|
| +      break;
|
| +    case JSONValue::ValueType::kTypeArray:
|
| +      // App Indexing doesn't support nested arrays.
|
| +      return false;
|
| +    default:
|
| +      // Null (or any other type without a matching field): no member of
|
| +      // |values| would be selected, so drop the property instead of emitting
|
| +      // one with unset values.
|
| +      return false;
|
| +  }
|
| +
|
| +  const size_t count = std::min(arr.size(), kMaxRepeatedSize);
|
| +  for (size_t j = 0; j < count; ++j) {
|
| +    const JSONValue* innerVal = arr.at(j);
|
| +    if (innerVal->GetType() != type) {
|
| +      // App Indexing doesn't support mixed types. If there are mixed
|
| +      // types in the parsed object, we will drop the property.
|
| +      return false;
|
| +    }
|
| +    switch (type) {
|
| +      case JSONValue::ValueType::kTypeBoolean: {
|
| +        bool v = false;
|
| +        innerVal->AsBoolean(&v);
|
| +        values.get_bool_values().push_back(v);
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeInteger: {
|
| +        int v = 0;
|
| +        innerVal->AsInteger(&v);
|
| +        values.get_long_values().push_back(v);
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeDouble: {
|
| +        // App Indexing doesn't support double type, so just encode its decimal
|
| +        // value as a string instead.
|
| +        double v = 0;
|
| +        innerVal->AsDouble(&v);
|
| +        String s = String::Number(v);
|
| +        s.Truncate(kMaxStringLength);
|
| +        values.get_string_values().push_back(s);
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeString: {
|
| +        String v;
|
| +        innerVal->AsString(&v);
|
| +        v.Truncate(kMaxStringLength);
|
| +        values.get_string_values().push_back(v);
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeObject:
|
| +        // One entity is appended per iteration, so index |j| is the entity
|
| +        // that was just pushed.
|
| +        values.get_entity_values().push_back(Entity::New());
|
| +        extractEntity(*(JSONObject::Cast(innerVal)),
|
| +                      *(values.get_entity_values().at(j)), recursionLevel + 1);
|
| +        break;
|
| +      default:
|
| +        // Unreachable: unsupported types were rejected above.
|
| +        break;
|
| +    }
|
| +  }
|
| +  return true;
|
| +}
|
| +
|
| +// Populates |entity| from the JSON-LD object |val|, which sits at nesting depth
|
| +// |recursionLevel| (0 for a top-level entity). Does nothing once
|
| +// |recursionLevel| reaches kMaxDepth.
|
| +//
|
| +// The entity type is taken from "@type", defaulting to "Thing". Only the first
|
| +// kMaxNumFields entries of |val| are examined. Each one other than "@type"
|
| +// becomes a property, except that nested objects beyond kMaxDepth, arrays
|
| +// rejected by parseRepeatedValue(), and values of a type with no matching
|
| +// field (e.g. null) are dropped.
|
| +void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) {
|
| +  if (recursionLevel >= kMaxDepth) {
|
| +    return;
|
| +  }
|
| +
|
| +  String type;
|
| +  val.GetString(kJSONLDKeyType, &type);
|
| +  if (!type) {
|
| +    type = "Thing";
|
| +  }
|
| +  entity.type = type;
|
| +
|
| +  const size_t numFields = std::min(val.size(), kMaxNumFields);
|
| +  for (size_t i = 0; i < numFields; ++i) {
|
| +    const JSONObject::Entry& entry = val.at(i);
|
| +    // The type was already stored in |entity.type|; it is not a property.
|
| +    if (entry.first == kJSONLDKeyType) {
|
| +      continue;
|
| +    }
|
| +    PropertyPtr property = Property::New();
|
| +    property->name = entry.first;
|
| +    property->values = Values::New();
|
| +
|
| +    bool addProperty = true;
|
| +
|
| +    switch (entry.second->GetType()) {
|
| +      case JSONValue::ValueType::kTypeBoolean: {
|
| +        bool v = false;
|
| +        val.GetBoolean(entry.first, &v);
|
| +        property->values->set_bool_values({v});
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeInteger: {
|
| +        int v = 0;
|
| +        val.GetInteger(entry.first, &v);
|
| +        property->values->set_long_values({v});
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeDouble: {
|
| +        // App Indexing doesn't support double type, so just encode its decimal
|
| +        // value as a string instead.
|
| +        double v = 0;
|
| +        val.GetDouble(entry.first, &v);
|
| +        String s = String::Number(v);
|
| +        s.Truncate(kMaxStringLength);
|
| +        property->values->set_string_values({s});
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeString: {
|
| +        String v;
|
| +        val.GetString(entry.first, &v);
|
| +        v.Truncate(kMaxStringLength);
|
| +        property->values->set_string_values({v});
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeObject: {
|
| +        // A nested entity would exceed the depth limit; drop the property.
|
| +        if (recursionLevel + 1 >= kMaxDepth) {
|
| +          addProperty = false;
|
| +          break;
|
| +        }
|
| +        property->values->set_entity_values(Vector<EntityPtr>());
|
| +        property->values->get_entity_values().push_back(Entity::New());
|
| +
|
| +        extractEntity(*(val.GetObject(entry.first)),
|
| +                      *(property->values->get_entity_values().at(0)),
|
| +                      recursionLevel + 1);
|
| +      } break;
|
| +      case JSONValue::ValueType::kTypeArray:
|
| +        addProperty = parseRepeatedValue(*(val.GetArray(entry.first)),
|
| +                                         *(property->values), recursionLevel);
|
| +        break;
|
| +      default:
|
| +        // Null (or any other type without a matching field): nothing was
|
| +        // stored in |property->values|, so don't emit the property.
|
| +        addProperty = false;
|
| +        break;
|
| +    }
|
| +    if (addProperty)
|
| +      entity.properties.push_back(std::move(property));
|
| +  }
|
| +}
|
| +
|
| +// Appends to |entities| the entity described by the JSON object |val|
|
| +// (possibly with nested entities), provided its "@type" is whitelisted.
|
| +// Objects with a missing or non-whitelisted type are ignored.
|
| +void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
|
| +  String typeName;
|
| +  val.GetString(kJSONLDKeyType, &typeName);
|
| +  if (!isWhitelistedType(AtomicString(typeName))) {
|
| +    return;
|
| +  }
|
| +
|
| +  // Top-level entities start at nesting depth 0.
|
| +  EntityPtr topLevel = Entity::New();
|
| +  extractEntity(val, *topLevel, 0);
|
| +  entities.push_back(std::move(topLevel));
|
| +}
|
| +
|
| +// Treats every JSON object in |arr| as a candidate top-level entity and
|
| +// appends the accepted ones to |entities|. Non-object elements are skipped.
|
| +void extractEntitiesFromArray(const JSONArray& arr,
|
| +                              Vector<EntityPtr>& entities) {
|
| +  for (size_t index = 0; index < arr.size(); ++index) {
|
| +    const JSONValue* element = arr.at(index);
|
| +    if (element->GetType() != JSONValue::ValueType::kTypeObject)
|
| +      continue;
|
| +    extractTopLevelEntity(*(JSONObject::Cast(element)), entities);
|
| +  }
|
| +}
|
| +
|
| +// Extracts entities from a top-level JSON-LD object: first from the elements
|
| +// of its "@graph" array, if it has one, and then from |val| itself.
|
| +void extractEntityFromTopLevelObject(const JSONObject& val,
|
| +                                     Vector<EntityPtr>& entities) {
|
| +  if (const JSONArray* graphEntries = val.GetArray(kJSONLDKeyGraph))
|
| +    extractEntitiesFromArray(*graphEntries, entities);
|
| +  extractTopLevelEntity(val, entities);
|
| +}
|
| +
|
| +// Walks the descendants of |root| and, for every <script> element whose type
|
| +// attribute is "application/ld+json", parses its text content as JSON and
|
| +// appends the entities found to |entities|.
|
| +//
|
| +// Returns false as soon as one such block fails to parse or has a top-level
|
| +// value that is neither an array nor an object (entities appended before that
|
| +// point are left in |entities|). Otherwise returns whether |entities| is
|
| +// non-empty.
|
| +bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) {
|
| for (Element& element : ElementTraversal::DescendantsOf(root)) {
|
| if (element.HasTagName(HTMLNames::scriptTag) &&
|
| element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
|
| - if (multiple) {
|
| - result.Append(",");
|
| + std::unique_ptr<JSONValue> json = ParseJSON(element.textContent());
|
| + // A single unparsable block aborts the whole extraction.
|
| + if (!json) {
|
| + LOG(ERROR) << "Failed to parse json.";
|
| + return false;
|
| + }
|
| + // A block is either an array of entities or a single top-level object.
|
| + switch (json->GetType()) {
|
| + case JSONValue::ValueType::kTypeArray:
|
| + extractEntitiesFromArray(*(JSONArray::Cast(json.get())), entities);
|
| + break;
|
| + case JSONValue::ValueType::kTypeObject:
|
| + extractEntityFromTopLevelObject(*(JSONObject::Cast(json.get())),
|
| + entities);
|
| + break;
|
| + default:
|
| + return false;
|
| }
|
| - result.Append(element.textContent());
|
| - multiple = true;
|
| }
|
| }
|
| - result.Append("]");
|
| - return result.ToString();
|
| + return !entities.IsEmpty();
|
| }
|
|
|
| } // namespace
|
|
|
| +// Builds a WebPage message (url, title and extracted entities) for |document|.
|
| +//
|
| +// Returns nullptr if |document| has no frame or is not in the main frame, if it
|
| +// has no document element, or if extractMetadata() reports failure.
|
| +//
|
| +// NOTE(review): the "CopylessPaste.ExtractionUs" sample is only recorded when
|
| +// extraction succeeds; the early nullptr returns skip it. Confirm this is
|
| +// intended.
|
| -String CopylessPasteExtractor::Extract(Document& document) {
|
| +WebPagePtr CopylessPasteExtractor::extract(const Document& document) {
|
| TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
|
|
|
| if (!document.GetFrame() || !document.GetFrame()->IsMainFrame())
|
| - return g_empty_string;
|
| -
|
| - DCHECK(document.HasFinishedParsing());
|
| + return nullptr;
|
|
|
| Element* html = document.documentElement();
|
| if (!html)
|
| - return g_empty_string;
|
| + return nullptr;
|
|
|
| double start_time = MonotonicallyIncreasingTime();
|
|
|
| + WebPagePtr page = WebPage::New();
|
| +
|
| // Traverse the DOM tree and extract the metadata.
|
| - String result = ExtractMetadata(*html);
|
| + if (!extractMetadata(*html, page->entities))
|
| + return nullptr;
|
| + page->url = document.Url();
|
| + page->title = document.title();
|
|
|
| double elapsed_time = MonotonicallyIncreasingTime() - start_time;
|
|
|
| - DEFINE_STATIC_LOCAL(CustomCountHistogram, extraction_histogram,
|
| + // The elapsed time is scaled by 1e6 before being recorded (the histogram
|
| + // name suggests microseconds).
|
| + DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
|
| ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
|
| - extraction_histogram.Count(static_cast<int>(1e6 * elapsed_time));
|
| - return result;
|
| + extractionHistogram.Count(static_cast<int>(1e6 * elapsed_time));
|
| + return page;
|
| }
|
|
|
| } // namespace blink
|
|
|