| Index: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
|
| diff --git a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
|
| index 6c3d28c0bdca950a58155c1dc2f00a1453193d71..b3a8885b97abc20dd378761991bee23ab52b1848 100644
|
| --- a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
|
| +++ b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
|
| @@ -4,6 +4,8 @@
|
|
|
| #include "modules/document_metadata/CopylessPasteExtractor.h"
|
|
|
| +#include <memory>
|
| +#include <algorithm>
|
| #include "core/HTMLNames.h"
|
| #include "core/dom/Document.h"
|
| #include "core/dom/ElementTraversal.h"
|
| @@ -11,55 +13,329 @@
|
| #include "core/html/HTMLElement.h"
|
| #include "platform/Histogram.h"
|
| #include "platform/instrumentation/tracing/TraceEvent.h"
|
| +#include "platform/json/JSONParser.h"
|
| +#include "wtf/Vector.h"
|
| #include "wtf/text/StringBuilder.h"
|
|
|
| namespace blink {
|
|
|
| +// TODO(dproctor): Temporary structs to hold entity return value. Replace with
|
| +// whatever the mojo service uses, once that is ready.
|
| +struct Property;
|
| +
|
| +// One structured-data entity: the list of properties that extractEntity()
|
| +// copied out of a single JSON object.
|
| +struct Entity {
|
| + Vector<Property> properties;
|
| +};
|
| +bool operator==(const Entity&, const Entity&);
|
| +std::ostream& operator<<(std::ostream&, const Entity&);
|
| +
|
| +// A named property of an Entity. |type| selects which of the value lists
|
| +// below is meaningful; extractEntity() fills only that list.
|
| +struct Property {
|
| + // Key of the JSON object entry this property was read from.
|
| + String name;
|
| + // JSON type of the value. For array-valued entries extractEntity() stores
|
| + // the type of the array's first element here, not TypeArray.
|
| + JSONValue::ValueType type;
|
| +
|
| + // Value lists: a single element for a scalar entry, one element per array
|
| + // item for an array entry.
|
| + Vector<bool> boolVal;
|
| + Vector<int> intVal;
|
| + Vector<double> doubleVal;
|
| + Vector<String> strVal;
|
| + Vector<Entity> entityVal;
|
| +};
|
| +bool operator==(const Property&, const Property&);
|
| +std::ostream& operator<<(std::ostream&, const Property&);
|
| +
|
| +// Result of CopylessPasteExtractor::extract(): the document's URL and title
|
| +// plus the entities collected from the page.
|
| +struct WebPage {
|
| + String url;
|
| + String title;
|
| + Vector<Entity> entities;
|
| +};
|
| +bool operator==(const WebPage&, const WebPage&);
|
| +std::ostream& operator<<(std::ostream&, const WebPage&);
|
| +
|
| +// Two entities are equal when their property lists compare equal.
|
| +bool operator==(const Entity& lhs, const Entity& rhs) {
|
| +  const bool sameProperties = (lhs.properties == rhs.properties);
|
| +  return sameProperties;
|
| +}
|
| +// Two properties are equal when their names and types match and the value
|
| +// list selected by that type compares equal.
|
| +bool operator==(const Property& lhs, const Property& rhs) {
|
| +  if (!(lhs.name == rhs.name && lhs.type == rhs.type)) {
|
| +    return false;
|
| +  }
|
| +  switch (lhs.type) {
|
| +    case JSONValue::ValueType::TypeBoolean:
|
| +      return lhs.boolVal == rhs.boolVal;
|
| +    case JSONValue::ValueType::TypeInteger:
|
| +      return lhs.intVal == rhs.intVal;
|
| +    case JSONValue::ValueType::TypeDouble:
|
| +      return lhs.doubleVal == rhs.doubleVal;
|
| +    case JSONValue::ValueType::TypeString:
|
| +      return lhs.strVal == rhs.strVal;
|
| +    case JSONValue::ValueType::TypeObject:
|
| +      return lhs.entityVal == rhs.entityVal;
|
| +    default:
|
| +      // extractEntity() also stores properties whose type has no dedicated
|
| +      // value list (all lists stay empty). Returning false here made such a
|
| +      // property unequal to itself, so compare every list instead; this keeps
|
| +      // operator== reflexive without guessing which list matters.
|
| +      return lhs.boolVal == rhs.boolVal && lhs.intVal == rhs.intVal &&
|
| +             lhs.doubleVal == rhs.doubleVal && lhs.strVal == rhs.strVal &&
|
| +             lhs.entityVal == rhs.entityVal;
|
| +  }
|
| +}
|
| +// Two pages are equal when URL, title and entity list all match. The checks
|
| +// run in that order and stop at the first mismatch.
|
| +bool operator==(const WebPage& lhs, const WebPage& rhs) {
|
| +  if (!(lhs.url == rhs.url))
|
| +    return false;
|
| +  if (!(lhs.title == rhs.title))
|
| +    return false;
|
| +  return lhs.entities == rhs.entities;
|
| +}
|
| +// Streams a human-readable dump of |v|: every property, in order, between
|
| +// "ENTITY: [" and "]".
|
| +std::ostream& operator<<(std::ostream& os, const Entity& v) {
|
| +  os << "ENTITY: [";
|
| +  // Iterate by reference: a Property owns several vectors (possibly nested
|
| +  // entities), so copying each one just to print it is wasted work.
|
| +  for (const auto& p : v.properties) {
|
| +    os << p;
|
| +  }
|
| +  os << "]";
|
| +  return os;
|
| +}
|
| +// Streams a human-readable dump of |v|: its name, its type tag and the values
|
| +// held in the list selected by that type. Types without a value list print an
|
| +// empty "[  ]".
|
| +std::ostream& operator<<(std::ostream& os, const Property& v) {
|
| +  os << "Name: " << v.name << " TYPE : " << v.type << " VALUE: [ ";
|
| +  switch (v.type) {
|
| +    case JSONValue::ValueType::TypeBoolean:
|
| +      for (auto b : v.boolVal)
|
| +        os << b;
|
| +      // This break was missing, so the boolean case fell through and also
|
| +      // printed the integer list.
|
| +      break;
|
| +    case JSONValue::ValueType::TypeInteger:
|
| +      for (auto i : v.intVal)
|
| +        os << i;
|
| +      break;
|
| +    case JSONValue::ValueType::TypeDouble:
|
| +      for (auto d : v.doubleVal)
|
| +        os << d;
|
| +      break;
|
| +    case JSONValue::ValueType::TypeString:
|
| +      // By reference: avoids copying each String just to print it.
|
| +      for (const auto& s : v.strVal)
|
| +        os << s;
|
| +      break;
|
| +    case JSONValue::ValueType::TypeObject:
|
| +      // By reference: avoids deep-copying each nested Entity.
|
| +      for (const auto& e : v.entityVal)
|
| +        os << e;
|
| +      break;
|
| +    default:
|
| +      break;
|
| +  }
|
| +  os << " ]";
|
| +  return os;
|
| +}
|
| +// Streams a human-readable dump of |v|: URL, title and every entity in order.
|
| +std::ostream& operator<<(std::ostream& os, const WebPage& v) {
|
| +  os << "URL: " << v.url << "TITLE: " << v.title << "ENTITIES: [";
|
| +  // Iterate by reference so that printing does not deep-copy each entity tree.
|
| +  for (const auto& e : v.entities) {
|
| +    os << e;
|
| +  }
|
| +  os << "]";
|
| +  return os;
|
| +}
|
| +
|
| namespace {
|
|
|
| -String extractMetadata(Element& root) {
|
| - StringBuilder result;
|
| - result.append("[");
|
| - bool multiple = false;
|
| +// App Indexing enforces a max nesting depth of 5. Our top level message
|
| +// corresponds to the WebPage, so this only leaves 4 more levels. This value is
|
| +// passed to parseJSON() as its depth limit.
|
| +// TODO(dproctor): Do we want to fail parsing, or (more likely) only pass the
|
| +// top levels to Icing?
|
| +constexpr int kMaxDepth = 4;
|
| +// Some strings are very long, and we don't currently use those, so limit string
|
| +// length to something reasonable to avoid undue pressure on Icing. Note that
|
| +// App Indexing supports strings up to length 20k.
|
| +constexpr int kMaxStringLength = 200;
|
| +// Maximum number of object entries read per entity. Enforced by App Indexing,
|
| +// so stop processing early if possible.
|
| +constexpr size_t kMaxNumFields = 20;
|
| +// Maximum number of array elements read per property. Enforced by App
|
| +// Indexing, so stop processing early if possible.
|
| +constexpr size_t kMaxRepeatedSize = 100;
|
| +
|
| +// Keys looked up in parsed JSON-LD objects.
|
| +// NOTE(review): kJSONLDKeyName and kJSONLDKeyContext are not referenced in the
|
| +// code visible in this change - confirm they are needed.
|
| +constexpr char kJSONLDKeyName[] = "name";
|
| +constexpr char kJSONLDKeyType[] = "@type";
|
| +constexpr char kJSONLDKeyGraph[] = "@graph";
|
| +constexpr char kJSONLDKeyContext[] = "@context";
|
| +
|
| +// Copies the entries of the JSON object |val| into |entity| as typed
|
| +// properties, recursing into nested objects.
|
| +//
|
| +// - At most kMaxNumFields entries of |val| are examined.
|
| +// - Scalars become a single-element value list; strings are truncated to
|
| +//   kMaxStringLength.
|
| +// - Arrays become a multi-element value list typed after their first element
|
| +//   (at most kMaxRepeatedSize elements). Empty arrays, nested arrays and
|
| +//   arrays with mixed element types are dropped.
|
| +// - Entries of any other type are kept with their name and type but no values.
|
| +void extractEntity(JSONObject* val, Entity* entity) {
|
| +  const size_t numFields = std::min(val->size(), kMaxNumFields);
|
| +  for (size_t i = 0; i < numFields; ++i) {
|
| +    Property property;
|
| +    JSONObject::Entry entry = val->at(i);
|
| +    property.name = entry.first;
|
| +    property.type = entry.second->getType();
|
| +    bool addProperty = true;
|
| +
|
| +    switch (property.type) {
|
| +      case JSONValue::ValueType::TypeBoolean: {
|
| +        bool v = false;
|
| +        val->getBoolean(entry.first, &v);
|
| +        property.boolVal.push_back(v);
|
| +      } break;
|
| +      case JSONValue::ValueType::TypeInteger: {
|
| +        int v = 0;
|
| +        val->getInteger(entry.first, &v);
|
| +        property.intVal.push_back(v);
|
| +      } break;
|
| +      case JSONValue::ValueType::TypeDouble: {
|
| +        double v = 0;
|
| +        val->getDouble(entry.first, &v);
|
| +        property.doubleVal.push_back(v);
|
| +      } break;
|
| +      case JSONValue::ValueType::TypeString: {
|
| +        String v;
|
| +        val->getString(entry.first, &v);
|
| +        v.truncate(kMaxStringLength);
|
| +        property.strVal.push_back(v);
|
| +      } break;
|
| +      case JSONValue::ValueType::TypeObject: {
|
| +        property.entityVal.push_back(Entity());
|
| +        extractEntity(val->getObject(entry.first), &(property.entityVal.at(0)));
|
| +      } break;
|
| +      case JSONValue::ValueType::TypeArray: {
|
| +        JSONArray* arr = val->getArray(entry.first);
|
| +        if (arr->size() < 1) {
|
| +          addProperty = false;
|
| +          break;
|
| +        }
|
| +
|
| +        // The property takes the type of the first element; every other
|
| +        // element must match it.
|
| +        property.type = arr->at(0)->getType();
|
| +        if (property.type == JSONArray::ValueType::TypeArray) {
|
| +          // App Indexing doesn't support nested arrays.
|
| +          addProperty = false;
|
| +          break;
|
| +        }
|
| +        const size_t numElements = std::min(arr->size(), kMaxRepeatedSize);
|
| +        for (size_t j = 0; j < numElements; ++j) {
|
| +          JSONValue* innerVal = arr->at(j);
|
| +          if (innerVal->getType() != property.type) {
|
| +            // Mixed-type array: drop the whole property.
|
| +            addProperty = false;
|
| +            break;
|
| +          }
|
| +          switch (innerVal->getType()) {
|
| +            case JSONValue::ValueType::TypeBoolean: {
|
| +              bool v = false;
|
| +              innerVal->asBoolean(&v);
|
| +              property.boolVal.push_back(v);
|
| +            } break;
|
| +            case JSONValue::ValueType::TypeInteger: {
|
| +              int v = 0;
|
| +              innerVal->asInteger(&v);
|
| +              property.intVal.push_back(v);
|
| +            } break;
|
| +            case JSONValue::ValueType::TypeDouble: {
|
| +              double v = 0;
|
| +              innerVal->asDouble(&v);
|
| +              property.doubleVal.push_back(v);
|
| +            } break;
|
| +            case JSONValue::ValueType::TypeString: {
|
| +              String v;
|
| +              innerVal->asString(&v);
|
| +              // Apply the same length cap as for scalar strings; array
|
| +              // elements previously bypassed it.
|
| +              v.truncate(kMaxStringLength);
|
| +              property.strVal.push_back(v);
|
| +            } break;
|
| +            case JSONValue::ValueType::TypeObject:
|
| +              // Every earlier element was an object too, so the entity just
|
| +              // appended sits at index j.
|
| +              property.entityVal.push_back(Entity());
|
| +              extractEntity(JSONObject::cast(innerVal),
|
| +                            &(property.entityVal.at(j)));
|
| +              break;
|
| +            default:
|
| +              break;
|
| +          }
|
| +        }
|
| +      } break;
|
| +      default:
|
| +        break;
|
| +    }
|
| +    if (addProperty)
|
| +      entity->properties.push_back(property);
|
| +  }
|
| +}
|
| +
|
| +// Returns true when |type| is one of the entity type names we extract.
|
| +bool isWhitelistedType(String type) {
|
| +  DEFINE_STATIC_LOCAL(
|
| +      HashSet<String>, elements,
|
| +      ({// Common types that include addresses.
|
| +        "AutoDealer", "Hotel", "LocalBusiness", "Organization", "Person",
|
| +        "Place", "PostalAddress", "Product", "Residence", "Restaurant",
|
| +        "SingleFamilyResidence",
|
| +        // Common types including phone numbers
|
| +        "Store", "ContactPoint", "LodgingBusiness"}));
|
| +  // Check the string itself before it is used as a lookup key.
|
| +  if (!type)
|
| +    return false;
|
| +  return elements.contains(type);
|
| +}
|
| +
|
| +// |val| is a JSONObject describing a single (possibly nested) entity. If its
|
| +// "@type" string is whitelisted, extracts it and appends it to |entities|;
|
| +// otherwise does nothing.
|
| +void extractTopLevelEntity(JSONObject* val, Vector<Entity>* entities) {
|
| +  String type;
|
| +  val->getString(kJSONLDKeyType, &type);
|
| +  if (isWhitelistedType(type)) {
|
| +    Entity entity;
|
| +    extractEntity(val, &entity);
|
| +    entities->push_back(entity);
|
| +  }
|
| +}
|
| +
|
| +// Treats each element of |arr| as a top-level entity and appends the
|
| +// whitelisted ones to |entities|. Processing stops at the first element that
|
| +// is not a JSON object; the remaining elements are ignored.
|
| +void extractEntitiesFromArray(JSONArray* arr, Vector<Entity>* entities) {
|
| +  for (size_t index = 0; index < arr->size(); ++index) {
|
| +    JSONValue* element = arr->at(index);
|
| +    if (element->getType() != JSONValue::ValueType::TypeObject) {
|
| +      // TODO(dproctor): :(
|
| +      return;
|
| +    }
|
| +    extractTopLevelEntity(JSONObject::cast(element), entities);
|
| +  }
|
| +}
|
| +
|
| +// Handles a top-level JSON object: first the entities listed under its
|
| +// "@graph" array (when present), then the object itself as an entity.
|
| +void extractEntityFromTopLevelObject(JSONObject* val,
|
| +                                     Vector<Entity>* entities) {
|
| +  if (JSONArray* graph = val->getArray(kJSONLDKeyGraph))
|
| +    extractEntitiesFromArray(graph, entities);
|
| +  extractTopLevelEntity(val, entities);
|
| +}
|
| +
|
| +// Parses every <script type="application/ld+json"> descendant of |root| and
|
| +// appends the whitelisted entities it describes to |entities|. Returns false
|
| +// as soon as one script fails to parse or its top-level value is neither an
|
| +// array nor an object (entities appended before that point stay in
|
| +// |entities|); otherwise returns whether |entities| is non-empty.
|
| +bool extractMetadata(const Element& root, Vector<Entity>* entities) {
|
| for (Element& element : ElementTraversal::descendantsOf(root)) {
|
| if (element.hasTagName(HTMLNames::scriptTag) &&
|
| element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
|
| - if (multiple) {
|
| - result.append(",");
|
| + std::unique_ptr<JSONValue> json =
|
| + parseJSON(element.textContent(), kMaxDepth);
|
| + if (!json.get()) {
|
| + LOG(ERROR) << "Failed to parse json.";
|
| + return false;
|
| + }
|
| + // Debug-only dump: the parsed page content is not an error condition and
|
| + // must not be written at ERROR severity for every matching script.
|
| + DVLOG(1) << "PARSED JSON: " << json->toPrettyJSONString();
|
| + switch (json->getType()) {
|
| + case JSONValue::ValueType::TypeArray:
|
| + extractEntitiesFromArray(JSONArray::cast(json.get()), entities);
|
| + break;
|
| + case JSONValue::ValueType::TypeObject:
|
| + extractEntityFromTopLevelObject(JSONObject::cast(json.get()),
|
| + entities);
|
| + break;
|
| + default:
|
| + return false;
|
| }
|
| - result.append(element.textContent());
|
| - multiple = true;
|
| }
|
| }
|
| - result.append("]");
|
| - return result.toString();
|
| + return !entities->isEmpty();
|
| }
|
|
|
| } // namespace
|
|
|
| +// Extracts structured metadata from |document| into |page|.
|
| +//
|
| +// Returns false without touching page->url / page->title when the document
|
| +// has no frame, is not in the main frame, has no document element, or
|
| +// extractMetadata() returns false. On success fills page->entities, page->url
|
| +// and page->title, records the elapsed time (in microseconds) in the
|
| +// "CopylessPaste.ExtractionUs" histogram and returns true.
|
| +// NOTE(review): on the extractMetadata() failure path page->entities may
|
| +// already hold entities from earlier scripts, and the histogram is not
|
| +// recorded - confirm both are intended.
|
| -String CopylessPasteExtractor::extract(Document& document) {
|
| +bool CopylessPasteExtractor::extract(const Document& document, WebPage* page) {
|
| TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
|
|
|
| if (!document.frame() || !document.frame()->isMainFrame())
|
| - return emptyString;
|
| + return false;
|
|
|
| DCHECK(document.hasFinishedParsing());
|
|
|
| Element* html = document.documentElement();
|
| if (!html)
|
| - return emptyString;
|
| + return false;
|
|
|
| double startTime = monotonicallyIncreasingTime();
|
|
|
| // Traverse the DOM tree and extract the metadata.
|
| - String result = extractMetadata(*html);
|
| + if (!extractMetadata(*html, &(page->entities)))
|
| + return false;
|
| + page->url = document.url().getString();
|
| + page->title = document.title();
|
|
|
| double elapsedTime = monotonicallyIncreasingTime() - startTime;
|
|
|
| DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
|
| ("CopylessPaste.ExtractionUs", 1, 1000000, 50));
|
| extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
|
| - return result;
|
| + return true;
|
| }
|
|
|
| } // namespace blink
|
|
|