Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(706)

Unified Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2777623002: Move json-ld parsing to Blink.
Patch Set: update policy enforcement in blink, clank handling of repeated values Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
diff --git a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
index 6c3d28c0bdca950a58155c1dc2f00a1453193d71..b3a8885b97abc20dd378761991bee23ab52b1848 100644
--- a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
+++ b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
@@ -4,6 +4,8 @@
#include "modules/document_metadata/CopylessPasteExtractor.h"
+#include <memory>
+#include <algorithm>
#include "core/HTMLNames.h"
#include "core/dom/Document.h"
#include "core/dom/ElementTraversal.h"
@@ -11,55 +13,329 @@
#include "core/html/HTMLElement.h"
#include "platform/Histogram.h"
#include "platform/instrumentation/tracing/TraceEvent.h"
+#include "platform/json/JSONParser.h"
+#include "wtf/Vector.h"
#include "wtf/text/StringBuilder.h"
namespace blink {
+// TODO(dproctor): Temporary structs to hold entity return value. Replace with
+// whatever the mojo service uses, once that is ready.
+struct Property;
+
+struct Entity {  // One extracted JSON object, held as a list of its properties.
+ Vector<Property> properties;  // Appended to by extractEntity(), one entry per kept JSON member.
+};
+bool operator==(const Entity&, const Entity&);  // Compares the property lists.
+std::ostream& operator<<(std::ostream&, const Entity&);  // Human-readable dump of the entity.
+
+struct Property {  // A named JSON member plus its value(s); |type| selects which vector holds them.
+ String name;  // Member key taken from the JSON object entry.
+ JSONValue::ValueType type;  // Value type; for arrays, the type of the array's first element.
+
+ Vector<bool> boolVal;  // Values when |type| is TypeBoolean.
+ Vector<int> intVal;  // Values when |type| is TypeInteger.
+ Vector<double> doubleVal;  // Values when |type| is TypeDouble.
+ Vector<String> strVal;  // Values when |type| is TypeString.
+ Vector<Entity> entityVal;  // Nested entities when |type| is TypeObject.
+};
+bool operator==(const Property&, const Property&);  // Compares name, type and the vector selected by type.
+std::ostream& operator<<(std::ostream&, const Property&);  // Human-readable dump of the property.
+
+struct WebPage {  // Output of CopylessPasteExtractor::extract().
+ String url;  // Set from document.url().getString().
+ String title;  // Set from document.title().
+ Vector<Entity> entities;  // Entities collected from application/ld+json script elements.
+};
+bool operator==(const WebPage&, const WebPage&);  // Compares url, title and entities.
+std::ostream& operator<<(std::ostream&, const WebPage&);  // Human-readable dump of the page.
+
+// Two entities are equal when their property lists compare equal.
+bool operator==(const Entity& lhs, const Entity& rhs) {
+  const bool sameProperties = (lhs.properties == rhs.properties);
+  return sameProperties;
+}
+// Two properties are equal when their names and types match and the value
+// vector selected by the type matches. A type that has no value vector
+// (anything other than boolean, integer, double, string or object) never
+// compares equal.
+bool operator==(const Property& lhs, const Property& rhs) {
+  if (!(lhs.name == rhs.name))
+    return false;
+  if (!(lhs.type == rhs.type))
+    return false;
+
+  const JSONValue::ValueType sharedType = lhs.type;
+  if (sharedType == JSONValue::ValueType::TypeBoolean)
+    return lhs.boolVal == rhs.boolVal;
+  if (sharedType == JSONValue::ValueType::TypeInteger)
+    return lhs.intVal == rhs.intVal;
+  if (sharedType == JSONValue::ValueType::TypeDouble)
+    return lhs.doubleVal == rhs.doubleVal;
+  if (sharedType == JSONValue::ValueType::TypeString)
+    return lhs.strVal == rhs.strVal;
+  if (sharedType == JSONValue::ValueType::TypeObject)
+    return lhs.entityVal == rhs.entityVal;
+  return false;
+}
+// Two pages are equal when url, title and the entity list all match.
+bool operator==(const WebPage& lhs, const WebPage& rhs) {
+  if (!(lhs.url == rhs.url))
+    return false;
+  if (!(lhs.title == rhs.title))
+    return false;
+  return lhs.entities == rhs.entities;
+}
+// Streams |v| as "ENTITY: [" followed by each property and a closing "]".
+std::ostream& operator<<(std::ostream& os, const Entity& v) {
+  os << "ENTITY: [";
+  // Iterate by const reference: a Property owns several vectors (including
+  // nested entities), so copying one per iteration is needlessly expensive.
+  for (const auto& property : v.properties) {
+    os << property;
+  }
+  os << "]";
+  return os;
+}
+// Streams |v| as its name, its type and the values stored in the vector that
+// corresponds to |v.type|. Types without a value vector print an empty list.
+std::ostream& operator<<(std::ostream& os, const Property& v) {
+  os << "Name: " << v.name << " TYPE : " << v.type << " VALUE: [ ";
+  switch (v.type) {
+    case JSONValue::ValueType::TypeBoolean:
+      for (bool b : v.boolVal)
+        os << b;
+      // Required: without this break control falls through into the integer
+      // case and prints |intVal| for a boolean property as well.
+      break;
+    case JSONValue::ValueType::TypeInteger:
+      for (int i : v.intVal)
+        os << i;
+      break;
+    case JSONValue::ValueType::TypeDouble:
+      for (double d : v.doubleVal)
+        os << d;
+      break;
+    case JSONValue::ValueType::TypeString:
+      // By reference: avoids copying each string just to print it.
+      for (const auto& s : v.strVal)
+        os << s;
+      break;
+    case JSONValue::ValueType::TypeObject:
+      // By reference: avoids deep-copying each nested entity.
+      for (const auto& e : v.entityVal)
+        os << e;
+      break;
+    default:
+      break;
+  }
+  os << " ]";
+  return os;
+}
+// Streams |v| as its url, its title and the list of extracted entities.
+std::ostream& operator<<(std::ostream& os, const WebPage& v) {
+  os << "URL: " << v.url << "TITLE: " << v.title << "ENTITIES: [";
+  // Iterate by const reference: copying an Entity deep-copies all of its
+  // properties, which is wasted work for a read-only dump.
+  for (const auto& entity : v.entities) {
+    os << entity;
+  }
+  os << "]";
+  return os;
+}
+
namespace {
-String extractMetadata(Element& root) {
- StringBuilder result;
- result.append("[");
- bool multiple = false;
+// App Indexing enforces a max nesting depth of 5. Our top level message
+// corresponds to the WebPage, so this only leaves 4 more levels.
+// TODO(dproctor): Do we want to fail parsing, or (more likely) only pass the
+// top levels to Icing?
+constexpr int kMaxDepth = 4;  // Passed to parseJSON() as its second argument.
+// Some strings are very long, and we don't currently use those, so limit string
+// length to something reasonable to avoid undue pressure on Icing. Note that
+// App Indexing supports strings up to length 20k.
+constexpr int kMaxStringLength = 200;  // Applied with String::truncate() in extractEntity().
+// Enforced by App Indexing, so stop processing early if possible.
+constexpr size_t kMaxNumFields = 20;  // Upper bound on members read from one JSON object.
+// Enforced by App Indexing, so stop processing early if possible.
+constexpr size_t kMaxRepeatedSize = 100;  // Upper bound on elements read from one JSON array.
+
+constexpr char kJSONLDKeyName[] = "name";  // NOTE(review): not referenced in the code visible here.
+constexpr char kJSONLDKeyType[] = "@type";  // Looked up by extractTopLevelEntity().
+constexpr char kJSONLDKeyGraph[] = "@graph";  // Looked up by extractEntityFromTopLevelObject().
+constexpr char kJSONLDKeyContext[] = "@context";  // NOTE(review): not referenced in the code visible here.
+
+// Declared up front because appendPropertyValue() recurses into nested objects.
+void extractEntity(JSONObject* val, Entity* entity);
+
+// Appends |value| to the vector of |property| that matches |value|'s own type.
+// Strings are truncated to kMaxStringLength; objects are extracted recursively
+// into a new nested Entity. Values of any other type (e.g. null or array) are
+// ignored.
+void appendPropertyValue(JSONValue* value, Property* property) {
+  switch (value->getType()) {
+    case JSONValue::ValueType::TypeBoolean: {
+      bool v = false;
+      value->asBoolean(&v);
+      property->boolVal.push_back(v);
+    } break;
+    case JSONValue::ValueType::TypeInteger: {
+      int v = 0;
+      value->asInteger(&v);
+      property->intVal.push_back(v);
+    } break;
+    case JSONValue::ValueType::TypeDouble: {
+      double v = 0;
+      value->asDouble(&v);
+      property->doubleVal.push_back(v);
+    } break;
+    case JSONValue::ValueType::TypeString: {
+      String v;
+      value->asString(&v);
+      // Apply the length limit to every string, whether it is a scalar member
+      // or an element of an array.
+      v.truncate(kMaxStringLength);
+      property->strVal.push_back(v);
+    } break;
+    case JSONValue::ValueType::TypeObject: {
+      property->entityVal.push_back(Entity());
+      // Fill the element that was just appended, independent of any loop
+      // index held by the caller.
+      extractEntity(JSONObject::cast(value),
+                    &(property->entityVal.at(property->entityVal.size() - 1)));
+    } break;
+    default:
+      break;
+  }
+}
+
+// Converts the members of |val| into properties appended to |entity|.
+//
+// At most kMaxNumFields members are read. A scalar or object member becomes a
+// property with a single value. An array member becomes a repeated property
+// whose type is that of its first element, reading at most kMaxRepeatedSize
+// elements; the property is dropped entirely when the array is empty, nested
+// (first element is itself an array), or contains an element whose type
+// differs from the first element's.
+void extractEntity(JSONObject* val, Entity* entity) {
+  const size_t fieldCount = std::min(val->size(), kMaxNumFields);
+  for (size_t i = 0; i < fieldCount; ++i) {
+    JSONObject::Entry entry = val->at(i);
+    Property property;
+    property.name = entry.first;
+    property.type = entry.second->getType();
+
+    if (property.type != JSONValue::ValueType::TypeArray) {
+      appendPropertyValue(entry.second, &property);
+      entity->properties.push_back(property);
+      continue;
+    }
+
+    JSONArray* arr = val->getArray(entry.first);
+    if (arr->size() < 1)
+      continue;
+
+    // A repeated property takes the type of its first element.
+    property.type = arr->at(0)->getType();
+    if (property.type == JSONValue::ValueType::TypeArray) {
+      // App Indexing doesn't support nested arrays.
+      continue;
+    }
+
+    bool homogeneous = true;
+    const size_t elementCount = std::min(arr->size(), kMaxRepeatedSize);
+    for (size_t j = 0; j < elementCount; ++j) {
+      JSONValue* innerVal = arr->at(j);
+      if (innerVal->getType() != property.type) {
+        // Mixed-type arrays cannot be represented; drop the whole property.
+        homogeneous = false;
+        break;
+      }
+      appendPropertyValue(innerVal, &property);
+    }
+    if (homogeneous)
+      entity->properties.push_back(property);
+  }
+}
+
+// Returns true when |type| is non-null and names one of the schema types this
+// extractor accepts.
+bool isWhitelistedType(String type) {
+  DEFINE_STATIC_LOCAL(HashSet<String>, elements,
+                      ({// Common types that include addresses.
+                        "AutoDealer", "Hotel", "LocalBusiness", "Organization",
+                        "Person", "Place", "PostalAddress", "Product",
+                        "Residence", "Restaurant", "SingleFamilyResidence",
+                        // Common types including phone numbers
+                        "Store", "ContactPoint", "LodgingBusiness"}));
+  // Check for a null string first so that contains() is never called with it.
+  if (!type)
+    return false;
+  return elements.contains(type);
+}
+
+// Treats |val| as a single (possibly nested) entity. If its "@type" member is
+// a whitelisted type, the entity is extracted and appended to |entities|;
+// otherwise |entities| is left untouched.
+void extractTopLevelEntity(JSONObject* val, Vector<Entity>* entities) {
+  String type;
+  val->getString(kJSONLDKeyType, &type);
+  if (isWhitelistedType(type)) {
+    Entity entity;
+    extractEntity(val, &entity);
+    entities->push_back(entity);
+  }
+}
+
+// Runs extractTopLevelEntity() on each object in |arr|, in order. Processing
+// stops at the first element that is not an object; elements after it are not
+// visited.
+void extractEntitiesFromArray(JSONArray* arr, Vector<Entity>* entities) {
+  const size_t count = arr->size();
+  for (size_t index = 0; index < count; ++index) {
+    JSONValue* element = arr->at(index);
+    if (element->getType() != JSONValue::ValueType::TypeObject) {
+      // TODO(dproctor): :(
+      return;
+    }
+    extractTopLevelEntity(JSONObject::cast(element), entities);
+  }
+}
+
+// Handles a top-level JSON object: entities listed under its "@graph" array
+// (if present) are extracted first, then the object itself is considered as
+// an entity.
+void extractEntityFromTopLevelObject(JSONObject* val,
+                                     Vector<Entity>* entities) {
+  if (JSONArray* graphEntries = val->getArray(kJSONLDKeyGraph))
+    extractEntitiesFromArray(graphEntries, entities);
+  extractTopLevelEntity(val, entities);
+}
+
+bool extractMetadata(const Element& root, Vector<Entity>* entities) {  // Returns false on parse failure or unsupported top-level JSON type; otherwise true iff |entities| is non-empty.
for (Element& element : ElementTraversal::descendantsOf(root)) {
if (element.hasTagName(HTMLNames::scriptTag) &&
element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
- if (multiple) {
- result.append(",");
+ std::unique_ptr<JSONValue> json =
+ parseJSON(element.textContent(), kMaxDepth);
+ if (!json.get()) {
+ LOG(ERROR) << "Failed to parse json.";
+ return false;  // NOTE(review): one unparsable script aborts the traversal; entities already appended stay in |entities|.
+ }
+ LOG(ERROR) << "PARSED JSON: " << json->toPrettyJSONString();  // NOTE(review): looks like debugging output at ERROR level; confirm before landing.
+ switch (json->getType()) {
+ case JSONValue::ValueType::TypeArray:
+ extractEntitiesFromArray(JSONArray::cast(json.get()), entities);
+ break;
+ case JSONValue::ValueType::TypeObject:
+ extractEntityFromTopLevelObject(JSONObject::cast(json.get()),
+ entities);
+ break;
+ default:
+ return false;  // Top-level JSON that is neither an array nor an object also aborts the traversal.
}
- result.append(element.textContent());
- multiple = true;
}
}
- result.append("]");
- return result.toString();
+ return !entities->isEmpty();
}
} // namespace
-String CopylessPasteExtractor::extract(Document& document) {
+bool CopylessPasteExtractor::extract(const Document& document, WebPage* page) {  // Fills |page| from |document|; returns true only when extractMetadata() succeeded.
TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
if (!document.frame() || !document.frame()->isMainFrame())
- return emptyString;
+ return false;  // Only documents attached to a main frame are processed.
DCHECK(document.hasFinishedParsing());
Element* html = document.documentElement();
if (!html)
- return emptyString;
+ return false;  // No document element, so nothing to traverse.
double startTime = monotonicallyIncreasingTime();
// Traverse the DOM tree and extract the metadata.
- String result = extractMetadata(*html);
+ if (!extractMetadata(*html, &(page->entities)))
+ return false;  // NOTE(review): url/title stay unset, no histogram sample is recorded, and |page->entities| may already hold entries.
+ page->url = document.url().getString();
+ page->title = document.title();
double elapsedTime = monotonicallyIncreasingTime() - startTime;
DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
("CopylessPaste.ExtractionUs", 1, 1000000, 50));
extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
- return result;
+ return true;
}
} // namespace blink

Powered by Google App Engine
This is Rietveld 408576698