Index: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
diff --git a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
index 6c3d28c0bdca950a58155c1dc2f00a1453193d71..b3a8885b97abc20dd378761991bee23ab52b1848 100644 |
--- a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
+++ b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
@@ -4,6 +4,8 @@ |
#include "modules/document_metadata/CopylessPasteExtractor.h" |
+#include <memory> |
+#include <algorithm> |
#include "core/HTMLNames.h" |
#include "core/dom/Document.h" |
#include "core/dom/ElementTraversal.h" |
@@ -11,55 +13,329 @@ |
#include "core/html/HTMLElement.h" |
#include "platform/Histogram.h" |
#include "platform/instrumentation/tracing/TraceEvent.h" |
+#include "platform/json/JSONParser.h" |
+#include "wtf/Vector.h" |
#include "wtf/text/StringBuilder.h" |
namespace blink { |
+// TODO(dproctor): Temporary structs to hold entity return value. Replace with |
+// whatever the mojo service uses, once that is ready. |
+struct Property; |
+ |
+// A structured-data entity: the extracted members of a single JSON object. |
+struct Entity { |
+ // Filled by extractEntity(); at most one Property per member of the source |
+ // JSON object (empty arrays and nested arrays are skipped there). |
+ Vector<Property> properties; |
+}; |
+// Two entities compare equal when their property lists compare equal. |
+bool operator==(const Entity&, const Entity&); |
+// Streams a bracketed dump of every property. |
+std::ostream& operator<<(std::ostream&, const Entity&); |
+ |
+// One named member of an Entity. |
+// |
+// type selects which of the value vectors below is meaningful; operator== |
+// compares only that vector. extractEntity() stores a scalar JSON value as a |
+// single-element vector and, for a JSON array, sets type to the type of the |
+// array's first element and appends one value per array element. |
+struct Property { |
+ // Key of the member in the source JSON object. |
+ String name; |
+ // JSON type of the value(s) held by this property. |
+ JSONValue::ValueType type; |
+ |
+ // Value storage, one vector per supported JSON type. |
+ Vector<bool> boolVal; |
+ Vector<int> intVal; |
+ Vector<double> doubleVal; |
+ Vector<String> strVal; |
+ Vector<Entity> entityVal; |
+}; |
+// Compares name, type and the value vector selected by type. |
+bool operator==(const Property&, const Property&); |
+// Streams the name, the type and the values selected by type. |
+std::ostream& operator<<(std::ostream&, const Property&); |
+ |
+// Result of CopylessPasteExtractor::extract(): the document's URL and title |
+// together with the entities extracted from it. |
+struct WebPage { |
+ // Set from document.url().getString() by extract(). |
+ String url; |
+ // Set from document.title() by extract(). |
+ String title; |
+ // Entities appended by the JSON-LD extraction helpers. |
+ Vector<Entity> entities; |
+}; |
+// Compares url, title and entities. |
+bool operator==(const WebPage&, const WebPage&); |
+// Streams the url, the title and every entity. |
+std::ostream& operator<<(std::ostream&, const WebPage&); |
+ |
+// Entities are equal exactly when their property lists are equal. |
+bool operator==(const Entity& lhs, const Entity& rhs) { |
+  const Vector<Property>& left = lhs.properties; |
+  const Vector<Property>& right = rhs.properties; |
+  return left == right; |
+} |
+// Properties are equal when their names and types match and the value vector |
+// selected by that type matches. A property whose type has no value vector |
+// (anything other than boolean, integer, double, string or object) never |
+// compares equal, not even to itself. |
+bool operator==(const Property& lhs, const Property& rhs) { |
+  if (!(lhs.name == rhs.name)) |
+    return false; |
+  if (!(lhs.type == rhs.type)) |
+    return false; |
+ |
+  const JSONValue::ValueType kind = lhs.type; |
+  if (kind == JSONValue::ValueType::TypeBoolean) |
+    return lhs.boolVal == rhs.boolVal; |
+  if (kind == JSONValue::ValueType::TypeInteger) |
+    return lhs.intVal == rhs.intVal; |
+  if (kind == JSONValue::ValueType::TypeDouble) |
+    return lhs.doubleVal == rhs.doubleVal; |
+  if (kind == JSONValue::ValueType::TypeString) |
+    return lhs.strVal == rhs.strVal; |
+  if (kind == JSONValue::ValueType::TypeObject) |
+    return lhs.entityVal == rhs.entityVal; |
+  return false; |
+} |
+// Pages are equal when url, title and entity list all match. |
+bool operator==(const WebPage& lhs, const WebPage& rhs) { |
+  if (!(lhs.url == rhs.url)) |
+    return false; |
+  if (!(lhs.title == rhs.title)) |
+    return false; |
+  return lhs.entities == rhs.entities; |
+} |
+// Streams an entity as "ENTITY: [" followed by each property and "]". |
+std::ostream& operator<<(std::ostream& os, const Entity& v) { |
+  os << "ENTITY: ["; |
+  // Bind by const reference: copying a Property copies all of its value |
+  // vectors, including nested entities. |
+  for (const auto& p : v.properties) { |
+    os << p; |
+  } |
+  os << "]"; |
+  return os; |
+} |
+// Streams a property as its name, its type and the values held in the vector |
+// selected by that type. Types without a value vector print an empty list. |
+std::ostream& operator<<(std::ostream& os, const Property& v) { |
+  os << "Name: " << v.name << " TYPE : " << v.type << " VALUE: [ "; |
+  switch (v.type) { |
+    case JSONValue::ValueType::TypeBoolean: |
+      for (auto b : v.boolVal) |
+        os << b; |
+      // This break was missing, so boolean properties fell through into the |
+      // integer case and also printed intVal. |
+      break; |
+    case JSONValue::ValueType::TypeInteger: |
+      for (auto i : v.intVal) |
+        os << i; |
+      break; |
+    case JSONValue::ValueType::TypeDouble: |
+      for (auto d : v.doubleVal) |
+        os << d; |
+      break; |
+    case JSONValue::ValueType::TypeString: |
+      // Strings and entities are bound by reference to avoid per-element |
+      // copies. |
+      for (const auto& s : v.strVal) |
+        os << s; |
+      break; |
+    case JSONValue::ValueType::TypeObject: |
+      for (const auto& e : v.entityVal) |
+        os << e; |
+      break; |
+    default: |
+      break; |
+  } |
+  os << " ]"; |
+  return os; |
+} |
+// Streams a page as its url, its title and a bracketed list of its entities. |
+std::ostream& operator<<(std::ostream& os, const WebPage& v) { |
+  os << "URL: " << v.url << "TITLE: " << v.title << "ENTITIES: ["; |
+  // Bind by const reference: copying an Entity deep-copies its whole |
+  // property tree. |
+  for (const auto& e : v.entities) { |
+    os << e; |
+  } |
+  os << "]"; |
+  return os; |
+} |
+ |
namespace { |
-String extractMetadata(Element& root) { |
- StringBuilder result; |
- result.append("["); |
- bool multiple = false; |
+// App Indexing enforces a max nesting depth of 5. Our top level message |
+// corresponds to the WebPage, so this only leaves 4 more levels. |
+// TODO(dproctor): Do we want to fail parsing, or (more likely) only pass the |
+// top levels to Icing? |
+constexpr int kMaxDepth = 4; |
+// Some strings are very long, and we don't currently use those, so limit string |
+// length to something reasonable to avoid undue pressure on Icing. Note that |
+// App Indexing supports strings up to length 20k. |
+constexpr int kMaxStringLength = 200; |
+// Enforced by App Indexing, so stop processing early if possible. |
+constexpr size_t kMaxNumFields = 20; |
+// Enforced by App Indexing, so stop processing early if possible. |
+constexpr size_t kMaxRepeatedSize = 100; |
+ |
+// Keys looked up in the parsed JSON-LD objects. Of these, only kJSONLDKeyType |
+// and kJSONLDKeyGraph are referenced by the helpers visible in this change. |
+constexpr char kJSONLDKeyName[] = "name"; |
+constexpr char kJSONLDKeyType[] = "@type"; |
+constexpr char kJSONLDKeyGraph[] = "@graph"; |
+constexpr char kJSONLDKeyContext[] = "@context"; |
+ |
+// Copies the members of the JSON object val into entity->properties. |
+// |
+// - At most kMaxNumFields members are visited. |
+// - A scalar member becomes a Property holding a single value. |
+// - A nested object becomes a Property holding one recursively extracted |
+//   Entity. |
+// - An array becomes a Property typed after its first element, holding at |
+//   most kMaxRepeatedSize values. Empty arrays, arrays of arrays and arrays |
+//   whose elements do not all share the first element's type are dropped. |
+// - Every string value, scalar or array element, is truncated to |
+//   kMaxStringLength characters. |
+// - Members of any other type are added with no values. |
+void extractEntity(JSONObject* val, Entity* entity) { |
+  const size_t fieldCount = std::min(val->size(), kMaxNumFields); |
+  for (size_t i = 0; i < fieldCount; ++i) { |
+    Property property; |
+    JSONObject::Entry entry = val->at(i); |
+    property.name = entry.first; |
+    property.type = entry.second->getType(); |
+    bool addProperty = true; |
+ |
+    switch (property.type) { |
+      case JSONValue::ValueType::TypeBoolean: { |
+        bool v; |
+        val->getBoolean(entry.first, &v); |
+        property.boolVal.push_back(v); |
+      } break; |
+      case JSONValue::ValueType::TypeInteger: { |
+        int v; |
+        val->getInteger(entry.first, &v); |
+        property.intVal.push_back(v); |
+      } break; |
+      case JSONValue::ValueType::TypeDouble: { |
+        double v; |
+        val->getDouble(entry.first, &v); |
+        property.doubleVal.push_back(v); |
+      } break; |
+      case JSONValue::ValueType::TypeString: { |
+        String v; |
+        val->getString(entry.first, &v); |
+        v.truncate(kMaxStringLength); |
+        property.strVal.push_back(v); |
+      } break; |
+      case JSONValue::ValueType::TypeObject: { |
+        property.entityVal.push_back(Entity()); |
+        extractEntity(val->getObject(entry.first), &(property.entityVal.at(0))); |
+      } break; |
+      case JSONValue::ValueType::TypeArray: { |
+        JSONArray* arr = val->getArray(entry.first); |
+        if (arr->size() < 1) { |
+          addProperty = false; |
+          break; |
+        } |
+ |
+        // The property takes the type of the first element; every other |
+        // element must match it. |
+        property.type = arr->at(0)->getType(); |
+        if (property.type == JSONValue::ValueType::TypeArray) { |
+          // App Indexing doesn't support nested arrays. |
+          addProperty = false; |
+          break; |
+        } |
+        const size_t elementCount = std::min(arr->size(), kMaxRepeatedSize); |
+        for (size_t j = 0; j < elementCount; ++j) { |
+          JSONValue* innerVal = arr->at(j); |
+          if (innerVal->getType() != property.type) { |
+            // Mixed-type array: drop the whole property. |
+            addProperty = false; |
+            break; |
+          } |
+          switch (innerVal->getType()) { |
+            case JSONValue::ValueType::TypeBoolean: { |
+              bool v; |
+              innerVal->asBoolean(&v); |
+              property.boolVal.push_back(v); |
+            } break; |
+            case JSONValue::ValueType::TypeInteger: { |
+              int v; |
+              innerVal->asInteger(&v); |
+              property.intVal.push_back(v); |
+            } break; |
+            case JSONValue::ValueType::TypeDouble: { |
+              double v; |
+              innerVal->asDouble(&v); |
+              property.doubleVal.push_back(v); |
+            } break; |
+            case JSONValue::ValueType::TypeString: { |
+              String v; |
+              innerVal->asString(&v); |
+              // Apply the same length cap as for scalar strings. |
+              v.truncate(kMaxStringLength); |
+              property.strVal.push_back(v); |
+            } break; |
+            case JSONValue::ValueType::TypeObject: |
+              // Every earlier element was an object too (otherwise the type |
+              // check above would have left the loop), so the entity just |
+              // pushed sits at index j. |
+              property.entityVal.push_back(Entity()); |
+              extractEntity(JSONObject::cast(innerVal), |
+                            &(property.entityVal.at(j))); |
+              break; |
+            default: |
+              break; |
+          } |
+        } |
+      } break; |
+      default: |
+        break; |
+    } |
+    if (addProperty) |
+      entity->properties.push_back(property); |
+  } |
+} |
+ |
+// Returns true when type is a non-null string naming one of the entity types |
+// this extractor accepts. |
+bool isWhitelistedType(String type) { |
+  DEFINE_STATIC_LOCAL(HashSet<String>, elements, |
+                      ({// Common types that include addresses. |
+                        "AutoDealer", "Hotel", "LocalBusiness", "Organization", |
+                        "Person", "Place", "PostalAddress", "Product", |
+                        "Residence", "Restaurant", "SingleFamilyResidence", |
+                        // Common types including phone numbers |
+                        "Store", "ContactPoint", "LodgingBusiness"})); |
+  // A null string is rejected before it reaches the set lookup. |
+  if (type.isNull()) |
+    return false; |
+  return elements.contains(type); |
+} |
+ |
+// Treats val as a single (possibly nested) entity. If its "@type" member is a |
+// whitelisted type, extracts the entity and appends it to entities; |
+// otherwise does nothing. |
+void extractTopLevelEntity(JSONObject* val, Vector<Entity>* entities) { |
+  String type; |
+  // If the lookup fails, type stays null and the whitelist check rejects it. |
+  val->getString(kJSONLDKeyType, &type); |
+  if (isWhitelistedType(type)) { |
+    Entity entity; |
+    extractEntity(val, &entity); |
+    entities->push_back(entity); |
+  } |
+} |
+ |
+// Extracts a top-level entity from each object in arr, in order. Processing |
+// stops at the first element that is not an object; entities appended before |
+// that point are kept. |
+void extractEntitiesFromArray(JSONArray* arr, Vector<Entity>* entities) { |
+  const size_t count = arr->size(); |
+  for (size_t index = 0; index < count; ++index) { |
+    JSONValue* element = arr->at(index); |
+    if (element->getType() != JSONValue::ValueType::TypeObject) { |
+      // TODO(dproctor): :( |
+      return; |
+    } |
+    extractTopLevelEntity(JSONObject::cast(element), entities); |
+  } |
+} |
+ |
+// Handles a top-level JSON object: first extracts the entities listed under |
+// its "@graph" array, if it has one, then treats the object itself as an |
+// entity. |
+void extractEntityFromTopLevelObject(JSONObject* val, |
+                                     Vector<Entity>* entities) { |
+  if (JSONArray* graph = val->getArray(kJSONLDKeyGraph)) |
+    extractEntitiesFromArray(graph, entities); |
+  extractTopLevelEntity(val, entities); |
+} |
+ |
+// Parses every <script type="application/ld+json"> descendant of root and |
+// appends the whitelisted entities found to entities. |
+// |
+// Returns false as soon as one script fails to parse or has a top-level value |
+// that is neither an object nor an array; entities appended before that point |
+// are left in entities. Otherwise returns whether entities is non-empty. |
+bool extractMetadata(const Element& root, Vector<Entity>* entities) { |
 for (Element& element : ElementTraversal::descendantsOf(root)) { |
 if (element.hasTagName(HTMLNames::scriptTag) && |
 element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
- if (multiple) { |
- result.append(","); |
+ // kMaxDepth is presumably the deepest nesting parseJSON accepts; confirm |
+ // against platform/json/JSONParser.h. |
+ std::unique_ptr<JSONValue> json = |
+ parseJSON(element.textContent(), kMaxDepth); |
+ if (!json) { |
+ LOG(ERROR) << "Failed to parse json."; |
+ return false; |
+ } |
+ // The parsed document is deliberately not logged: it is page-supplied |
+ // content and would be written at error level on every successful parse. |
+ switch (json->getType()) { |
+ case JSONValue::ValueType::TypeArray: |
+ extractEntitiesFromArray(JSONArray::cast(json.get()), entities); |
+ break; |
+ case JSONValue::ValueType::TypeObject: |
+ extractEntityFromTopLevelObject(JSONObject::cast(json.get()), |
+ entities); |
+ break; |
+ default: |
+ return false; |
 } |
- result.append(element.textContent()); |
- multiple = true; |
 } |
 } |
- result.append("]"); |
- return result.toString(); |
+ return !entities->isEmpty(); |
 } |
} // namespace |
+// Extracts JSON-LD entities from document into page. |
+// |
+// Returns false, leaving page->url and page->title untouched, when the |
+// document has no frame, is not in the main frame, has no document element, |
+// or extractMetadata() reports failure. On success fills page->entities, |
+// page->url and page->title and returns true. |
-String CopylessPasteExtractor::extract(Document& document) { |
+bool CopylessPasteExtractor::extract(const Document& document, WebPage* page) { |
 TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
 if (!document.frame() || !document.frame()->isMainFrame()) |
- return emptyString; |
+ return false; |
 DCHECK(document.hasFinishedParsing()); |
 Element* html = document.documentElement(); |
 if (!html) |
- return emptyString; |
+ return false; |
 double startTime = monotonicallyIncreasingTime(); |
 // Traverse the DOM tree and extract the metadata. |
- String result = extractMetadata(*html); |
+ // Entities are appended directly to page->entities. A failed extraction |
+ // returns before the histogram below, so only successful runs are recorded. |
+ if (!extractMetadata(*html, &(page->entities))) |
+ return false; |
+ page->url = document.url().getString(); |
+ page->title = document.title(); |
 double elapsedTime = monotonicallyIncreasingTime() - startTime; |
+ // The 1e6 factor matches the "Us" suffix of the histogram name, assuming |
+ // monotonicallyIncreasingTime() returns seconds - TODO confirm. |
 DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
 ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); |
 extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); |
- return result; |
+ return true; |
 } |
} // namespace blink |