Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(144)

Unified Diff: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp

Issue 2793103002: Parse JSON in Blink for CopylessPaste. (Closed)
Patch Set: Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
diff --git a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
index 6c3d28c0bdca950a58155c1dc2f00a1453193d71..8498bf884725a48122974e4bc3b388488834e6d8 100644
--- a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
+++ b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp
@@ -4,6 +4,9 @@
#include "modules/document_metadata/CopylessPasteExtractor.h"
+#include <algorithm>
+#include <memory>
+#include <utility>
#include "core/HTMLNames.h"
#include "core/dom/Document.h"
#include "core/dom/ElementTraversal.h"
@@ -11,55 +14,269 @@
#include "core/html/HTMLElement.h"
#include "platform/Histogram.h"
#include "platform/instrumentation/tracing/TraceEvent.h"
+#include "platform/json/JSONParser.h"
+#include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h"
+#include "wtf/Vector.h"
#include "wtf/text/StringBuilder.h"
namespace blink {
namespace {
-String extractMetadata(Element& root) {
- StringBuilder result;
- result.append("[");
- bool multiple = false;
+using mojom::blink::Entity;
+using mojom::blink::EntityPtr;
+using mojom::blink::Property;
+using mojom::blink::PropertyPtr;
+using mojom::blink::Values;
+using mojom::blink::ValuesPtr;
+using mojom::blink::WebPage;
+using mojom::blink::WebPagePtr;
+
+// App Indexing enforces a max nesting depth of 5. Our top level message
+// corresponds to the WebPage, so this only leaves 4 more levels.
+// TODO(dproctor): Do we want to fail parsing, or (more likely) only pass the
wychen 2017/04/04 02:07:14 Best-effort parsing sounds good.
dproctor 2017/04/04 07:13:34 Done.
+// top levels to Icing?
+constexpr int kMaxDepth = 4;
+// Some strings are very long, and we don't currently use those, so limit string
+// length to something reasonable to avoid undue pressure on Icing. Note that
+// App Indexing supports strings up to length 20k.
+constexpr int kMaxStringLength = 200;
+// Enforced by App Indexing, so stop processing early if possible.
+constexpr size_t kMaxNumFields = 20;
+// Enforced by App Indexing, so stop processing early if possible.
+constexpr size_t kMaxRepeatedSize = 100;
+
+constexpr char kJSONLDKeyType[] = "@type";
+constexpr char kJSONLDKeyGraph[] = "@graph";
+
+void extractEntity(JSONObject* val, EntityPtr* entity) {
wychen 2017/04/04 02:07:14 Sorry for the confusion. It might be better to pas
dproctor 2017/04/04 06:21:19 Done.
+ String type;
+ val->getString(kJSONLDKeyType, &type);
+ if (!type) {
+ type = "Thing";
+ }
+ (*entity)->type = type;
+ for (size_t i = 0; i < std::min(val->size(), kMaxNumFields); ++i) {
+ PropertyPtr property = Property::New();
+ JSONObject::Entry entry = val->at(i);
+ property->name = entry.first;
+ if (property->name == kJSONLDKeyType) {
+ continue;
+ }
+ property->values = Values::New();
+ JSONValue::ValueType type = entry.second->getType();
+
+ bool addProperty = true;
+
+ switch (type) {
+ case JSONValue::ValueType::TypeBoolean: {
+ bool v;
+ val->getBoolean(entry.first, &v);
+ property->values->get_bool_values().push_back(v);
wychen 2017/04/04 02:07:14 Why is this different from the rest?
dproctor 2017/04/04 06:21:19 Done.
+ } break;
+ case JSONValue::ValueType::TypeInteger: {
+ int v;
+ val->getInteger(entry.first, &v);
+ property->values->set_long_values(Vector<int64_t>(1, v));
+ } break;
+ case JSONValue::ValueType::TypeDouble: {
+ double v;
+ val->getDouble(entry.first, &v);
+ String s = String::number(v);
+ s.truncate(kMaxStringLength);
+ property->values->set_string_values(Vector<String>(1, s));
+ } break;
+ case JSONValue::ValueType::TypeString: {
+ String v;
+ val->getString(entry.first, &v);
+ v.truncate(kMaxStringLength);
+ property->values->set_string_values(Vector<String>(1, v));
+ } break;
+ case JSONValue::ValueType::TypeObject: {
+ property->values->set_entity_values(Vector<EntityPtr>());
+ property->values->get_entity_values().push_back(Entity::New());
+
+ extractEntity(val->getObject(entry.first),
+ &(property->values->get_entity_values().at(0)));
+ } break;
+ case JSONValue::ValueType::TypeArray: {
+ JSONArray* arr = val->getArray(entry.first);
+ if (arr->size() < 1) {
+ addProperty = false;
+ break;
+ }
+
+ type = arr->at(0)->getType();
+ if (type == JSONArray::ValueType::TypeArray) {
+ // App Indexing doesn't support nested arrays.
+ addProperty = false;
+ break;
+ }
+ for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) {
+ JSONValue* innerVal = arr->at(j);
+ if (innerVal->getType() != type) {
+ addProperty = false;
wychen 2017/04/04 02:07:14 Add a comment about mixed types.
dproctor 2017/04/04 06:21:19 Done.
+ break;
+ }
+ switch (innerVal->getType()) {
+ case JSONValue::ValueType::TypeBoolean: {
+ if (!property->values->is_bool_values()) {
+ property->values->set_bool_values(Vector<bool>());
+ }
+ bool v;
+ innerVal->asBoolean(&v);
+ property->values->get_bool_values().push_back(v);
+ } break;
+ case JSONValue::ValueType::TypeInteger: {
+ if (!property->values->is_long_values()) {
+ property->values->set_long_values(Vector<int64_t>());
+ }
+ int v;
+ innerVal->asInteger(&v);
+ property->values->get_long_values().push_back(v);
+ } break;
+ case JSONValue::ValueType::TypeDouble: {
+ if (!property->values->is_string_values()) {
+ property->values->set_string_values(Vector<String>());
+ }
+ double v;
+ val->getDouble(entry.first, &v);
+ String s = String::number(v);
+ s.truncate(kMaxStringLength);
+ property->values->get_string_values().push_back(s);
+ } break;
+ case JSONValue::ValueType::TypeString: {
+ if (!property->values->is_string_values()) {
+ property->values->set_string_values(Vector<String>());
+ }
+ String v;
+ innerVal->asString(&v);
+ v.truncate(kMaxStringLength);
+ property->values->get_string_values().push_back(v);
+ } break;
+ case JSONValue::ValueType::TypeObject:
+ if (!property->values->is_entity_values()) {
+ property->values->set_entity_values(Vector<EntityPtr>());
+ }
+ property->values->get_entity_values().push_back(Entity::New());
+ extractEntity(JSONObject::cast(innerVal),
+ &(property->values->get_entity_values().at(j)));
+ break;
+ default:
+ break;
+ }
+ }
+ } break;
+ default:
+ break;
+ }
+ if (addProperty)
+ (*entity)->properties.push_back(std::move(property));
+ }
+}
+
+bool isWhitelistedType(String type) {
wychen 2017/04/04 02:07:15 Would AtomicString be a premature optimization?
dproctor 2017/04/04 06:21:19 Done.
+ DEFINE_STATIC_LOCAL(HashSet<String>, elements,
+ ({// Common types that include addresses.
+ "AutoDealer", "Hotel", "LocalBusiness", "Organization",
+ "Person", "Place", "PostalAddress", "Product",
+ "Residence", "Restaurant", "SingleFamilyResidence",
+ // Common types including phone numbers
+ "Store", "ContactPoint", "LodgingBusiness"}));
+ return type && elements.contains(type);
+}
+
+void extractTopLevelEntity(JSONObject* val, Vector<EntityPtr>* entities) {
+ // Now we have a JSONObject which corresponds to a single (possibly nested)
+ // entity.
+ EntityPtr entity = Entity::New();
+ String type;
+ val->getString(kJSONLDKeyType, &type);
+ if (!isWhitelistedType(type)) {
+ return;
+ }
+ extractEntity(val, &entity);
+ entities->push_back(std::move(entity));
+}
+
+void extractEntitiesFromArray(JSONArray* arr, Vector<EntityPtr>* entities) {
+ for (size_t i = 0; i < arr->size(); ++i) {
+ JSONValue* val = arr->at(i);
+ switch (val->getType()) {
+ case JSONValue::ValueType::TypeObject:
+ extractTopLevelEntity(JSONObject::cast(val), entities);
+ break;
+ default:
+ // TODO(dproctor): :(
+ return;
+ }
+ }
+}
+
+void extractEntityFromTopLevelObject(JSONObject* val,
+ Vector<EntityPtr>* entities) {
+ JSONArray* graph = val->getArray(kJSONLDKeyGraph);
+ if (graph) {
+ extractEntitiesFromArray(graph, entities);
+ }
+ extractTopLevelEntity(val, entities);
+}
+
+bool extractMetadata(const Element& root, Vector<EntityPtr>* entities) {
wychen 2017/04/04 02:07:15 Similarly, Vector<>& might be better.
dproctor 2017/04/04 06:21:19 Done.
for (Element& element : ElementTraversal::descendantsOf(root)) {
if (element.hasTagName(HTMLNames::scriptTag) &&
element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") {
- if (multiple) {
- result.append(",");
+ std::unique_ptr<JSONValue> json =
+ parseJSON(element.textContent(), kMaxDepth);
+ if (!json.get()) {
+ LOG(ERROR) << "Failed to parse json.";
wychen 2017/04/04 02:07:14 Needs to be more specific.
dproctor 2017/04/04 06:21:19 What did you have in mind? I don't believe that th
+ return false;
+ }
+ switch (json->getType()) {
+ case JSONValue::ValueType::TypeArray:
+ extractEntitiesFromArray(JSONArray::cast(json.get()), entities);
+ break;
+ case JSONValue::ValueType::TypeObject:
+ extractEntityFromTopLevelObject(JSONObject::cast(json.get()),
+ entities);
+ break;
+ default:
+ return false;
}
- result.append(element.textContent());
- multiple = true;
}
}
- result.append("]");
- return result.toString();
+ return !entities->isEmpty();
}
} // namespace
-String CopylessPasteExtractor::extract(Document& document) {
+bool CopylessPasteExtractor::extract(const Document& document,
+ mojom::blink::WebPagePtr* page) {
wychen 2017/04/04 02:07:14 Similarly, using WebPage& here might be better.
dproctor 2017/04/04 06:21:19 Done.
TRACE_EVENT0("blink", "CopylessPasteExtractor::extract");
if (!document.frame() || !document.frame()->isMainFrame())
- return emptyString;
+ return false;
DCHECK(document.hasFinishedParsing());
Element* html = document.documentElement();
if (!html)
- return emptyString;
+ return false;
double startTime = monotonicallyIncreasingTime();
// Traverse the DOM tree and extract the metadata.
- String result = extractMetadata(*html);
+ if (!extractMetadata(*html, &(page->get()->entities)))
+ return false;
+ page->get()->url = document.url().getString();
+ page->get()->title = document.title();
double elapsedTime = monotonicallyIncreasingTime() - startTime;
DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram,
("CopylessPaste.ExtractionUs", 1, 1000000, 50));
extractionHistogram.count(static_cast<int>(1e6 * elapsedTime));
- return result;
+ return true;
}
} // namespace blink

Powered by Google App Engine
This is Rietveld 408576698