Chromium Code Reviews| Index: third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
| diff --git a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
| index 6c3d28c0bdca950a58155c1dc2f00a1453193d71..9db9d36c61f66750f28834f355cb2710206b1373 100644 |
| --- a/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
| +++ b/third_party/WebKit/Source/modules/document_metadata/CopylessPasteExtractor.cpp |
| @@ -4,6 +4,9 @@ |
| #include "modules/document_metadata/CopylessPasteExtractor.h" |
| +#include <algorithm> |
| +#include <memory> |
| +#include <utility> |
| #include "core/HTMLNames.h" |
| #include "core/dom/Document.h" |
| #include "core/dom/ElementTraversal.h" |
| @@ -11,55 +14,282 @@ |
| #include "core/html/HTMLElement.h" |
| #include "platform/Histogram.h" |
| #include "platform/instrumentation/tracing/TraceEvent.h" |
| +#include "platform/json/JSONParser.h" |
| +#include "public/platform/modules/document_metadata/copyless_paste.mojom-blink.h" |
| +#include "wtf/Vector.h" |
| +#include "wtf/text/AtomicString.h" |
| #include "wtf/text/StringBuilder.h" |
| namespace blink { |
| namespace { |
| -String extractMetadata(Element& root) { |
| - StringBuilder result; |
| - result.append("["); |
| - bool multiple = false; |
| +using mojom::blink::Entity; |
| +using mojom::blink::EntityPtr; |
| +using mojom::blink::Property; |
| +using mojom::blink::PropertyPtr; |
| +using mojom::blink::Values; |
| +using mojom::blink::ValuesPtr; |
| +using mojom::blink::WebPage; |
| +using mojom::blink::WebPagePtr; |
| + |
| +// App Indexing enforces a max nesting depth of 5. Our top level message |
| +// corresponds to the WebPage, so this only leaves 4 more levels. We will parse |
| +// entites up to this depth, and ignore any further nesting. If an object at the |
| +// max nesting depth has a property corresponding to an entity, that property |
| +// will be dropped. Note that we will still parse json-ld blocks deeper than |
| +// this, but it won't be passed to App Indexing. |
| +constexpr int kMaxDepth = 4; |
| +// Some strings are very long, and we don't currently use those, so limit string |
| +// length to something reasonable to avoid undue pressure on Icing. Note that |
| +// App Indexing supports strings up to length 20k. |
| +constexpr int kMaxStringLength = 200; |
| +// Enforced by App Indexing, so stop processing early if possible. |
| +constexpr size_t kMaxNumFields = 20; |
| +// Enforced by App Indexing, so stop processing early if possible. |
| +constexpr size_t kMaxRepeatedSize = 100; |
| + |
| +constexpr char kJSONLDKeyType[] = "@type"; |
| +constexpr char kJSONLDKeyGraph[] = "@graph"; |
| + |
| +void extractEntity(const JSONObject& val, Entity& entity, int recursionLevel) { |
|
wychen
2017/04/04 18:40:19
This function feels a bit long.
dproctor
2017/04/04 20:44:05
Split out the nested switch statement.
|
| + if (recursionLevel >= kMaxDepth) { |
| + return; |
| + } |
| + |
| + String type; |
| + val.getString(kJSONLDKeyType, &type); |
| + if (!type) { |
| + type = "Thing"; |
| + } |
| + entity.type = type; |
| + for (size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) { |
| + PropertyPtr property = Property::New(); |
| + const JSONObject::Entry entry = val.at(i); |
|
wychen
2017/04/04 18:40:19
Would Entry& work?
dproctor
2017/04/04 20:44:05
Done.
|
| + property->name = entry.first; |
| + if (property->name == kJSONLDKeyType) { |
| + continue; |
| + } |
| + property->values = Values::New(); |
| + JSONValue::ValueType type = entry.second->getType(); |
|
wychen
2017/04/04 18:40:19
inline in the switch to avoid confusion from Strin
dproctor
2017/04/04 20:44:04
Done.
|
| + |
| + bool addProperty = true; |
| + |
| + switch (type) { |
| + case JSONValue::ValueType::TypeBoolean: { |
| + bool v; |
| + val.getBoolean(entry.first, &v); |
| + property->values->set_bool_values(Vector<bool>(1, v)); |
| + } break; |
| + case JSONValue::ValueType::TypeInteger: { |
| + int v; |
| + val.getInteger(entry.first, &v); |
| + property->values->set_long_values(Vector<int64_t>(1, v)); |
| + } break; |
| + case JSONValue::ValueType::TypeDouble: { |
| + double v; |
| + val.getDouble(entry.first, &v); |
| + String s = String::number(v); |
| + s.truncate(kMaxStringLength); |
| + property->values->set_string_values(Vector<String>(1, s)); |
| + } break; |
| + case JSONValue::ValueType::TypeString: { |
| + String v; |
| + val.getString(entry.first, &v); |
| + v.truncate(kMaxStringLength); |
| + property->values->set_string_values(Vector<String>(1, v)); |
| + } break; |
| + case JSONValue::ValueType::TypeObject: { |
| + if (recursionLevel + 1 >= kMaxDepth) { |
| + addProperty = false; |
| + break; |
| + } |
| + property->values->set_entity_values(Vector<EntityPtr>()); |
| + property->values->get_entity_values().push_back(Entity::New()); |
| + |
| + extractEntity(*(val.getObject(entry.first)), |
| + *(property->values->get_entity_values().at(0)), |
| + recursionLevel + 1); |
| + } break; |
| + case JSONValue::ValueType::TypeArray: { |
|
wychen
2017/04/04 18:40:19
Probably split this part out. The nested switch an
dproctor
2017/04/04 20:44:04
Done.
|
| + JSONArray* arr = val.getArray(entry.first); |
|
wychen
2017/04/04 18:40:19
const
dproctor
2017/04/04 20:44:04
JSONArray::at isn't const.
|
| + if (arr->size() < 1) { |
| + addProperty = false; |
| + break; |
| + } |
| + |
| + type = arr->at(0)->getType(); |
| + if (type == JSONArray::ValueType::TypeArray) { |
| + // App Indexing doesn't support nested arrays. |
| + addProperty = false; |
| + break; |
| + } |
| + for (size_t j = 0; j < std::min(arr->size(), kMaxRepeatedSize); ++j) { |
| + JSONValue* innerVal = arr->at(j); |
| + if (innerVal->getType() != type) { |
| + // App Indexing doesn't support mixed types. If there are mixed |
| + // types in the parsed object, we will drop the property. |
| + addProperty = false; |
| + break; |
| + } |
| + switch (innerVal->getType()) { |
| + case JSONValue::ValueType::TypeBoolean: { |
| + if (!property->values->is_bool_values()) { |
| + property->values->set_bool_values(Vector<bool>()); |
| + } |
| + bool v; |
| + innerVal->asBoolean(&v); |
| + property->values->get_bool_values().push_back(v); |
| + } break; |
| + case JSONValue::ValueType::TypeInteger: { |
| + if (!property->values->is_long_values()) { |
| + property->values->set_long_values(Vector<int64_t>()); |
| + } |
| + int v; |
| + innerVal->asInteger(&v); |
| + property->values->get_long_values().push_back(v); |
| + } break; |
| + case JSONValue::ValueType::TypeDouble: { |
| + if (!property->values->is_string_values()) { |
| + property->values->set_string_values(Vector<String>()); |
| + } |
| + double v; |
| + val.getDouble(entry.first, &v); |
| + String s = String::number(v); |
| + s.truncate(kMaxStringLength); |
| + property->values->get_string_values().push_back(s); |
| + } break; |
| + case JSONValue::ValueType::TypeString: { |
| + if (!property->values->is_string_values()) { |
| + property->values->set_string_values(Vector<String>()); |
| + } |
| + String v; |
| + innerVal->asString(&v); |
| + v.truncate(kMaxStringLength); |
| + property->values->get_string_values().push_back(v); |
| + } break; |
| + case JSONValue::ValueType::TypeObject: |
| + if (recursionLevel + 1 >= kMaxDepth) { |
| + addProperty = false; |
| + break; |
| + } |
| + if (!property->values->is_entity_values()) { |
| + property->values->set_entity_values(Vector<EntityPtr>()); |
| + } |
| + property->values->get_entity_values().push_back(Entity::New()); |
| + extractEntity(*(JSONObject::cast(innerVal)), |
| + *(property->values->get_entity_values().at(j)), |
| + recursionLevel + 1); |
| + break; |
| + default: |
| + break; |
| + } |
| + } |
| + } break; |
| + default: |
| + break; |
| + } |
| + if (addProperty) |
| + entity.properties.push_back(std::move(property)); |
| + } |
| +} |
| + |
| +bool isWhitelistedType(AtomicString type) { |
| + DEFINE_STATIC_LOCAL(HashSet<AtomicString>, elements, |
| + ({// Common types that include addresses. |
| + "AutoDealer", "Hotel", "LocalBusiness", "Organization", |
| + "Person", "Place", "PostalAddress", "Product", |
| + "Residence", "Restaurant", "SingleFamilyResidence", |
| + // Common types including phone numbers |
| + "Store", "ContactPoint", "LodgingBusiness"})); |
| + return type && elements.contains(type); |
| +} |
| + |
| +void extractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) { |
| + // Now we have a JSONObject which corresponds to a single (possibly nested) |
| + // entity. |
| + EntityPtr entity = Entity::New(); |
| + String type; |
| + val.getString(kJSONLDKeyType, &type); |
| + if (!isWhitelistedType(AtomicString(type))) { |
| + return; |
| + } |
| + extractEntity(val, *(entity.get()), 0); |
| + entities.push_back(std::move(entity)); |
| +} |
| + |
| +void extractEntitiesFromArray(JSONArray& arr, Vector<EntityPtr>& entities) { |
|
wychen
2017/04/04 18:40:19
const arr
dproctor
2017/04/04 20:44:04
JSONArray::at isn't const.
wychen
2017/04/04 23:15:32
Hmm. This is unexpected, but OK.
Let's see how fas
wychen
2017/04/05 16:29:41
https://codereview.chromium.org/2795393002/ has la
dproctor
2017/04/05 17:41:44
Done.
|
| + for (size_t i = 0; i < arr.size(); ++i) { |
| + JSONValue* val = arr.at(i); |
|
wychen
2017/04/04 18:40:19
const
dproctor
2017/04/04 20:44:04
Done.
|
| + if (val->getType() == JSONValue::ValueType::TypeObject) { |
| + extractTopLevelEntity(*(JSONObject::cast(val)), entities); |
| + } |
| + } |
| +} |
| + |
| +void extractEntityFromTopLevelObject(const JSONObject& val, |
| + Vector<EntityPtr>& entities) { |
| + JSONArray* graph = val.getArray(kJSONLDKeyGraph); |
|
wychen
2017/04/04 18:40:19
const
dproctor
2017/04/04 20:44:05
Done.
|
| + if (graph) { |
| + extractEntitiesFromArray(*graph, entities); |
| + } |
| + extractTopLevelEntity(val, entities); |
| +} |
| + |
| +bool extractMetadata(const Element& root, Vector<EntityPtr>& entities) { |
| for (Element& element : ElementTraversal::descendantsOf(root)) { |
| if (element.hasTagName(HTMLNames::scriptTag) && |
| element.getAttribute(HTMLNames::typeAttr) == "application/ld+json") { |
| - if (multiple) { |
| - result.append(","); |
| + std::unique_ptr<JSONValue> json = parseJSON(element.textContent()); |
|
wychen
2017/04/04 18:40:19
We could use the maxDepth version.
dproctor
2017/04/04 20:44:04
So, there are actually two different notions of ma
wychen
2017/04/04 23:15:32
My bad. JSONParser returns nullptr when exceeding
|
| + if (!json.get()) { |
| + LOG(ERROR) << "Failed to parse json."; |
| + return false; |
| + } |
| + switch (json->getType()) { |
| + case JSONValue::ValueType::TypeArray: |
| + extractEntitiesFromArray(*(JSONArray::cast(json.get())), entities); |
| + break; |
| + case JSONValue::ValueType::TypeObject: |
| + extractEntityFromTopLevelObject(*(JSONObject::cast(json.get())), |
| + entities); |
| + break; |
| + default: |
| + return false; |
| } |
| - result.append(element.textContent()); |
| - multiple = true; |
| } |
| } |
| - result.append("]"); |
| - return result.toString(); |
| + return !entities.isEmpty(); |
| } |
| } // namespace |
| -String CopylessPasteExtractor::extract(Document& document) { |
| +bool CopylessPasteExtractor::extract(const Document& document, |
| + mojom::blink::WebPage& page) { |
| TRACE_EVENT0("blink", "CopylessPasteExtractor::extract"); |
| if (!document.frame() || !document.frame()->isMainFrame()) |
| - return emptyString; |
| + return false; |
| DCHECK(document.hasFinishedParsing()); |
| Element* html = document.documentElement(); |
| if (!html) |
| - return emptyString; |
| + return false; |
| double startTime = monotonicallyIncreasingTime(); |
| // Traverse the DOM tree and extract the metadata. |
| - String result = extractMetadata(*html); |
| + if (!extractMetadata(*html, page.entities)) |
| + return false; |
| + page.url = document.url().getString(); |
|
wychen
2017/04/04 18:40:19
The mojo type will change from string to GURL.
dproctor
2017/04/04 20:44:04
Acknowledged.
|
| + page.title = document.title(); |
| double elapsedTime = monotonicallyIncreasingTime() - startTime; |
| DEFINE_STATIC_LOCAL(CustomCountHistogram, extractionHistogram, |
| ("CopylessPaste.ExtractionUs", 1, 1000000, 50)); |
| extractionHistogram.count(static_cast<int>(1e6 * elapsedTime)); |
| - return result; |
| + return true; |
| } |
| } // namespace blink |