| Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
|
| diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
|
| index 0b5e1445ceffa2b14d487ae6a4cada024a6e6ac1..9a699a9da71495313ca70c0e2df38f874ae1dc07 100644
|
| --- a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
|
| +++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
|
| @@ -9,13 +9,18 @@
|
| #include "core/dom/ElementTraversal.h"
|
| #include "core/dom/NodeComputedStyle.h"
|
| #include "core/dom/Text.h"
|
| +#include "core/editing/EphemeralRange.h"
|
| +#include "core/editing/iterators/TextIterator.h"
|
| #include "core/frame/FrameHost.h"
|
| #include "core/html/HTMLHeadElement.h"
|
| #include "core/html/HTMLInputElement.h"
|
| #include "core/html/HTMLMetaElement.h"
|
| +#include "core/layout/LayoutPart.h"
|
| +#include "core/layout/LayoutView.h"
|
| #include "platform/Histogram.h"
|
| #include "public/platform/Platform.h"
|
| #include "public/platform/WebDistillability.h"
|
| +#include "public/platform/WebLanguageDetectionDetails.h"
|
|
|
| namespace blink {
|
|
|
| @@ -34,6 +39,9 @@ const double kMozScoreSaturation = 175.954539583; // 6 * sqrt(kTextContentLength
|
| const double kMozScoreAllSqrtSaturation = 189.73665961; // 6 * sqrt(kTextContentLengthSaturation);
|
| const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
|
|
|
| +// Maximum number of characters of page text collected for language detection (passed as the maxChars cap below).
|
| +const size_t kMaxContentLengthForLanguageDetection = 65535;
|
| +
|
| unsigned textContentLengthSaturated(Element& root)
|
| {
|
| unsigned length = 0;
|
| @@ -212,9 +220,9 @@ bool isMobileFriendly(Document& document)
|
|
|
| } // namespace
|
|
|
| -WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
|
| +WebDistillabilityFeatures DocumentStatisticsCollector::collectDistillabilityFeatures(Document& document)
|
| {
|
| - TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics");
|
| + TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectDistillabilityFeatures");
|
|
|
| WebDistillabilityFeatures features = WebDistillabilityFeatures();
|
|
|
| @@ -237,7 +245,7 @@ WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen
|
|
|
| double startTime = monotonicallyIncreasingTime();
|
|
|
| - // This should be cheap since collectStatistics is only called right after layout.
|
| + // This should be cheap since collectDistillabilityFeatures is only called right after layout.
|
| document.updateLayoutTreeIfNeeded();
|
|
|
| // Traverse the DOM tree and collect statistics.
|
| @@ -252,4 +260,108 @@ WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen
|
| return features;
|
| }
|
|
|
| +// Appends the plain text of |frame|'s body, then recursively that of its visible
|
| +// local child frames (each preceded by a "\n\n" separator), to |output| without
|
| +// exceeding |maxChars|. No-op when |frame| is null, lacks a document or view, or |output| is full.
|
| +void DocumentStatisticsCollector::frameTreeContentAsPlainText(size_t maxChars, LocalFrame* frame, StringBuilder& output)
|
| +{
|
| +    if (!frame || !frame->document() || !frame->view() || output.length() >= maxChars)
|
| +        return;
|
| +
|
| +    // Select the document body.
|
| +    Document* document = frame->document();
|
| +    if (document->body()) {
|
| +        const EphemeralRange range = EphemeralRange::rangeOfContents(*document->body());
|
| +
|
| +        // The text iterator will walk nodes giving us text. This is similar to
|
| +        // the plainText() function in core/editing/TextIterator.h, but we enforce the
|
| +        // maximum size and append each chunk directly to |output|, avoiding an
|
| +        // intermediate string.
|
| +        for (TextIterator it(range.startPosition(), range.endPosition()); !it.atEnd(); it.advance()) {
|
| +            it.text().appendTextToStringBuilder(output, 0, maxChars - output.length());
|
| +            if (output.length() >= maxChars)
|
| +                return; // Filled up the buffer.
|
| +        }
|
| +    }
|
| +
|
| +    // The separator between frames when the frames are converted to plain text.
|
| +    const LChar frameSeparator[] = { '\n', '\n' };
|
| +    const size_t frameSeparatorLength = WTF_ARRAY_LENGTH(frameSeparator);
|
| +
|
| +    // Recursively walk the children.
|
| +    const FrameTree& frameTree = frame->tree();
|
| +    for (Frame* curChild = frameTree.firstChild(); curChild; curChild = curChild->tree().nextSibling()) {
|
| +        if (!curChild->isLocalFrame())
|
| +            continue;
|
| +        LocalFrame* curLocalChild = toLocalFrame(curChild);
|
| +        // Ignore the text of non-visible frames.
|
| +        LayoutView* contentLayoutObject = curLocalChild->contentLayoutObject();
|
| +        LayoutPart* ownerLayoutObject = curLocalChild->ownerLayoutObject();
|
| +        if (!contentLayoutObject || !contentLayoutObject->size().width() || !contentLayoutObject->size().height()
|
| +            || (contentLayoutObject->location().x() + contentLayoutObject->size().width() <= 0) || (contentLayoutObject->location().y() + contentLayoutObject->size().height() <= 0)
|
| +            || (ownerLayoutObject && ownerLayoutObject->style() && ownerLayoutObject->style()->visibility() != VISIBLE)) {
|
| +            continue;
|
| +        }
|
| +
|
| +        // Make sure the frame separator won't fill up the buffer, and give up if
|
| +        // it will. The lengths are unsigned, so the check adds the separator to
|
| +        // output.length() instead of subtracting it from maxChars: that
|
| +        // subtraction would wrap around for a maxChars shorter than the separator
|
| +        // and let the subframe text overrun the buffer.
|
| +        if (output.length() + frameSeparatorLength >= maxChars)
|
| +            return;
|
| +
|
| +        output.append(frameSeparator, frameSeparatorLength);
|
| +        frameTreeContentAsPlainText(maxChars, curLocalChild, output);
|
| +        if (output.length() >= maxChars)
|
| +            return; // Filled up the buffer.
|
| +    }
|
| +}
|
| +
|
| +// Reports whether a <meta> child of the document's head is named "google" and carries
|
| +// "notranslate" (compared ignoring case) as its content() or, when that is null, its value attribute.
|
| +inline static bool hasNoTranslateMeta(Document& document)
|
| +{
|
| +    DEFINE_STATIC_LOCAL(AtomicString, googleMetaName, ("google"));
|
| +    DEFINE_STATIC_LOCAL(AtomicString, noTranslateValue, ("notranslate"));
|
| +
|
| +    // Apparently, a missing head is possible for PDF documents.
|
| +    // TODO(dglazkov): Figure out why. Seems bad.
|
| +    HTMLElement* headElement = document.head();
|
| +    if (!headElement)
|
| +        return false;
|
| +
|
| +    for (const Element* candidate = ElementTraversal::firstChild(*headElement); candidate; candidate = ElementTraversal::nextSibling(*candidate)) {
|
| +        if (!isHTMLMetaElement(*candidate) || toHTMLMetaElement(*candidate).name() != googleMetaName)
|
| +            continue;
|
| +        const HTMLMetaElement& metaElement = toHTMLMetaElement(*candidate);
|
| +        // Fall back to the value attribute when content() is null.
|
| +        String metaContent = metaElement.content();
|
| +        if (!metaContent)
|
| +            metaContent = metaElement.getAttribute(valueAttr);
|
| +        if (!metaContent)
|
| +            continue;
|
| +        if (equalIgnoringCase(metaContent, noTranslateValue))
|
| +            return true;
|
| +    }
|
| +    return false;
|
| +}
|
| +
|
| +// Collects language detection inputs from |document|; htmlLanguage stays unset without a root element and content stays empty without a frame.
|
| +WebLanguageDetectionDetails DocumentStatisticsCollector::collectLanguageDetectionDetails(Document& document)
|
| +{
|
| +    ASSERT(document.hasFinishedParsing());
|
| +    WebLanguageDetectionDetails details;
|
| +    details.url = document.url();
|
| +    details.contentLanguage = document.contentLanguage();
|
| +    if (Element* rootElement = document.documentElement())
|
| +        details.htmlLanguage = rootElement->getAttribute(langAttr);
|
| +    details.hasNoTranslateMeta = hasNoTranslateMeta(document);
|
| +    StringBuilder text;
|
| +    if (LocalFrame* frame = document.frame())
|
| +        frameTreeContentAsPlainText(kMaxContentLengthForLanguageDetection, frame, text);
|
| +    details.content = text.toString();
|
| +    return details;
|
| +}
|
| +
|
| } // namespace blink
|
|
|