Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(228)

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1650303002: Move DOM-inspecting language detection logic to Blink. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Rebased. Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
index 0b5e1445ceffa2b14d487ae6a4cada024a6e6ac1..9a699a9da71495313ca70c0e2df38f874ae1dc07 100644
--- a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
@@ -9,13 +9,18 @@
#include "core/dom/ElementTraversal.h"
#include "core/dom/NodeComputedStyle.h"
#include "core/dom/Text.h"
+#include "core/editing/EphemeralRange.h"
+#include "core/editing/iterators/TextIterator.h"
#include "core/frame/FrameHost.h"
#include "core/html/HTMLHeadElement.h"
#include "core/html/HTMLInputElement.h"
#include "core/html/HTMLMetaElement.h"
+#include "core/layout/LayoutPart.h"
+#include "core/layout/LayoutView.h"
#include "platform/Histogram.h"
#include "public/platform/Platform.h"
#include "public/platform/WebDistillability.h"
+#include "public/platform/WebLanguageDetectionDetails.h"
namespace blink {
@@ -34,6 +39,9 @@ const double kMozScoreSaturation = 175.954539583; // 6 * sqrt(kTextContentLength
const double kMozScoreAllSqrtSaturation = 189.73665961; // 6 * sqrt(kTextContentLengthSaturation);
const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
+// Upper bound on the number of characters of page text gathered for language detection (passed as maxChars to frameTreeContentAsPlainText()).
+const size_t kMaxContentLengthForLanguageDetection = 65535;
+
unsigned textContentLengthSaturated(Element& root)
{
unsigned length = 0;
@@ -212,9 +220,9 @@ bool isMobileFriendly(Document& document)
} // namespace
-WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
+WebDistillabilityFeatures DocumentStatisticsCollector::collectDistillabilityFeatures(Document& document)
{
- TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics");
+ TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectDistillabilityFeatures");
WebDistillabilityFeatures features = WebDistillabilityFeatures();
@@ -237,7 +245,7 @@ WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen
double startTime = monotonicallyIncreasingTime();
- // This should be cheap since collectStatistics is only called right after layout.
+ // This should be cheap since collectDistillabilityFeatures is only called right after layout.
document.updateLayoutTreeIfNeeded();
// Traverse the DOM tree and collect statistics.
@@ -252,4 +260,108 @@ WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen
return features;
}
+// Appends the plain text of |frame|'s document body to |output|, then
+// recurses into each visible local child frame, separating frames with a
+// blank line ("\n\n"). Stops as soon as |output| holds |maxChars| characters.
+// Does nothing when |frame| is null, has no document or has no view.
+void DocumentStatisticsCollector::frameTreeContentAsPlainText(size_t maxChars, LocalFrame* frame, StringBuilder& output)
+{
+    if (!frame)
+        return;
+
+    // All length arithmetic below is unsigned. Bail out up front so that
+    // |maxChars - output.length()| can never wrap around when the caller
+    // hands us a builder that is already full (or when maxChars is 0).
+    if (output.length() >= maxChars)
+        return;
+
+    Document* document = frame->document();
+    if (!document)
+        return;
+
+    if (!frame->view())
+        return;
+
+    // Select the document body.
+    if (document->body()) {
+        const EphemeralRange range = EphemeralRange::rangeOfContents(*document->body());
+
+        // The text iterator will walk nodes giving us text. This is similar
+        // to plainText(), but we enforce the maximum size and append each
+        // chunk directly to the StringBuilder, avoiding an intermediate
+        // string.
+        for (TextIterator it(range.startPosition(), range.endPosition()); !it.atEnd(); it.advance()) {
+            // output.length() < maxChars holds here, so the subtraction is safe.
+            it.text().appendTextToStringBuilder(output, 0, maxChars - output.length());
+            if (output.length() >= maxChars)
+                return; // Filled up the buffer.
+        }
+    }
+
+    // The separator between frames when the frames are converted to plain text.
+    const LChar frameSeparator[] = { '\n', '\n' };
+    const size_t frameSeparatorLength = WTF_ARRAY_LENGTH(frameSeparator);
+
+    // Recursively walk the children.
+    const FrameTree& frameTree = frame->tree();
+    for (Frame* curChild = frameTree.firstChild(); curChild; curChild = curChild->tree().nextSibling()) {
+        if (!curChild->isLocalFrame())
+            continue;
+        LocalFrame* curLocalChild = toLocalFrame(curChild);
+        // Ignore the text of non-visible frames: no layout object, zero
+        // size, positioned entirely off the top/left, or an owner whose
+        // computed visibility is not VISIBLE.
+        LayoutView* contentLayoutObject = curLocalChild->contentLayoutObject();
+        LayoutPart* ownerLayoutObject = curLocalChild->ownerLayoutObject();
+        if (!contentLayoutObject || !contentLayoutObject->size().width() || !contentLayoutObject->size().height()
+            || (contentLayoutObject->location().x() + contentLayoutObject->size().width() <= 0) || (contentLayoutObject->location().y() + contentLayoutObject->size().height() <= 0)
+            || (ownerLayoutObject && ownerLayoutObject->style() && ownerLayoutObject->style()->visibility() != VISIBLE)) {
+            continue;
+        }
+
+        // Give up if the separator would leave no room for any subframe
+        // text. output.length() < maxChars is guaranteed at this point, so
+        // the remaining-space subtraction cannot wrap, even when maxChars is
+        // smaller than the separator itself.
+        if (maxChars - output.length() <= frameSeparatorLength)
+            return;
+
+        output.append(frameSeparator, frameSeparatorLength);
+        frameTreeContentAsPlainText(maxChars, curLocalChild, output);
+        if (output.length() >= maxChars)
+            return; // Filled up the buffer.
+    }
+}
+
+// Returns true when the document's <head> has a child <meta> element named
+// "google" whose content (or, if that is null, its "value" attribute)
+// equals "notranslate", ignoring case.
+inline static bool hasNoTranslateMeta(Document& document)
+{
+    DEFINE_STATIC_LOCAL(AtomicString, googleName, ("google"));
+    DEFINE_STATIC_LOCAL(AtomicString, noTranslateValue, ("notranslate"));
+
+    // A document without a <head> has apparently been seen for PDF documents.
+    // TODO(dglazkov): Figure out why. Seems bad.
+    HTMLElement* headElement = document.head();
+    if (!headElement)
+        return false;
+
+    const Element* candidate = ElementTraversal::firstChild(*headElement);
+    for (; candidate; candidate = ElementTraversal::nextSibling(*candidate)) {
+        // Only <meta name="google"> elements are of interest. The cast is
+        // evaluated only after the element type check has passed.
+        if (!isHTMLMetaElement(*candidate) || toHTMLMetaElement(*candidate).name() != googleName)
+            continue;
+        const HTMLMetaElement& metaElement = toHTMLMetaElement(*candidate);
+
+        // Prefer the content attribute; fall back to "value" when it is null.
+        String directive = metaElement.content();
+        if (!directive)
+            directive = metaElement.getAttribute(valueAttr);
+        if (!directive)
+            continue;
+
+        if (equalIgnoringCase(directive, noTranslateValue))
+            return true;
+    }
+    return false;
+}
+
+// Collects the inputs for language detection from |document|: its URL, its
+// content language, the root element's lang attribute, whether a
+// "notranslate" meta tag is present, and up to
+// kMaxContentLengthForLanguageDetection characters of plain text from the
+// document's frame tree. Expects parsing to have finished.
+WebLanguageDetectionDetails DocumentStatisticsCollector::collectLanguageDetectionDetails(Document& document)
+{
+    ASSERT(document.hasFinishedParsing());
+
+    WebLanguageDetectionDetails details;
+
+    details.url = document.url();
+    details.contentLanguage = document.contentLanguage();
+    // Guard against a document with no root element; htmlLanguage then
+    // keeps its default value instead of dereferencing a null pointer.
+    if (Element* rootElement = document.documentElement())
+        details.htmlLanguage = rootElement->getAttribute(langAttr);
+    details.hasNoTranslateMeta = hasNoTranslateMeta(document);
+
+    // frameTreeContentAsPlainText() dereferences the frame immediately, so
+    // only extract text when the document actually has one; otherwise
+    // content keeps its default value.
+    if (LocalFrame* frame = document.frame()) {
+        StringBuilder text;
+        frameTreeContentAsPlainText(kMaxContentLengthForLanguageDetection, frame, text);
+        details.content = text.toString();
+    }
+
+    return details;
+}
+
} // namespace blink

Powered by Google App Engine
This is Rietveld 408576698