Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1229)

Unified Diff: third_party/document_image_extractor/third_party/src/document_image_extractor.js

Issue 1138123002: Update third_party/document_image_extractor (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/document_image_extractor/third_party/src/document_image_extractor.js
diff --git a/third_party/document_image_extractor/third_party/src/document_image_extractor.js b/third_party/document_image_extractor/third_party/src/document_image_extractor.js
deleted file mode 100644
index d82702d741d98a71ba53c5fcb91fd0626818d38d..0000000000000000000000000000000000000000
--- a/third_party/document_image_extractor/third_party/src/document_image_extractor.js
+++ /dev/null
@@ -1,400 +0,0 @@
-goog.provide('image.collections.extension.DocumentImageExtractor');
-
-goog.require('goog.Uri');
-goog.require('goog.asserts');
-goog.require('goog.dom');
-goog.require('goog.log');
-goog.require('goog.math.Size');
-goog.require('goog.object');
-goog.require('goog.string');
-goog.require('goog.style');
-goog.require('goog.uri.utils');
-goog.require('gws.collections.common.Constants');
-goog.require('image.collections.extension.AdElementFilter');
-goog.require('image.collections.extension.DocumentFeature');
-goog.require('image.collections.extension.DocumentFeatureExtractor');
-goog.require('image.collections.extension.DocumentImage');
-goog.require('image.collections.extension.VisibleElementFilter');
-
-goog.scope(function() {
-var AdElementFilter = image.collections.extension.AdElementFilter;
-var Constants = gws.collections.common.Constants;
-var DocumentFeature = image.collections.extension.DocumentFeature;
-var DocumentFeatureExtractor =
- image.collections.extension.DocumentFeatureExtractor;
-var DocumentImage = image.collections.extension.DocumentImage;
-var CustomAttribute = DocumentImage.CustomAttribute;
-var VisibleElementFilter = image.collections.extension.VisibleElementFilter;
-
-
-
-/**
- * This class is used for extracting a salient image from an HTML document.
- *
- * The constructor chains to the DocumentFeatureExtractor constructor and then
- * registers two element filters through addFilter: an AdElementFilter and a
- * VisibleElementFilter. The <img> and background-image paths of
- * extractFromElement call this.filter(element) before accepting an element;
- * presumably that is where the registered filters are consulted (the base
- * class is not shown here).
- *
- * After the constructor is defined, the class is wired to its parent with
- * goog.inherits, aliased to the short name DocumentImageExtractor, and passed
- * to goog.addSingletonGetter.
- * @extends {DocumentFeatureExtractor}
- * @constructor
- */
-image.collections.extension.DocumentImageExtractor = function() {
- DocumentImageExtractor.base(this, 'constructor');
-
- this.addFilter(new AdElementFilter());
- this.addFilter(new VisibleElementFilter());
-};
-goog.inherits(image.collections.extension.DocumentImageExtractor,
- DocumentFeatureExtractor);
-var DocumentImageExtractor = image.collections.extension.DocumentImageExtractor;
-goog.addSingletonGetter(DocumentImageExtractor);
-
-
-/**
- * Extractor logger. extractFromElement uses it to report image URLs that
- * could not be resolved against the document URL.
- * @private {goog.log.Logger}
- */
-DocumentImageExtractor.logger_ = goog.log.getLogger(
- 'image.collections.extension.DocumentImageExtractor');
-
-
-/**
- * Tuning constants for image scoring, as they are used in this file:
- *   AREA_MULTIPLIER - coefficient of the image area inside the sigmoid that
- *       extractFromElement divides the relevance by. It is negative, so a
- *       larger area gives a divisor closer to 1 (less demotion).
- *   ASPECT_RATIO_DEMOTION_FACTOR - multiplier applied to the relevance when
- *       the aspect ratio (inverted if below 1) exceeds MAX_ASPECT_RATIO.
- *   MAX_ASPECT_RATIO - threshold for the demotion above.
- *   MAX_ELEMENTS_WITH_BACKGROUND - once a context has accepted this many
- *       background images, extractFromElement stops trying backgrounds.
- *   MAX_OFFSET - getElementRelevance_ returns 0 for elements whose page
- *       offset top is greater than this.
- *   NON_TOPLEVEL_DEMOTION_FACTOR - multiplier applied by
- *       extractCanonicalImage_ when the document path is not '/' and the
- *       image URL does not occur in the body markup.
- *   OFFSET_MULTIPLIER - coefficient of the page offset inside the sigmoid
- *       computed by getElementRelevance_.
- *   WEIGHT_APPLE, WEIGHT_MICRODATA, WEIGHT_MICROSOFT, WEIGHT_OPEN_GRAPH,
- *   WEIGHT_SRC, WEIGHT_TWITTER - base relevance given to images found via
- *       apple-touch-icon, itemprop=thumbnailurl, msapplication-tileimage,
- *       og:image, image_src and twitter:image respectively.
- * @enum {number}
- */
-DocumentImageExtractor.Parameters = {
- AREA_MULTIPLIER: -1e-5,
- ASPECT_RATIO_DEMOTION_FACTOR: 0.8,
- MAX_ASPECT_RATIO: 2,
- MAX_ELEMENTS_WITH_BACKGROUND: 30,
- MAX_OFFSET: 2000,
- NON_TOPLEVEL_DEMOTION_FACTOR: 0.5,
- OFFSET_MULTIPLIER: 1e-3,
- WEIGHT_APPLE: 0.7,
- WEIGHT_MICRODATA: 0.55,
- WEIGHT_MICROSOFT: 0.9,
- WEIGHT_OPEN_GRAPH: 1.0,
- WEIGHT_SRC: 0.6,
- WEIGHT_TWITTER: 0.8
-};
-var Parameters = DocumentImageExtractor.Parameters;
-
-
-/**
- * Map of image type to relevance multiplier.
- *
- * Keys are file extensions including the leading dot, which is the form
- * extractFromElement slices from the URL path before looking it up here.
- * Image types without an entry leave the relevance unchanged. The only entry
- * is '.gif', which halves the relevance.
- * @private {!Object.<string, number>}
- */
-DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_ =
- goog.object.create('.gif', 0.5);
-
-
-
-/**
- * Mutable bookkeeping passed to extractFromElement as its context argument.
- * extractAllFromDocument creates one instance per call and hands it to
- * extractFromNodeList.
- * @constructor
- */
-DocumentImageExtractor.Context = function() {
- /**
-  * Number of background images accepted so far. extractFromElement only
-  * tries a background image while this is below
-  * Parameters.MAX_ELEMENTS_WITH_BACKGROUND.
-  * @type {number}
-  */
- this.numElementsWithBackground = 0;
-
- /**
-  * Relevance most recently recorded for each absolute image URL.
-  * extractFromElement rejects an image whose URL is already recorded with a
-  * strictly higher relevance.
-  * @type {!Object.<string, number>}
-  */
- this.urlToRelevance = {};
-};
-var Context = DocumentImageExtractor.Context;
-
-
-/**
- * Extracts salient images from an HTML document.
- * @param {!Document} doc
- * @return {!Array.<!DocumentImage>}
- * @override
- */
-DocumentImageExtractor.prototype.extractAllFromDocument = function(doc) {
- var context = new Context();
- return this.extractFromNodeList(doc.getElementsByTagName('*'), context);
-};
-
-
-/**
- * Extracts a salient image from an HTML element, returns null if failed.
- * @param {!Element} element HTML element.
- * @param {Object=} opt_context Optional context.
- * @return {DocumentImage}
- * @override
- */
-DocumentImageExtractor.prototype.extractFromElement = function(
- element, opt_context) {
- var image = null;
- switch (element.tagName.toLowerCase()) {
- case 'meta':
- image = this.extractOpenGraphImage_(element) ||
- this.extractMicrosoftImage_(element) ||
- this.extractTwitterImage_(element) ||
- this.extractMicrodataImage_(element);
- break;
- case 'link':
- image = this.extractAppleImage_(element) ||
- this.extractImageSrcImage_(element) ||
- this.extractMicrodataImage_(element);
- break;
- case 'img':
- if (this.filter(element)) {
- image = this.extractImage_(element);
- }
- break;
- default:
- if (this.filter(element) && opt_context.numElementsWithBackground <
- Parameters.MAX_ELEMENTS_WITH_BACKGROUND) {
- image = this.extractBackgroundImage_(element);
- if (image) {
- ++opt_context.numElementsWithBackground;
- }
- }
- }
-
- if (!image) {
- return null;
- }
-
- var size = image.getDisplaySize() || image.getSize();
- goog.asserts.assert(!goog.isNull(size));
- if (image.getUrl() != document.location.href) {
- // Ignore images that are too small.
- if (size.width < Constants.EXTRACT_MIN_WIDTH ||
- size.height < Constants.EXTRACT_MIN_HEIGHT) {
- return null;
- }
- }
- // Demote smaller images (squash area using the sigmoid function).
- var relevance = image.getRelevance();
- relevance /= (1 + Math.exp(Parameters.AREA_MULTIPLIER * size.area()));
- // Demote images with bad aspect ratio.
- var aspectRatio = size.aspectRatio();
- if (aspectRatio < 1) {
- aspectRatio = 1 / aspectRatio;
- }
- if (aspectRatio > Parameters.MAX_ASPECT_RATIO) {
- relevance *= Parameters.ASPECT_RATIO_DEMOTION_FACTOR;
- }
-
- // TODO(busaryev): use the following features:
- // - relative size of the image comparing to neighbors;
- // - area of the visible portion of the image;
- // - position (demote images on the border of the page).
-
- var url = image.getUrl();
- try {
- // Make sure that image url is absolute.
- var documentUrl = goog.dom.getOwnerDocument(element).documentURI;
- url = goog.Uri.resolve(documentUrl, url).toString();
- } catch (e) {
- goog.log.info(DocumentImageExtractor.logger_,
- 'Cannot resolve url: ' + url);
- return null;
- }
-
- var imagePath = goog.string.makeSafe(goog.uri.utils.getPath(url));
- var lastDot = imagePath.lastIndexOf('.');
- if (lastDot > 0) {
- var imageType = imagePath.slice(lastDot);
- var multiplier =
- DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_[imageType];
- if (multiplier) {
- relevance *= multiplier;
- }
- }
-
- if (url in opt_context.urlToRelevance &&
- opt_context.urlToRelevance[url] > relevance) {
- return null;
- }
- opt_context.urlToRelevance[url] = relevance;
- return new DocumentImage(relevance, url, image.getSize(),
- image.getDisplaySize());
-};
-
-
-/**
- * @param {!Element} element
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractOpenGraphImage_ = function(element) {
- return this.extractCanonicalImage_(
- element, Parameters.WEIGHT_OPEN_GRAPH, 'property', 'og:image', 'content');
-};
-
-
-/**
- * @param {!Element} element
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractMicrosoftImage_ = function(element) {
- return this.extractCanonicalImage_(element, Parameters.WEIGHT_MICROSOFT,
- 'name', 'msapplication-tileimage', 'content');
-};
-
-
-/**
- * @param {!Element} element
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractTwitterImage_ = function(element) {
- return this.extractCanonicalImage_(
- element, Parameters.WEIGHT_TWITTER, 'name', 'twitter:image', 'content');
-};
-
-
-/**
- * @param {!Element} element
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractAppleImage_ = function(element) {
- return this.extractCanonicalImage_(
- element, Parameters.WEIGHT_APPLE, 'rel', 'apple-touch-icon', 'href');
-};
-
-
-/**
- * @param {!Element} element
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractImageSrcImage_ = function(element) {
- return this.extractCanonicalImage_(
- element, Parameters.WEIGHT_SRC, 'rel', 'image_src', 'href');
-};
-
-
-/**
- * @param {!Element} element
- * @param {!number} relevance
- * @param {string} attributeName
- * @param {string} attribute
- * @param {string} urlAttributeName
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractCanonicalImage_ = function(
- element, relevance, attributeName, attribute, urlAttributeName) {
- goog.asserts.assert(goog.isNumber(relevance));
- goog.asserts.assert(goog.isString(attributeName));
- goog.asserts.assert(goog.isString(attribute));
-
- if (goog.string.caseInsensitiveEquals(
- goog.string.makeSafe(element.getAttribute(attributeName)), attribute)) {
- var url = element.getAttribute(urlAttributeName);
- if (!url || goog.string.startsWith(url, 'data:')) {
- return null;
- }
- var width = goog.string.parseInt(
- element.getAttribute(CustomAttribute.WIDTH));
- var height = goog.string.parseInt(
- element.getAttribute(CustomAttribute.HEIGHT));
- if (width && height) {
- // For non-toplevel urls, demote the image if it is not in the document.
- var ownerDocument = goog.dom.getOwnerDocument(element);
- if (goog.uri.utils.getPath(ownerDocument.documentURI) != '/' &&
- ownerDocument.body.innerHTML.indexOf(url) == -1) {
- relevance *= Parameters.NON_TOPLEVEL_DEMOTION_FACTOR;
- }
- var size = new goog.math.Size(width, height);
- return new DocumentImage(relevance, url, size);
- }
- }
- return null;
-};
-
-
-/**
- * @param {!Element} element
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractMicrodataImage_ = function(element) {
- var itemProp = element.getAttribute('itemprop');
- if (itemProp && itemProp.toLowerCase() == 'thumbnailurl') {
- var url = element.getAttribute('href') || element.getAttribute('content');
- if (!url || goog.string.startsWith(url, 'data:')) {
- return null;
- }
- var width = goog.string.parseInt(
- element.getAttribute(CustomAttribute.WIDTH));
- var height = goog.string.parseInt(
- element.getAttribute(CustomAttribute.HEIGHT));
- if (width && height) {
- var size = new goog.math.Size(width, height);
- return new DocumentImage(Parameters.WEIGHT_MICRODATA, url, size);
- }
- }
- return null;
-};
-
-
-/**
- * @param {!Element} element
- * @return {number}
- * @private
- */
-DocumentImageExtractor.prototype.getElementRelevance_ = function(element) {
- var offset = goog.style.getPageOffsetTop(element);
- if (offset > Parameters.MAX_OFFSET) {
- return 0;
- }
- return 1 / (1 + Math.exp(Parameters.OFFSET_MULTIPLIER * offset));
-};
-
-
-/**
- * Extracts an image from the <img> HTML element.
- * @param {!Element} element
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractImage_ = function(element) {
- goog.asserts.assert(element.tagName.toLowerCase() == 'img');
- var url = element.src;
- // We cannot handle data URIs.
- if (url && !goog.string.startsWith(url, 'data:')) {
- var naturalSize = new goog.math.Size(
- element.naturalWidth, element.naturalHeight);
- var displaySize = goog.style.getSize(element);
- var size = naturalSize.area() < displaySize.area() ?
- naturalSize : displaySize;
- if (size.width && size.height) {
- var relevance = this.getElementRelevance_(element);
- return new DocumentImage(relevance, url, naturalSize, displaySize);
- }
- }
- return null;
-};
-
-
-/**
- * Extracts an image specified in 'background-image' property of an element.
- * @param {!Element} element
- * @return {DocumentImage}
- * @private
- */
-DocumentImageExtractor.prototype.extractBackgroundImage_ = function(element) {
- var backgroundImage = goog.style.getComputedStyle(
- element, 'background-image');
- var backgroundRepeat = goog.style.getComputedStyle(
- element, 'background-repeat');
- var backgroundSize = goog.style.getComputedStyle(
- element, 'background-size');
- if (backgroundImage &&
- (backgroundRepeat == 'no-repeat' || backgroundSize == 'cover') &&
- goog.string.startsWith(backgroundImage, 'url(') &&
- goog.string.endsWith(backgroundImage, ')')) {
- var url = backgroundImage.substr(4, backgroundImage.length - 5);
- if (url && !goog.string.startsWith(url, 'data:')) {
- var size = goog.style.getSize(element);
- if (size.width && size.height) {
- var relevance = this.getElementRelevance_(element);
- var children = goog.dom.getChildren(element);
- for (var i = 0; i < children.length; ++i) {
- var child = children[i];
- if (goog.style.getComputedStyle(child, 'display') != 'none' &&
- goog.style.getSize(child).area() > 0.1 * size.area()) {
- relevance *= 0.1;
- break;
- }
- }
- return new DocumentImage(relevance, url,
- undefined /* image size is unknown */, size);
- }
- }
- }
- return null;
-};
-}); // goog.scope

Powered by Google App Engine
This is Rietveld 408576698