Index: third_party/document_image_extractor/third_party/src/document_image_extractor.js |
diff --git a/third_party/document_image_extractor/third_party/src/document_image_extractor.js b/third_party/document_image_extractor/third_party/src/document_image_extractor.js |
deleted file mode 100644 |
index d82702d741d98a71ba53c5fcb91fd0626818d38d..0000000000000000000000000000000000000000 |
--- a/third_party/document_image_extractor/third_party/src/document_image_extractor.js |
+++ /dev/null |
@@ -1,400 +0,0 @@ |
-goog.provide('image.collections.extension.DocumentImageExtractor'); |
- |
-goog.require('goog.Uri'); |
-goog.require('goog.asserts'); |
-goog.require('goog.dom'); |
-goog.require('goog.log'); |
-goog.require('goog.math.Size'); |
-goog.require('goog.object'); |
-goog.require('goog.string'); |
-goog.require('goog.style'); |
-goog.require('goog.uri.utils'); |
-goog.require('gws.collections.common.Constants'); |
-goog.require('image.collections.extension.AdElementFilter'); |
-goog.require('image.collections.extension.DocumentFeature'); |
-goog.require('image.collections.extension.DocumentFeatureExtractor'); |
-goog.require('image.collections.extension.DocumentImage'); |
-goog.require('image.collections.extension.VisibleElementFilter'); |
- |
-goog.scope(function() { |
-var AdElementFilter = image.collections.extension.AdElementFilter; |
-var Constants = gws.collections.common.Constants; |
-var DocumentFeature = image.collections.extension.DocumentFeature; |
-var DocumentFeatureExtractor = |
- image.collections.extension.DocumentFeatureExtractor; |
-var DocumentImage = image.collections.extension.DocumentImage; |
-var CustomAttribute = DocumentImage.CustomAttribute; |
-var VisibleElementFilter = image.collections.extension.VisibleElementFilter; |
- |
- |
- |
-/** |
- * This class is used for extracting a salient image from an HTML document. |
- * @extends {DocumentFeatureExtractor} |
- * @constructor |
- */ |
-image.collections.extension.DocumentImageExtractor = function() { |
- DocumentImageExtractor.base(this, 'constructor'); |
- |
- this.addFilter(new AdElementFilter()); |
- this.addFilter(new VisibleElementFilter()); |
-}; |
-goog.inherits(image.collections.extension.DocumentImageExtractor, |
- DocumentFeatureExtractor); |
-var DocumentImageExtractor = image.collections.extension.DocumentImageExtractor; |
-goog.addSingletonGetter(DocumentImageExtractor); |
- |
- |
-/** @private {goog.log.Logger} Extractor logger. */ |
-DocumentImageExtractor.logger_ = goog.log.getLogger( |
- 'image.collections.extension.DocumentImageExtractor'); |
- |
- |
-/** @enum {number} */ |
-DocumentImageExtractor.Parameters = { |
- AREA_MULTIPLIER: -1e-5, |
- ASPECT_RATIO_DEMOTION_FACTOR: 0.8, |
- MAX_ASPECT_RATIO: 2, |
- MAX_ELEMENTS_WITH_BACKGROUND: 30, |
- MAX_OFFSET: 2000, |
- NON_TOPLEVEL_DEMOTION_FACTOR: 0.5, |
- OFFSET_MULTIPLIER: 1e-3, |
- WEIGHT_APPLE: 0.7, |
- WEIGHT_MICRODATA: 0.55, |
- WEIGHT_MICROSOFT: 0.9, |
- WEIGHT_OPEN_GRAPH: 1.0, |
- WEIGHT_SRC: 0.6, |
- WEIGHT_TWITTER: 0.8 |
-}; |
-var Parameters = DocumentImageExtractor.Parameters; |
- |
- |
-/** |
- * Map of image type to relevance multiplier. |
- * @private {!Object.<string, number>} |
- */ |
-DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_ = |
- goog.object.create('.gif', 0.5); |
- |
- |
- |
-/** @constructor */ |
-DocumentImageExtractor.Context = function() { |
- /** @type {number} */ |
- this.numElementsWithBackground = 0; |
- |
- /** @type {!Object.<string, number>} */ |
- this.urlToRelevance = {}; |
-}; |
-var Context = DocumentImageExtractor.Context; |
- |
- |
-/** |
- * Extracts salient images from an HTML document. |
- * @param {!Document} doc |
- * @return {!Array.<!DocumentImage>} |
- * @override |
- */ |
-DocumentImageExtractor.prototype.extractAllFromDocument = function(doc) { |
- var context = new Context(); |
- return this.extractFromNodeList(doc.getElementsByTagName('*'), context); |
-}; |
- |
- |
-/** |
- * Extracts a salient image from an HTML element, returns null if failed. |
- * @param {!Element} element HTML element. |
- * @param {Object=} opt_context Optional context. |
- * @return {DocumentImage} |
- * @override |
- */ |
-DocumentImageExtractor.prototype.extractFromElement = function( |
- element, opt_context) { |
- var image = null; |
- switch (element.tagName.toLowerCase()) { |
- case 'meta': |
- image = this.extractOpenGraphImage_(element) || |
- this.extractMicrosoftImage_(element) || |
- this.extractTwitterImage_(element) || |
- this.extractMicrodataImage_(element); |
- break; |
- case 'link': |
- image = this.extractAppleImage_(element) || |
- this.extractImageSrcImage_(element) || |
- this.extractMicrodataImage_(element); |
- break; |
- case 'img': |
- if (this.filter(element)) { |
- image = this.extractImage_(element); |
- } |
- break; |
- default: |
- if (this.filter(element) && opt_context.numElementsWithBackground < |
- Parameters.MAX_ELEMENTS_WITH_BACKGROUND) { |
- image = this.extractBackgroundImage_(element); |
- if (image) { |
- ++opt_context.numElementsWithBackground; |
- } |
- } |
- } |
- |
- if (!image) { |
- return null; |
- } |
- |
- var size = image.getDisplaySize() || image.getSize(); |
- goog.asserts.assert(!goog.isNull(size)); |
- if (image.getUrl() != document.location.href) { |
- // Ignore images that are too small. |
- if (size.width < Constants.EXTRACT_MIN_WIDTH || |
- size.height < Constants.EXTRACT_MIN_HEIGHT) { |
- return null; |
- } |
- } |
- // Demote smaller images (squash area using the sigmoid function). |
- var relevance = image.getRelevance(); |
- relevance /= (1 + Math.exp(Parameters.AREA_MULTIPLIER * size.area())); |
- // Demote images with bad aspect ratio. |
- var aspectRatio = size.aspectRatio(); |
- if (aspectRatio < 1) { |
- aspectRatio = 1 / aspectRatio; |
- } |
- if (aspectRatio > Parameters.MAX_ASPECT_RATIO) { |
- relevance *= Parameters.ASPECT_RATIO_DEMOTION_FACTOR; |
- } |
- |
- // TODO(busaryev): use the following features: |
- // - relative size of the image comparing to neighbors; |
- // - area of the visible portion of the image; |
- // - position (demote images on the border of the page). |
- |
- var url = image.getUrl(); |
- try { |
- // Make sure that image url is absolute. |
- var documentUrl = goog.dom.getOwnerDocument(element).documentURI; |
- url = goog.Uri.resolve(documentUrl, url).toString(); |
- } catch (e) { |
- goog.log.info(DocumentImageExtractor.logger_, |
- 'Cannot resolve url: ' + url); |
- return null; |
- } |
- |
- var imagePath = goog.string.makeSafe(goog.uri.utils.getPath(url)); |
- var lastDot = imagePath.lastIndexOf('.'); |
- if (lastDot > 0) { |
- var imageType = imagePath.slice(lastDot); |
- var multiplier = |
- DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_[imageType]; |
- if (multiplier) { |
- relevance *= multiplier; |
- } |
- } |
- |
- if (url in opt_context.urlToRelevance && |
- opt_context.urlToRelevance[url] > relevance) { |
- return null; |
- } |
- opt_context.urlToRelevance[url] = relevance; |
- return new DocumentImage(relevance, url, image.getSize(), |
- image.getDisplaySize()); |
-}; |
- |
- |
-/** |
- * @param {!Element} element |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractOpenGraphImage_ = function(element) { |
- return this.extractCanonicalImage_( |
- element, Parameters.WEIGHT_OPEN_GRAPH, 'property', 'og:image', 'content'); |
-}; |
- |
- |
-/** |
- * @param {!Element} element |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractMicrosoftImage_ = function(element) { |
- return this.extractCanonicalImage_(element, Parameters.WEIGHT_MICROSOFT, |
- 'name', 'msapplication-tileimage', 'content'); |
-}; |
- |
- |
-/** |
- * @param {!Element} element |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractTwitterImage_ = function(element) { |
- return this.extractCanonicalImage_( |
- element, Parameters.WEIGHT_TWITTER, 'name', 'twitter:image', 'content'); |
-}; |
- |
- |
-/** |
- * @param {!Element} element |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractAppleImage_ = function(element) { |
- return this.extractCanonicalImage_( |
- element, Parameters.WEIGHT_APPLE, 'rel', 'apple-touch-icon', 'href'); |
-}; |
- |
- |
-/** |
- * @param {!Element} element |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractImageSrcImage_ = function(element) { |
- return this.extractCanonicalImage_( |
- element, Parameters.WEIGHT_SRC, 'rel', 'image_src', 'href'); |
-}; |
- |
- |
-/** |
- * @param {!Element} element |
- * @param {!number} relevance |
- * @param {string} attributeName |
- * @param {string} attribute |
- * @param {string} urlAttributeName |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractCanonicalImage_ = function( |
- element, relevance, attributeName, attribute, urlAttributeName) { |
- goog.asserts.assert(goog.isNumber(relevance)); |
- goog.asserts.assert(goog.isString(attributeName)); |
- goog.asserts.assert(goog.isString(attribute)); |
- |
- if (goog.string.caseInsensitiveEquals( |
- goog.string.makeSafe(element.getAttribute(attributeName)), attribute)) { |
- var url = element.getAttribute(urlAttributeName); |
- if (!url || goog.string.startsWith(url, 'data:')) { |
- return null; |
- } |
- var width = goog.string.parseInt( |
- element.getAttribute(CustomAttribute.WIDTH)); |
- var height = goog.string.parseInt( |
- element.getAttribute(CustomAttribute.HEIGHT)); |
- if (width && height) { |
- // For non-toplevel urls, demote the image if it is not in the document. |
- var ownerDocument = goog.dom.getOwnerDocument(element); |
- if (goog.uri.utils.getPath(ownerDocument.documentURI) != '/' && |
- ownerDocument.body.innerHTML.indexOf(url) == -1) { |
- relevance *= Parameters.NON_TOPLEVEL_DEMOTION_FACTOR; |
- } |
- var size = new goog.math.Size(width, height); |
- return new DocumentImage(relevance, url, size); |
- } |
- } |
- return null; |
-}; |
- |
- |
-/** |
- * @param {!Element} element |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractMicrodataImage_ = function(element) { |
- var itemProp = element.getAttribute('itemprop'); |
- if (itemProp && itemProp.toLowerCase() == 'thumbnailurl') { |
- var url = element.getAttribute('href') || element.getAttribute('content'); |
- if (!url || goog.string.startsWith(url, 'data:')) { |
- return null; |
- } |
- var width = goog.string.parseInt( |
- element.getAttribute(CustomAttribute.WIDTH)); |
- var height = goog.string.parseInt( |
- element.getAttribute(CustomAttribute.HEIGHT)); |
- if (width && height) { |
- var size = new goog.math.Size(width, height); |
- return new DocumentImage(Parameters.WEIGHT_MICRODATA, url, size); |
- } |
- } |
- return null; |
-}; |
- |
- |
-/** |
- * @param {!Element} element |
- * @return {number} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.getElementRelevance_ = function(element) { |
- var offset = goog.style.getPageOffsetTop(element); |
- if (offset > Parameters.MAX_OFFSET) { |
- return 0; |
- } |
- return 1 / (1 + Math.exp(Parameters.OFFSET_MULTIPLIER * offset)); |
-}; |
- |
- |
-/** |
- * Extracts an image from the <img> HTML element. |
- * @param {!Element} element |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractImage_ = function(element) { |
- goog.asserts.assert(element.tagName.toLowerCase() == 'img'); |
- var url = element.src; |
- // We cannot handle data URIs. |
- if (url && !goog.string.startsWith(url, 'data:')) { |
- var naturalSize = new goog.math.Size( |
- element.naturalWidth, element.naturalHeight); |
- var displaySize = goog.style.getSize(element); |
- var size = naturalSize.area() < displaySize.area() ? |
- naturalSize : displaySize; |
- if (size.width && size.height) { |
- var relevance = this.getElementRelevance_(element); |
- return new DocumentImage(relevance, url, naturalSize, displaySize); |
- } |
- } |
- return null; |
-}; |
- |
- |
-/** |
- * Extracts an image specified in 'background-image' property of an element. |
- * @param {!Element} element |
- * @return {DocumentImage} |
- * @private |
- */ |
-DocumentImageExtractor.prototype.extractBackgroundImage_ = function(element) { |
- var backgroundImage = goog.style.getComputedStyle( |
- element, 'background-image'); |
- var backgroundRepeat = goog.style.getComputedStyle( |
- element, 'background-repeat'); |
- var backgroundSize = goog.style.getComputedStyle( |
- element, 'background-size'); |
- if (backgroundImage && |
- (backgroundRepeat == 'no-repeat' || backgroundSize == 'cover') && |
- goog.string.startsWith(backgroundImage, 'url(') && |
- goog.string.endsWith(backgroundImage, ')')) { |
- var url = backgroundImage.substr(4, backgroundImage.length - 5); |
- if (url && !goog.string.startsWith(url, 'data:')) { |
- var size = goog.style.getSize(element); |
- if (size.width && size.height) { |
- var relevance = this.getElementRelevance_(element); |
- var children = goog.dom.getChildren(element); |
- for (var i = 0; i < children.length; ++i) { |
- var child = children[i]; |
- if (goog.style.getComputedStyle(child, 'display') != 'none' && |
- goog.style.getSize(child).area() > 0.1 * size.area()) { |
- relevance *= 0.1; |
- break; |
- } |
- } |
- return new DocumentImage(relevance, url, |
- undefined /* image size is unknown */, size); |
- } |
- } |
- } |
- return null; |
-}; |
-}); // goog.scope |