Index: third_party/document_image_extractor/src/document_image_extractor.js |
diff --git a/third_party/document_image_extractor/src/document_image_extractor.js b/third_party/document_image_extractor/src/document_image_extractor.js |
index 38c59bb7388b3f439313b86afcfd4684554aeef2..a9ee672d032641557848c731039a441653777466 100644 |
--- a/third_party/document_image_extractor/src/document_image_extractor.js |
+++ b/third_party/document_image_extractor/src/document_image_extractor.js |
@@ -2,56 +2,58 @@ |
// Use of this source code is governed by a BSD-style license that can be |
// found in the LICENSE file. |
-goog.provide('image.collections.extension.DocumentImageExtractor'); |
- |
-goog.require('goog.Uri'); |
-goog.require('goog.asserts'); |
-goog.require('goog.dom'); |
-goog.require('goog.log'); |
-goog.require('goog.math.Size'); |
-goog.require('goog.object'); |
-goog.require('goog.string'); |
-goog.require('goog.style'); |
-goog.require('goog.uri.utils'); |
-goog.require('gws.collections.common.Constants'); |
-goog.require('image.collections.extension.AdElementFilter'); |
-goog.require('image.collections.extension.DocumentFeature'); |
-goog.require('image.collections.extension.DocumentFeatureExtractor'); |
-goog.require('image.collections.extension.DocumentImage'); |
-goog.require('image.collections.extension.VisibleElementFilter'); |
+goog.provide('image.collections.extension.domextractor.DocumentImageExtractor'); |
+ |
+goog.require('image.collections.extension.domextractor.AdElementFilter'); |
+goog.require('image.collections.extension.domextractor.DocumentFeature'); |
+goog.require('image.collections.extension.domextractor.DocumentFeatureExtractor'); |
+goog.require('image.collections.extension.domextractor.DocumentImage'); |
+goog.require('image.collections.extension.domextractor.DomUtils'); |
+goog.require('image.collections.extension.domextractor.Size'); |
+goog.require('image.collections.extension.domextractor.VisibleElementFilter'); |
goog.scope(function() { |
-var AdElementFilter = image.collections.extension.AdElementFilter; |
-var Constants = gws.collections.common.Constants; |
-var DocumentFeature = image.collections.extension.DocumentFeature; |
+var AdElementFilter = image.collections.extension.domextractor.AdElementFilter; |
+var DocumentFeature = image.collections.extension.domextractor.DocumentFeature; |
var DocumentFeatureExtractor = |
- image.collections.extension.DocumentFeatureExtractor; |
-var DocumentImage = image.collections.extension.DocumentImage; |
+ image.collections.extension.domextractor.DocumentFeatureExtractor; |
+var DocumentImage = image.collections.extension.domextractor.DocumentImage; |
var CustomAttribute = DocumentImage.CustomAttribute; |
-var VisibleElementFilter = image.collections.extension.VisibleElementFilter; |
+var DomUtils = image.collections.extension.domextractor.DomUtils; |
+var Size = image.collections.extension.domextractor.Size; |
+var VisibleElementFilter = |
+ image.collections.extension.domextractor.VisibleElementFilter; |
+ |
+ |
+/** @const {number} The minimum width of extracted images. */ |
+var EXTRACT_MIN_WIDTH = 100; |
+ |
+ |
+/** @const {number} The minimum height of extracted images. */ |
+var EXTRACT_MIN_HEIGHT = 100; |
/** |
* This class is used for extracting a salient image from an HTML document. |
* @extends {DocumentFeatureExtractor} |
* @constructor |
+ * @suppress {undefinedNames} |
*/ |
-image.collections.extension.DocumentImageExtractor = function() { |
+image.collections.extension.domextractor.DocumentImageExtractor = function() { |
DocumentImageExtractor.base(this, 'constructor'); |
this.addFilter(new AdElementFilter()); |
this.addFilter(new VisibleElementFilter()); |
+ |
+ /** @private {!Element} Helper element for resolving URLs. */ |
+ this.helperAnchor_ = document.createElement('a'); |
}; |
-goog.inherits(image.collections.extension.DocumentImageExtractor, |
+DomUtils.inherits( |
+ image.collections.extension.domextractor.DocumentImageExtractor, |
DocumentFeatureExtractor); |
-var DocumentImageExtractor = image.collections.extension.DocumentImageExtractor; |
-goog.addSingletonGetter(DocumentImageExtractor); |
- |
- |
-/** @private {goog.log.Logger} Extractor logger. */ |
-DocumentImageExtractor.logger_ = goog.log.getLogger( |
- 'image.collections.extension.DocumentImageExtractor'); |
+var DocumentImageExtractor = |
+ image.collections.extension.domextractor.DocumentImageExtractor; |
/** @enum {number} */ |
@@ -77,8 +79,7 @@ var Parameters = DocumentImageExtractor.Parameters; |
* Map of image type to relevance multiplier. |
* @private {!Object.<string, number>} |
*/ |
-DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_ = |
- goog.object.create('.gif', 0.5); |
+DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_ = {'.gif': 0.5}; |
@@ -147,11 +148,10 @@ DocumentImageExtractor.prototype.extractFromElement = function( |
} |
var size = image.getDisplaySize() || image.getSize(); |
- goog.asserts.assert(!goog.isNull(size)); |
if (image.getUrl() != document.location.href) { |
// Ignore images that are too small. |
- if (size.width < Constants.EXTRACT_MIN_WIDTH || |
- size.height < Constants.EXTRACT_MIN_HEIGHT) { |
+ if (size.width < EXTRACT_MIN_WIDTH || |
+ size.height < EXTRACT_MIN_HEIGHT) { |
return null; |
} |
} |
@@ -159,7 +159,7 @@ DocumentImageExtractor.prototype.extractFromElement = function( |
var relevance = image.getRelevance(); |
relevance /= (1 + Math.exp(Parameters.AREA_MULTIPLIER * size.area())); |
// Demote images with bad aspect ratio. |
- var aspectRatio = size.aspectRatio(); |
+ var aspectRatio = size.width / size.height; |
if (aspectRatio < 1) { |
aspectRatio = 1 / aspectRatio; |
} |
@@ -173,17 +173,11 @@ DocumentImageExtractor.prototype.extractFromElement = function( |
// - position (demote images on the border of the page). |
var url = image.getUrl(); |
- try { |
- // Make sure that image url is absolute. |
- var documentUrl = goog.dom.getOwnerDocument(element).documentURI; |
- url = goog.Uri.resolve(documentUrl, url).toString(); |
- } catch (e) { |
- goog.log.info(DocumentImageExtractor.logger_, |
- 'Cannot resolve url: ' + url); |
- return null; |
- } |
+ // Make sure that image url is absolute. |
+ this.helperAnchor_.href = url; |
+ url = this.helperAnchor_.href; |
- var imagePath = goog.string.makeSafe(goog.uri.utils.getPath(url)); |
+ var imagePath = decodeURIComponent(this.helperAnchor_.pathname || ''); |
var lastDot = imagePath.lastIndexOf('.'); |
if (lastDot > 0) { |
var imageType = imagePath.slice(lastDot); |
@@ -267,31 +261,28 @@ DocumentImageExtractor.prototype.extractImageSrcImage_ = function(element) { |
* @param {string} urlAttributeName |
* @return {DocumentImage} |
* @private |
+ * @suppress {missingProperties} |
*/ |
DocumentImageExtractor.prototype.extractCanonicalImage_ = function( |
element, relevance, attributeName, attribute, urlAttributeName) { |
- goog.asserts.assert(goog.isNumber(relevance)); |
- goog.asserts.assert(goog.isString(attributeName)); |
- goog.asserts.assert(goog.isString(attribute)); |
- |
- if (goog.string.caseInsensitiveEquals( |
- goog.string.makeSafe(element.getAttribute(attributeName)), attribute)) { |
+ if (element.hasAttribute(attributeName) && |
+ element.getAttribute(attributeName).toLowerCase() == |
+ attribute.toLowerCase()) { |
var url = element.getAttribute(urlAttributeName); |
- if (!url || goog.string.startsWith(url, 'data:')) { |
+ if (!url || url.startsWith('data:')) { |
return null; |
} |
- var width = goog.string.parseInt( |
- element.getAttribute(CustomAttribute.WIDTH)); |
- var height = goog.string.parseInt( |
- element.getAttribute(CustomAttribute.HEIGHT)); |
+ var width = parseInt(element.getAttribute(CustomAttribute.WIDTH), 10); |
+ var height = parseInt(element.getAttribute(CustomAttribute.HEIGHT), 10); |
if (width && height) { |
// For non-toplevel urls, demote the image if it is not in the document. |
- var ownerDocument = goog.dom.getOwnerDocument(element); |
- if (goog.uri.utils.getPath(ownerDocument.documentURI) != '/' && |
- ownerDocument.body.innerHTML.indexOf(url) == -1) { |
+ var ownerDocument = DomUtils.getOwnerDocument(element); |
+ this.helperAnchor_.href = ownerDocument.documentURI; |
+ var path = this.helperAnchor_.pathname; |
+ if (path != '/' && ownerDocument.body.innerHTML.indexOf(url) == -1) { |
relevance *= Parameters.NON_TOPLEVEL_DEMOTION_FACTOR; |
} |
- var size = new goog.math.Size(width, height); |
+ var size = new Size(width, height); |
return new DocumentImage(relevance, url, size); |
} |
} |
@@ -303,20 +294,19 @@ DocumentImageExtractor.prototype.extractCanonicalImage_ = function( |
* @param {!Element} element |
* @return {DocumentImage} |
* @private |
+ * @suppress {missingProperties} |
*/ |
DocumentImageExtractor.prototype.extractMicrodataImage_ = function(element) { |
var itemProp = element.getAttribute('itemprop'); |
if (itemProp && itemProp.toLowerCase() == 'thumbnailurl') { |
var url = element.getAttribute('href') || element.getAttribute('content'); |
- if (!url || goog.string.startsWith(url, 'data:')) { |
+ if (!url || url.startsWith('data:')) { |
return null; |
} |
- var width = goog.string.parseInt( |
- element.getAttribute(CustomAttribute.WIDTH)); |
- var height = goog.string.parseInt( |
- element.getAttribute(CustomAttribute.HEIGHT)); |
+ var width = parseInt(element.getAttribute(CustomAttribute.WIDTH), 10); |
+ var height = parseInt(element.getAttribute(CustomAttribute.HEIGHT), 10); |
if (width && height) { |
- var size = new goog.math.Size(width, height); |
+ var size = new Size(width, height); |
return new DocumentImage(Parameters.WEIGHT_MICRODATA, url, size); |
} |
} |
@@ -330,7 +320,7 @@ DocumentImageExtractor.prototype.extractMicrodataImage_ = function(element) { |
* @private |
*/ |
DocumentImageExtractor.prototype.getElementRelevance_ = function(element) { |
- var offset = goog.style.getPageOffsetTop(element); |
+ var offset = DomUtils.getPageOffsetTop(element); |
if (offset > Parameters.MAX_OFFSET) { |
return 0; |
} |
@@ -345,13 +335,11 @@ DocumentImageExtractor.prototype.getElementRelevance_ = function(element) { |
* @private |
*/ |
DocumentImageExtractor.prototype.extractImage_ = function(element) { |
- goog.asserts.assert(element.tagName.toLowerCase() == 'img'); |
var url = element.src; |
// We cannot handle data URIs. |
- if (url && !goog.string.startsWith(url, 'data:')) { |
- var naturalSize = new goog.math.Size( |
- element.naturalWidth, element.naturalHeight); |
- var displaySize = goog.style.getSize(element); |
+ if (url && !url.startsWith('data:')) { |
+ var naturalSize = new Size(element.naturalWidth, element.naturalHeight); |
+ var displaySize = DomUtils.getSize(element); |
var size = naturalSize.area() < displaySize.area() ? |
naturalSize : displaySize; |
if (size.width && size.height) { |
@@ -368,28 +356,29 @@ DocumentImageExtractor.prototype.extractImage_ = function(element) { |
* @param {!Element} element |
* @return {DocumentImage} |
* @private |
+ * @suppress {missingProperties} |
*/ |
DocumentImageExtractor.prototype.extractBackgroundImage_ = function(element) { |
- var backgroundImage = goog.style.getComputedStyle( |
+ var backgroundImage = DomUtils.getComputedStyle( |
element, 'background-image'); |
- var backgroundRepeat = goog.style.getComputedStyle( |
+ var backgroundRepeat = DomUtils.getComputedStyle( |
element, 'background-repeat'); |
- var backgroundSize = goog.style.getComputedStyle( |
+ var backgroundSize = DomUtils.getComputedStyle( |
element, 'background-size'); |
if (backgroundImage && |
(backgroundRepeat == 'no-repeat' || backgroundSize == 'cover') && |
- goog.string.startsWith(backgroundImage, 'url(') && |
- goog.string.endsWith(backgroundImage, ')')) { |
+ backgroundImage.startsWith('url(') && |
+ backgroundImage.endsWith(')')) { |
var url = backgroundImage.substr(4, backgroundImage.length - 5); |
- if (url && !goog.string.startsWith(url, 'data:')) { |
- var size = goog.style.getSize(element); |
+ if (url && !url.startsWith('data:')) { |
+ var size = DomUtils.getSize(element); |
if (size.width && size.height) { |
var relevance = this.getElementRelevance_(element); |
- var children = goog.dom.getChildren(element); |
+ var children = element.children; |
for (var i = 0; i < children.length; ++i) { |
var child = children[i]; |
- if (goog.style.getComputedStyle(child, 'display') != 'none' && |
- goog.style.getSize(child).area() > 0.1 * size.area()) { |
+ if (DomUtils.getComputedStyle(child, 'display') != 'none' && |
+ DomUtils.getSize(child).area() > 0.1 * size.area()) { |
relevance *= 0.1; |
break; |
} |