Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(73)

Side by Side Diff: third_party/document_image_extractor/src/document_image_extractor.js

Issue 1138123002: Update third_party/document_image_extractor (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD NEW
1 // Copyright 2015 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 goog.provide('image.collections.extension.DocumentImageExtractor'); 5 goog.provide('image.collections.extension.domextractor.DocumentImageExtractor');
6 6
7 goog.require('goog.Uri'); 7 goog.require('image.collections.extension.domextractor.AdElementFilter');
8 goog.require('goog.asserts'); 8 goog.require('image.collections.extension.domextractor.DocumentFeature');
9 goog.require('goog.dom'); 9 goog.require('image.collections.extension.domextractor.DocumentFeatureExtractor');
10 goog.require('goog.log'); 10 goog.require('image.collections.extension.domextractor.DocumentImage');
11 goog.require('goog.math.Size'); 11 goog.require('image.collections.extension.domextractor.DomUtils');
12 goog.require('goog.object'); 12 goog.require('image.collections.extension.domextractor.Size');
13 goog.require('goog.string'); 13 goog.require('image.collections.extension.domextractor.VisibleElementFilter');
14 goog.require('goog.style');
15 goog.require('goog.uri.utils');
16 goog.require('gws.collections.common.Constants');
17 goog.require('image.collections.extension.AdElementFilter');
18 goog.require('image.collections.extension.DocumentFeature');
19 goog.require('image.collections.extension.DocumentFeatureExtractor');
20 goog.require('image.collections.extension.DocumentImage');
21 goog.require('image.collections.extension.VisibleElementFilter');
22 14
23 goog.scope(function() { 15 goog.scope(function() {
24 var AdElementFilter = image.collections.extension.AdElementFilter; 16 var AdElementFilter = image.collections.extension.domextractor.AdElementFilter;
25 var Constants = gws.collections.common.Constants; 17 var DocumentFeature = image.collections.extension.domextractor.DocumentFeature;
26 var DocumentFeature = image.collections.extension.DocumentFeature;
27 var DocumentFeatureExtractor = 18 var DocumentFeatureExtractor =
28 image.collections.extension.DocumentFeatureExtractor; 19 image.collections.extension.domextractor.DocumentFeatureExtractor;
29 var DocumentImage = image.collections.extension.DocumentImage; 20 var DocumentImage = image.collections.extension.domextractor.DocumentImage;
30 var CustomAttribute = DocumentImage.CustomAttribute; 21 var CustomAttribute = DocumentImage.CustomAttribute;
31 var VisibleElementFilter = image.collections.extension.VisibleElementFilter; 22 var DomUtils = image.collections.extension.domextractor.DomUtils;
23 var Size = image.collections.extension.domextractor.Size;
24 var VisibleElementFilter =
25 image.collections.extension.domextractor.VisibleElementFilter;
32 26
33 27
34 28
29 /** @const {number} The minimum width of extracted images. */
30 var EXTRACT_MIN_WIDTH = 100;
31
32
33 /** @const {number} The minimum height of extracted images. */
34 var EXTRACT_MIN_HEIGHT = 100;
35
36
35 /** 37 /**
36 * This class is used for extracting a salient image from an HTML document. 38 * This class is used for extracting a salient image from an HTML document.
37 * @extends {DocumentFeatureExtractor} 39 * @extends {DocumentFeatureExtractor}
38 * @constructor 40 * @constructor
41 * @suppress {undefinedNames}
39 */ 42 */
40 image.collections.extension.DocumentImageExtractor = function() { 43 image.collections.extension.domextractor.DocumentImageExtractor = function() {
41 DocumentImageExtractor.base(this, 'constructor'); 44 DocumentImageExtractor.base(this, 'constructor');
42 45
43 this.addFilter(new AdElementFilter()); 46 this.addFilter(new AdElementFilter());
44 this.addFilter(new VisibleElementFilter()); 47 this.addFilter(new VisibleElementFilter());
48
49 /** @private {!Element} Helper element for resolving URLs. */
50 this.helperAnchor_ = document.createElement('a');
45 }; 51 };
46 goog.inherits(image.collections.extension.DocumentImageExtractor, 52 DomUtils.inherits(
53 image.collections.extension.domextractor.DocumentImageExtractor,
47 DocumentFeatureExtractor); 54 DocumentFeatureExtractor);
48 var DocumentImageExtractor = image.collections.extension.DocumentImageExtractor; 55 var DocumentImageExtractor =
49 goog.addSingletonGetter(DocumentImageExtractor); 56 image.collections.extension.domextractor.DocumentImageExtractor;
50
51
52 /** @private {goog.log.Logger} Extractor logger. */
53 DocumentImageExtractor.logger_ = goog.log.getLogger(
54 'image.collections.extension.DocumentImageExtractor');
55 57
56 58
57 /** @enum {number} */ 59 /** @enum {number} */
58 DocumentImageExtractor.Parameters = { 60 DocumentImageExtractor.Parameters = {
59 AREA_MULTIPLIER: -1e-5, 61 AREA_MULTIPLIER: -1e-5,
60 ASPECT_RATIO_DEMOTION_FACTOR: 0.8, 62 ASPECT_RATIO_DEMOTION_FACTOR: 0.8,
61 MAX_ASPECT_RATIO: 2, 63 MAX_ASPECT_RATIO: 2,
62 MAX_ELEMENTS_WITH_BACKGROUND: 30, 64 MAX_ELEMENTS_WITH_BACKGROUND: 30,
63 MAX_OFFSET: 2000, 65 MAX_OFFSET: 2000,
64 NON_TOPLEVEL_DEMOTION_FACTOR: 0.5, 66 NON_TOPLEVEL_DEMOTION_FACTOR: 0.5,
65 OFFSET_MULTIPLIER: 1e-3, 67 OFFSET_MULTIPLIER: 1e-3,
66 WEIGHT_APPLE: 0.7, 68 WEIGHT_APPLE: 0.7,
67 WEIGHT_MICRODATA: 0.55, 69 WEIGHT_MICRODATA: 0.55,
68 WEIGHT_MICROSOFT: 0.9, 70 WEIGHT_MICROSOFT: 0.9,
69 WEIGHT_OPEN_GRAPH: 1.0, 71 WEIGHT_OPEN_GRAPH: 1.0,
70 WEIGHT_SRC: 0.6, 72 WEIGHT_SRC: 0.6,
71 WEIGHT_TWITTER: 0.8 73 WEIGHT_TWITTER: 0.8
72 }; 74 };
73 var Parameters = DocumentImageExtractor.Parameters; 75 var Parameters = DocumentImageExtractor.Parameters;
74 76
75 77
76 /** 78 /**
77 * Map of image type to relevance multiplier. 79 * Map of image type to relevance multiplier.
78 * @private {!Object.<string, number>} 80 * @private {!Object.<string, number>}
79 */ 81 */
80 DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_ = 82 DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_ = {'.gif': 0.5};
81 goog.object.create('.gif', 0.5);
82 83
83 84
84 85
85 /** @constructor */ 86 /** @constructor */
86 DocumentImageExtractor.Context = function() { 87 DocumentImageExtractor.Context = function() {
87 /** @type {number} */ 88 /** @type {number} */
88 this.numElementsWithBackground = 0; 89 this.numElementsWithBackground = 0;
89 90
90 /** @type {!Object.<string, number>} */ 91 /** @type {!Object.<string, number>} */
91 this.urlToRelevance = {}; 92 this.urlToRelevance = {};
(...skipping 48 matching lines...) Expand 10 before | Expand all | Expand 10 after
140 ++opt_context.numElementsWithBackground; 141 ++opt_context.numElementsWithBackground;
141 } 142 }
142 } 143 }
143 } 144 }
144 145
145 if (!image) { 146 if (!image) {
146 return null; 147 return null;
147 } 148 }
148 149
149 var size = image.getDisplaySize() || image.getSize(); 150 var size = image.getDisplaySize() || image.getSize();
150 goog.asserts.assert(!goog.isNull(size));
151 if (image.getUrl() != document.location.href) { 151 if (image.getUrl() != document.location.href) {
152 // Ignore images that are too small. 152 // Ignore images that are too small.
153 if (size.width < Constants.EXTRACT_MIN_WIDTH || 153 if (size.width < EXTRACT_MIN_WIDTH ||
154 size.height < Constants.EXTRACT_MIN_HEIGHT) { 154 size.height < EXTRACT_MIN_HEIGHT) {
155 return null; 155 return null;
156 } 156 }
157 } 157 }
158 // Demote smaller images (squash area using the sigmoid function). 158 // Demote smaller images (squash area using the sigmoid function).
159 var relevance = image.getRelevance(); 159 var relevance = image.getRelevance();
160 relevance /= (1 + Math.exp(Parameters.AREA_MULTIPLIER * size.area())); 160 relevance /= (1 + Math.exp(Parameters.AREA_MULTIPLIER * size.area()));
161 // Demote images with bad aspect ratio. 161 // Demote images with bad aspect ratio.
162 var aspectRatio = size.aspectRatio(); 162 var aspectRatio = size.width / size.height;
163 if (aspectRatio < 1) { 163 if (aspectRatio < 1) {
164 aspectRatio = 1 / aspectRatio; 164 aspectRatio = 1 / aspectRatio;
165 } 165 }
166 if (aspectRatio > Parameters.MAX_ASPECT_RATIO) { 166 if (aspectRatio > Parameters.MAX_ASPECT_RATIO) {
167 relevance *= Parameters.ASPECT_RATIO_DEMOTION_FACTOR; 167 relevance *= Parameters.ASPECT_RATIO_DEMOTION_FACTOR;
168 } 168 }
169 169
170 // TODO(busaryev): use the following features: 170 // TODO(busaryev): use the following features:
171 // - relative size of the image comparing to neighbors; 171 // - relative size of the image comparing to neighbors;
172 // - area of the visible portion of the image; 172 // - area of the visible portion of the image;
173 // - position (demote images on the border of the page). 173 // - position (demote images on the border of the page).
174 174
175 var url = image.getUrl(); 175 var url = image.getUrl();
176 try { 176 // Make sure that image url is absolute.
177 // Make sure that image url is absolute. 177 this.helperAnchor_.href = url;
178 var documentUrl = goog.dom.getOwnerDocument(element).documentURI; 178 url = this.helperAnchor_.href;
179 url = goog.Uri.resolve(documentUrl, url).toString();
180 } catch (e) {
181 goog.log.info(DocumentImageExtractor.logger_,
182 'Cannot resolve url: ' + url);
183 return null;
184 }
185 179
186 var imagePath = goog.string.makeSafe(goog.uri.utils.getPath(url)); 180 var imagePath = decodeURIComponent(this.helperAnchor_.pathname || '');
187 var lastDot = imagePath.lastIndexOf('.'); 181 var lastDot = imagePath.lastIndexOf('.');
188 if (lastDot > 0) { 182 if (lastDot > 0) {
189 var imageType = imagePath.slice(lastDot); 183 var imageType = imagePath.slice(lastDot);
190 var multiplier = 184 var multiplier =
191 DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_[imageType]; 185 DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_[imageType];
192 if (multiplier) { 186 if (multiplier) {
193 relevance *= multiplier; 187 relevance *= multiplier;
194 } 188 }
195 } 189 }
196 190
(...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after
260 254
261 255
262 /** 256 /**
263 * @param {!Element} element 257 * @param {!Element} element
264 * @param {!number} relevance 258 * @param {!number} relevance
265 * @param {string} attributeName 259 * @param {string} attributeName
266 * @param {string} attribute 260 * @param {string} attribute
267 * @param {string} urlAttributeName 261 * @param {string} urlAttributeName
268 * @return {DocumentImage} 262 * @return {DocumentImage}
269 * @private 263 * @private
264 * @suppress {missingProperties}
270 */ 265 */
271 DocumentImageExtractor.prototype.extractCanonicalImage_ = function( 266 DocumentImageExtractor.prototype.extractCanonicalImage_ = function(
272 element, relevance, attributeName, attribute, urlAttributeName) { 267 element, relevance, attributeName, attribute, urlAttributeName) {
273 goog.asserts.assert(goog.isNumber(relevance)); 268 if (element.hasAttribute(attributeName) &&
274 goog.asserts.assert(goog.isString(attributeName)); 269 element.getAttribute(attributeName).toLowerCase() ==
275 goog.asserts.assert(goog.isString(attribute)); 270 attribute.toLowerCase()) {
276
277 if (goog.string.caseInsensitiveEquals(
278 goog.string.makeSafe(element.getAttribute(attributeName)), attribute)) {
279 var url = element.getAttribute(urlAttributeName); 271 var url = element.getAttribute(urlAttributeName);
280 if (!url || goog.string.startsWith(url, 'data:')) { 272 if (!url || url.startsWith('data:')) {
281 return null; 273 return null;
282 } 274 }
283 var width = goog.string.parseInt( 275 var width = parseInt(element.getAttribute(CustomAttribute.WIDTH), 10);
284 element.getAttribute(CustomAttribute.WIDTH)); 276 var height = parseInt(element.getAttribute(CustomAttribute.HEIGHT), 10);
285 var height = goog.string.parseInt(
286 element.getAttribute(CustomAttribute.HEIGHT));
287 if (width && height) { 277 if (width && height) {
288 // For non-toplevel urls, demote the image if it is not in the document. 278 // For non-toplevel urls, demote the image if it is not in the document.
289 var ownerDocument = goog.dom.getOwnerDocument(element); 279 var ownerDocument = DomUtils.getOwnerDocument(element);
290 if (goog.uri.utils.getPath(ownerDocument.documentURI) != '/' && 280 this.helperAnchor_.href = ownerDocument.documentURI;
291 ownerDocument.body.innerHTML.indexOf(url) == -1) { 281 var path = this.helperAnchor_.pathname;
282 if (path != '/' && ownerDocument.body.innerHTML.indexOf(url) == -1) {
292 relevance *= Parameters.NON_TOPLEVEL_DEMOTION_FACTOR; 283 relevance *= Parameters.NON_TOPLEVEL_DEMOTION_FACTOR;
293 } 284 }
294 var size = new goog.math.Size(width, height); 285 var size = new Size(width, height);
295 return new DocumentImage(relevance, url, size); 286 return new DocumentImage(relevance, url, size);
296 } 287 }
297 } 288 }
298 return null; 289 return null;
299 }; 290 };
300 291
301 292
302 /** 293 /**
303 * @param {!Element} element 294 * @param {!Element} element
304 * @return {DocumentImage} 295 * @return {DocumentImage}
305 * @private 296 * @private
297 * @suppress {missingProperties}
306 */ 298 */
307 DocumentImageExtractor.prototype.extractMicrodataImage_ = function(element) { 299 DocumentImageExtractor.prototype.extractMicrodataImage_ = function(element) {
308 var itemProp = element.getAttribute('itemprop'); 300 var itemProp = element.getAttribute('itemprop');
309 if (itemProp && itemProp.toLowerCase() == 'thumbnailurl') { 301 if (itemProp && itemProp.toLowerCase() == 'thumbnailurl') {
310 var url = element.getAttribute('href') || element.getAttribute('content'); 302 var url = element.getAttribute('href') || element.getAttribute('content');
311 if (!url || goog.string.startsWith(url, 'data:')) { 303 if (!url || url.startsWith('data:')) {
312 return null; 304 return null;
313 } 305 }
314 var width = goog.string.parseInt( 306 var width = parseInt(element.getAttribute(CustomAttribute.WIDTH), 10);
315 element.getAttribute(CustomAttribute.WIDTH)); 307 var height = parseInt(element.getAttribute(CustomAttribute.HEIGHT), 10);
316 var height = goog.string.parseInt(
317 element.getAttribute(CustomAttribute.HEIGHT));
318 if (width && height) { 308 if (width && height) {
319 var size = new goog.math.Size(width, height); 309 var size = new Size(width, height);
320 return new DocumentImage(Parameters.WEIGHT_MICRODATA, url, size); 310 return new DocumentImage(Parameters.WEIGHT_MICRODATA, url, size);
321 } 311 }
322 } 312 }
323 return null; 313 return null;
324 }; 314 };
325 315
326 316
327 /** 317 /**
328 * @param {!Element} element 318 * @param {!Element} element
329 * @return {number} 319 * @return {number}
330 * @private 320 * @private
331 */ 321 */
332 DocumentImageExtractor.prototype.getElementRelevance_ = function(element) { 322 DocumentImageExtractor.prototype.getElementRelevance_ = function(element) {
333 var offset = goog.style.getPageOffsetTop(element); 323 var offset = DomUtils.getPageOffsetTop(element);
334 if (offset > Parameters.MAX_OFFSET) { 324 if (offset > Parameters.MAX_OFFSET) {
335 return 0; 325 return 0;
336 } 326 }
337 return 1 / (1 + Math.exp(Parameters.OFFSET_MULTIPLIER * offset)); 327 return 1 / (1 + Math.exp(Parameters.OFFSET_MULTIPLIER * offset));
338 }; 328 };
339 329
340 330
341 /** 331 /**
342 * Extracts an image from the <img> HTML element. 332 * Extracts an image from the <img> HTML element.
343 * @param {!Element} element 333 * @param {!Element} element
344 * @return {DocumentImage} 334 * @return {DocumentImage}
345 * @private 335 * @private
346 */ 336 */
347 DocumentImageExtractor.prototype.extractImage_ = function(element) { 337 DocumentImageExtractor.prototype.extractImage_ = function(element) {
348 goog.asserts.assert(element.tagName.toLowerCase() == 'img');
349 var url = element.src; 338 var url = element.src;
350 // We cannot handle data URIs. 339 // We cannot handle data URIs.
351 if (url && !goog.string.startsWith(url, 'data:')) { 340 if (url && !url.startsWith('data:')) {
352 var naturalSize = new goog.math.Size( 341 var naturalSize = new Size(element.naturalWidth, element.naturalHeight);
353 element.naturalWidth, element.naturalHeight); 342 var displaySize = DomUtils.getSize(element);
354 var displaySize = goog.style.getSize(element);
355 var size = naturalSize.area() < displaySize.area() ? 343 var size = naturalSize.area() < displaySize.area() ?
356 naturalSize : displaySize; 344 naturalSize : displaySize;
357 if (size.width && size.height) { 345 if (size.width && size.height) {
358 var relevance = this.getElementRelevance_(element); 346 var relevance = this.getElementRelevance_(element);
359 return new DocumentImage(relevance, url, naturalSize, displaySize); 347 return new DocumentImage(relevance, url, naturalSize, displaySize);
360 } 348 }
361 } 349 }
362 return null; 350 return null;
363 }; 351 };
364 352
365 353
366 /** 354 /**
367 * Extracts an image specified in 'background-image' property of an element. 355 * Extracts an image specified in 'background-image' property of an element.
368 * @param {!Element} element 356 * @param {!Element} element
369 * @return {DocumentImage} 357 * @return {DocumentImage}
370 * @private 358 * @private
359 * @suppress {missingProperties}
371 */ 360 */
372 DocumentImageExtractor.prototype.extractBackgroundImage_ = function(element) { 361 DocumentImageExtractor.prototype.extractBackgroundImage_ = function(element) {
373 var backgroundImage = goog.style.getComputedStyle( 362 var backgroundImage = DomUtils.getComputedStyle(
374 element, 'background-image'); 363 element, 'background-image');
375 var backgroundRepeat = goog.style.getComputedStyle( 364 var backgroundRepeat = DomUtils.getComputedStyle(
376 element, 'background-repeat'); 365 element, 'background-repeat');
377 var backgroundSize = goog.style.getComputedStyle( 366 var backgroundSize = DomUtils.getComputedStyle(
378 element, 'background-size'); 367 element, 'background-size');
379 if (backgroundImage && 368 if (backgroundImage &&
380 (backgroundRepeat == 'no-repeat' || backgroundSize == 'cover') && 369 (backgroundRepeat == 'no-repeat' || backgroundSize == 'cover') &&
381 goog.string.startsWith(backgroundImage, 'url(') && 370 backgroundImage.startsWith('url(') &&
382 goog.string.endsWith(backgroundImage, ')')) { 371 backgroundImage.endsWith(')')) {
383 var url = backgroundImage.substr(4, backgroundImage.length - 5); 372 var url = backgroundImage.substr(4, backgroundImage.length - 5);
384 if (url && !goog.string.startsWith(url, 'data:')) { 373 if (url && !url.startsWith('data:')) {
385 var size = goog.style.getSize(element); 374 var size = DomUtils.getSize(element);
386 if (size.width && size.height) { 375 if (size.width && size.height) {
387 var relevance = this.getElementRelevance_(element); 376 var relevance = this.getElementRelevance_(element);
388 var children = goog.dom.getChildren(element); 377 var children = element.children;
389 for (var i = 0; i < children.length; ++i) { 378 for (var i = 0; i < children.length; ++i) {
390 var child = children[i]; 379 var child = children[i];
391 if (goog.style.getComputedStyle(child, 'display') != 'none' && 380 if (DomUtils.getComputedStyle(child, 'display') != 'none' &&
392 goog.style.getSize(child).area() > 0.1 * size.area()) { 381 DomUtils.getSize(child).area() > 0.1 * size.area()) {
393 relevance *= 0.1; 382 relevance *= 0.1;
394 break; 383 break;
395 } 384 }
396 } 385 }
397 return new DocumentImage(relevance, url, 386 return new DocumentImage(relevance, url,
398 undefined /* image size is unknown */, size); 387 undefined /* image size is unknown */, size);
399 } 388 }
400 } 389 }
401 } 390 }
402 return null; 391 return null;
403 }; 392 };
404 }); // goog.scope 393 }); // goog.scope
OLD NEW
« no previous file with comments | « third_party/document_image_extractor/src/document_image.js ('k') | third_party/document_image_extractor/src/document_video.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698