OLD | NEW |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 goog.provide('image.collections.extension.DocumentImageExtractor'); | 5 goog.provide('image.collections.extension.domextractor.DocumentImageExtractor'); |
6 | 6 |
7 goog.require('goog.Uri'); | 7 goog.require('image.collections.extension.domextractor.AdElementFilter'); |
8 goog.require('goog.asserts'); | 8 goog.require('image.collections.extension.domextractor.DocumentFeature'); |
9 goog.require('goog.dom'); | 9 goog.require('image.collections.extension.domextractor.DocumentFeatureExtractor'); |
10 goog.require('goog.log'); | 10 goog.require('image.collections.extension.domextractor.DocumentImage'); |
11 goog.require('goog.math.Size'); | 11 goog.require('image.collections.extension.domextractor.DomUtils'); |
12 goog.require('goog.object'); | 12 goog.require('image.collections.extension.domextractor.Size'); |
13 goog.require('goog.string'); | 13 goog.require('image.collections.extension.domextractor.VisibleElementFilter'); |
14 goog.require('goog.style'); | |
15 goog.require('goog.uri.utils'); | |
16 goog.require('gws.collections.common.Constants'); | |
17 goog.require('image.collections.extension.AdElementFilter'); | |
18 goog.require('image.collections.extension.DocumentFeature'); | |
19 goog.require('image.collections.extension.DocumentFeatureExtractor'); | |
20 goog.require('image.collections.extension.DocumentImage'); | |
21 goog.require('image.collections.extension.VisibleElementFilter'); | |
22 | 14 |
23 goog.scope(function() { | 15 goog.scope(function() { |
24 var AdElementFilter = image.collections.extension.AdElementFilter; | 16 var AdElementFilter = image.collections.extension.domextractor.AdElementFilter; |
25 var Constants = gws.collections.common.Constants; | 17 var DocumentFeature = image.collections.extension.domextractor.DocumentFeature; |
26 var DocumentFeature = image.collections.extension.DocumentFeature; | |
27 var DocumentFeatureExtractor = | 18 var DocumentFeatureExtractor = |
28 image.collections.extension.DocumentFeatureExtractor; | 19 image.collections.extension.domextractor.DocumentFeatureExtractor; |
29 var DocumentImage = image.collections.extension.DocumentImage; | 20 var DocumentImage = image.collections.extension.domextractor.DocumentImage; |
30 var CustomAttribute = DocumentImage.CustomAttribute; | 21 var CustomAttribute = DocumentImage.CustomAttribute; |
31 var VisibleElementFilter = image.collections.extension.VisibleElementFilter; | 22 var DomUtils = image.collections.extension.domextractor.DomUtils; |
| 23 var Size = image.collections.extension.domextractor.Size; |
| 24 var VisibleElementFilter = |
| 25 image.collections.extension.domextractor.VisibleElementFilter; |
32 | 26 |
33 | 27 |
34 | 28 |
| 29 /** @const {number} The minimum width of extracted images. */ |
| 30 var EXTRACT_MIN_WIDTH = 100; |
| 31 |
| 32 |
| 33 /** @const {number} The minimum height of extracted images. */ |
| 34 var EXTRACT_MIN_HEIGHT = 100; |
| 35 |
| 36 |
35 /** | 37 /** |
36 * This class is used for extracting a salient image from an HTML document. | 38 * This class is used for extracting a salient image from an HTML document. |
37 * @extends {DocumentFeatureExtractor} | 39 * @extends {DocumentFeatureExtractor} |
38 * @constructor | 40 * @constructor |
| 41 * @suppress {undefinedNames} |
39 */ | 42 */ |
40 image.collections.extension.DocumentImageExtractor = function() { | 43 image.collections.extension.domextractor.DocumentImageExtractor = function() { |
41 DocumentImageExtractor.base(this, 'constructor'); | 44 DocumentImageExtractor.base(this, 'constructor'); |
42 | 45 |
43 this.addFilter(new AdElementFilter()); | 46 this.addFilter(new AdElementFilter()); |
44 this.addFilter(new VisibleElementFilter()); | 47 this.addFilter(new VisibleElementFilter()); |
| 48 |
| 49 /** @private {!Element} Helper element for resolving URLs. */ |
| 50 this.helperAnchor_ = document.createElement('a'); |
45 }; | 51 }; |
46 goog.inherits(image.collections.extension.DocumentImageExtractor, | 52 DomUtils.inherits( |
| 53 image.collections.extension.domextractor.DocumentImageExtractor, |
47 DocumentFeatureExtractor); | 54 DocumentFeatureExtractor); |
48 var DocumentImageExtractor = image.collections.extension.DocumentImageExtractor; | 55 var DocumentImageExtractor = |
49 goog.addSingletonGetter(DocumentImageExtractor); | 56 image.collections.extension.domextractor.DocumentImageExtractor; |
50 | |
51 | |
52 /** @private {goog.log.Logger} Extractor logger. */ | |
53 DocumentImageExtractor.logger_ = goog.log.getLogger( | |
54 'image.collections.extension.DocumentImageExtractor'); | |
55 | 57 |
56 | 58 |
57 /** @enum {number} */ | 59 /** @enum {number} */ |
58 DocumentImageExtractor.Parameters = { | 60 DocumentImageExtractor.Parameters = { |
59 AREA_MULTIPLIER: -1e-5, | 61 AREA_MULTIPLIER: -1e-5, |
60 ASPECT_RATIO_DEMOTION_FACTOR: 0.8, | 62 ASPECT_RATIO_DEMOTION_FACTOR: 0.8, |
61 MAX_ASPECT_RATIO: 2, | 63 MAX_ASPECT_RATIO: 2, |
62 MAX_ELEMENTS_WITH_BACKGROUND: 30, | 64 MAX_ELEMENTS_WITH_BACKGROUND: 30, |
63 MAX_OFFSET: 2000, | 65 MAX_OFFSET: 2000, |
64 NON_TOPLEVEL_DEMOTION_FACTOR: 0.5, | 66 NON_TOPLEVEL_DEMOTION_FACTOR: 0.5, |
65 OFFSET_MULTIPLIER: 1e-3, | 67 OFFSET_MULTIPLIER: 1e-3, |
66 WEIGHT_APPLE: 0.7, | 68 WEIGHT_APPLE: 0.7, |
67 WEIGHT_MICRODATA: 0.55, | 69 WEIGHT_MICRODATA: 0.55, |
68 WEIGHT_MICROSOFT: 0.9, | 70 WEIGHT_MICROSOFT: 0.9, |
69 WEIGHT_OPEN_GRAPH: 1.0, | 71 WEIGHT_OPEN_GRAPH: 1.0, |
70 WEIGHT_SRC: 0.6, | 72 WEIGHT_SRC: 0.6, |
71 WEIGHT_TWITTER: 0.8 | 73 WEIGHT_TWITTER: 0.8 |
72 }; | 74 }; |
73 var Parameters = DocumentImageExtractor.Parameters; | 75 var Parameters = DocumentImageExtractor.Parameters; |
74 | 76 |
75 | 77 |
76 /** | 78 /** |
77 * Map of image type to relevance multiplier. | 79 * Map of image type to relevance multiplier. |
78 * @private {!Object.<string, number>} | 80 * @private {!Object.<string, number>} |
79 */ | 81 */ |
80 DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_ = | 82 DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_ = {'.gif': 0.5}; |
81 goog.object.create('.gif', 0.5); | |
82 | 83 |
83 | 84 |
84 | 85 |
85 /** @constructor */ | 86 /** @constructor */ |
86 DocumentImageExtractor.Context = function() { | 87 DocumentImageExtractor.Context = function() { |
87 /** @type {number} */ | 88 /** @type {number} */ |
88 this.numElementsWithBackground = 0; | 89 this.numElementsWithBackground = 0; |
89 | 90 |
90 /** @type {!Object.<string, number>} */ | 91 /** @type {!Object.<string, number>} */ |
91 this.urlToRelevance = {}; | 92 this.urlToRelevance = {}; |
(...skipping 48 matching lines...)
140 ++opt_context.numElementsWithBackground; | 141 ++opt_context.numElementsWithBackground; |
141 } | 142 } |
142 } | 143 } |
143 } | 144 } |
144 | 145 |
145 if (!image) { | 146 if (!image) { |
146 return null; | 147 return null; |
147 } | 148 } |
148 | 149 |
149 var size = image.getDisplaySize() || image.getSize(); | 150 var size = image.getDisplaySize() || image.getSize(); |
150 goog.asserts.assert(!goog.isNull(size)); | |
151 if (image.getUrl() != document.location.href) { | 151 if (image.getUrl() != document.location.href) { |
152 // Ignore images that are too small. | 152 // Ignore images that are too small. |
153 if (size.width < Constants.EXTRACT_MIN_WIDTH || | 153 if (size.width < EXTRACT_MIN_WIDTH || |
154 size.height < Constants.EXTRACT_MIN_HEIGHT) { | 154 size.height < EXTRACT_MIN_HEIGHT) { |
155 return null; | 155 return null; |
156 } | 156 } |
157 } | 157 } |
158 // Demote smaller images (squash area using the sigmoid function). | 158 // Demote smaller images (squash area using the sigmoid function). |
159 var relevance = image.getRelevance(); | 159 var relevance = image.getRelevance(); |
160 relevance /= (1 + Math.exp(Parameters.AREA_MULTIPLIER * size.area())); | 160 relevance /= (1 + Math.exp(Parameters.AREA_MULTIPLIER * size.area())); |
161 // Demote images with bad aspect ratio. | 161 // Demote images with bad aspect ratio. |
162 var aspectRatio = size.aspectRatio(); | 162 var aspectRatio = size.width / size.height; |
163 if (aspectRatio < 1) { | 163 if (aspectRatio < 1) { |
164 aspectRatio = 1 / aspectRatio; | 164 aspectRatio = 1 / aspectRatio; |
165 } | 165 } |
166 if (aspectRatio > Parameters.MAX_ASPECT_RATIO) { | 166 if (aspectRatio > Parameters.MAX_ASPECT_RATIO) { |
167 relevance *= Parameters.ASPECT_RATIO_DEMOTION_FACTOR; | 167 relevance *= Parameters.ASPECT_RATIO_DEMOTION_FACTOR; |
168 } | 168 } |
169 | 169 |
170 // TODO(busaryev): use the following features: | 170 // TODO(busaryev): use the following features: |
171 // - relative size of the image comparing to neighbors; | 171 // - relative size of the image comparing to neighbors; |
172 // - area of the visible portion of the image; | 172 // - area of the visible portion of the image; |
173 // - position (demote images on the border of the page). | 173 // - position (demote images on the border of the page). |
174 | 174 |
175 var url = image.getUrl(); | 175 var url = image.getUrl(); |
176 try { | 176 // Make sure that image url is absolute. |
177 // Make sure that image url is absolute. | 177 this.helperAnchor_.href = url; |
178 var documentUrl = goog.dom.getOwnerDocument(element).documentURI; | 178 url = this.helperAnchor_.href; |
179 url = goog.Uri.resolve(documentUrl, url).toString(); | |
180 } catch (e) { | |
181 goog.log.info(DocumentImageExtractor.logger_, | |
182 'Cannot resolve url: ' + url); | |
183 return null; | |
184 } | |
185 | 179 |
186 var imagePath = goog.string.makeSafe(goog.uri.utils.getPath(url)); | 180 var imagePath = decodeURIComponent(this.helperAnchor_.pathname || ''); |
187 var lastDot = imagePath.lastIndexOf('.'); | 181 var lastDot = imagePath.lastIndexOf('.'); |
188 if (lastDot > 0) { | 182 if (lastDot > 0) { |
189 var imageType = imagePath.slice(lastDot); | 183 var imageType = imagePath.slice(lastDot); |
190 var multiplier = | 184 var multiplier = |
191 DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_[imageType]; | 185 DocumentImageExtractor.IMAGE_TYPE_RELEVANCE_MULTIPLIER_[imageType]; |
192 if (multiplier) { | 186 if (multiplier) { |
193 relevance *= multiplier; | 187 relevance *= multiplier; |
194 } | 188 } |
195 } | 189 } |
196 | 190 |
(...skipping 63 matching lines...)
260 | 254 |
261 | 255 |
262 /** | 256 /** |
263 * @param {!Element} element | 257 * @param {!Element} element |
264 * @param {!number} relevance | 258 * @param {!number} relevance |
265 * @param {string} attributeName | 259 * @param {string} attributeName |
266 * @param {string} attribute | 260 * @param {string} attribute |
267 * @param {string} urlAttributeName | 261 * @param {string} urlAttributeName |
268 * @return {DocumentImage} | 262 * @return {DocumentImage} |
269 * @private | 263 * @private |
| 264 * @suppress {missingProperties} |
270 */ | 265 */ |
271 DocumentImageExtractor.prototype.extractCanonicalImage_ = function( | 266 DocumentImageExtractor.prototype.extractCanonicalImage_ = function( |
272 element, relevance, attributeName, attribute, urlAttributeName) { | 267 element, relevance, attributeName, attribute, urlAttributeName) { |
273 goog.asserts.assert(goog.isNumber(relevance)); | 268 if (element.hasAttribute(attributeName) && |
274 goog.asserts.assert(goog.isString(attributeName)); | 269 element.getAttribute(attributeName).toLowerCase() == |
275 goog.asserts.assert(goog.isString(attribute)); | 270 attribute.toLowerCase()) { |
276 | |
277 if (goog.string.caseInsensitiveEquals( | |
278 goog.string.makeSafe(element.getAttribute(attributeName)), attribute)) { | |
279 var url = element.getAttribute(urlAttributeName); | 271 var url = element.getAttribute(urlAttributeName); |
280 if (!url || goog.string.startsWith(url, 'data:')) { | 272 if (!url || url.startsWith('data:')) { |
281 return null; | 273 return null; |
282 } | 274 } |
283 var width = goog.string.parseInt( | 275 var width = parseInt(element.getAttribute(CustomAttribute.WIDTH), 10); |
284 element.getAttribute(CustomAttribute.WIDTH)); | 276 var height = parseInt(element.getAttribute(CustomAttribute.HEIGHT), 10); |
285 var height = goog.string.parseInt( | |
286 element.getAttribute(CustomAttribute.HEIGHT)); | |
287 if (width && height) { | 277 if (width && height) { |
288 // For non-toplevel urls, demote the image if it is not in the document. | 278 // For non-toplevel urls, demote the image if it is not in the document. |
289 var ownerDocument = goog.dom.getOwnerDocument(element); | 279 var ownerDocument = DomUtils.getOwnerDocument(element); |
290 if (goog.uri.utils.getPath(ownerDocument.documentURI) != '/' && | 280 this.helperAnchor_.href = ownerDocument.documentURI; |
291 ownerDocument.body.innerHTML.indexOf(url) == -1) { | 281 var path = this.helperAnchor_.pathname; |
| 282 if (path != '/' && ownerDocument.body.innerHTML.indexOf(url) == -1) { |
292 relevance *= Parameters.NON_TOPLEVEL_DEMOTION_FACTOR; | 283 relevance *= Parameters.NON_TOPLEVEL_DEMOTION_FACTOR; |
293 } | 284 } |
294 var size = new goog.math.Size(width, height); | 285 var size = new Size(width, height); |
295 return new DocumentImage(relevance, url, size); | 286 return new DocumentImage(relevance, url, size); |
296 } | 287 } |
297 } | 288 } |
298 return null; | 289 return null; |
299 }; | 290 }; |
300 | 291 |
301 | 292 |
302 /** | 293 /** |
303 * @param {!Element} element | 294 * @param {!Element} element |
304 * @return {DocumentImage} | 295 * @return {DocumentImage} |
305 * @private | 296 * @private |
| 297 * @suppress {missingProperties} |
306 */ | 298 */ |
307 DocumentImageExtractor.prototype.extractMicrodataImage_ = function(element) { | 299 DocumentImageExtractor.prototype.extractMicrodataImage_ = function(element) { |
308 var itemProp = element.getAttribute('itemprop'); | 300 var itemProp = element.getAttribute('itemprop'); |
309 if (itemProp && itemProp.toLowerCase() == 'thumbnailurl') { | 301 if (itemProp && itemProp.toLowerCase() == 'thumbnailurl') { |
310 var url = element.getAttribute('href') || element.getAttribute('content'); | 302 var url = element.getAttribute('href') || element.getAttribute('content'); |
311 if (!url || goog.string.startsWith(url, 'data:')) { | 303 if (!url || url.startsWith('data:')) { |
312 return null; | 304 return null; |
313 } | 305 } |
314 var width = goog.string.parseInt( | 306 var width = parseInt(element.getAttribute(CustomAttribute.WIDTH), 10); |
315 element.getAttribute(CustomAttribute.WIDTH)); | 307 var height = parseInt(element.getAttribute(CustomAttribute.HEIGHT), 10); |
316 var height = goog.string.parseInt( | |
317 element.getAttribute(CustomAttribute.HEIGHT)); | |
318 if (width && height) { | 308 if (width && height) { |
319 var size = new goog.math.Size(width, height); | 309 var size = new Size(width, height); |
320 return new DocumentImage(Parameters.WEIGHT_MICRODATA, url, size); | 310 return new DocumentImage(Parameters.WEIGHT_MICRODATA, url, size); |
321 } | 311 } |
322 } | 312 } |
323 return null; | 313 return null; |
324 }; | 314 }; |
325 | 315 |
326 | 316 |
327 /** | 317 /** |
328 * @param {!Element} element | 318 * @param {!Element} element |
329 * @return {number} | 319 * @return {number} |
330 * @private | 320 * @private |
331 */ | 321 */ |
332 DocumentImageExtractor.prototype.getElementRelevance_ = function(element) { | 322 DocumentImageExtractor.prototype.getElementRelevance_ = function(element) { |
333 var offset = goog.style.getPageOffsetTop(element); | 323 var offset = DomUtils.getPageOffsetTop(element); |
334 if (offset > Parameters.MAX_OFFSET) { | 324 if (offset > Parameters.MAX_OFFSET) { |
335 return 0; | 325 return 0; |
336 } | 326 } |
337 return 1 / (1 + Math.exp(Parameters.OFFSET_MULTIPLIER * offset)); | 327 return 1 / (1 + Math.exp(Parameters.OFFSET_MULTIPLIER * offset)); |
338 }; | 328 }; |
339 | 329 |
340 | 330 |
341 /** | 331 /** |
342 * Extracts an image from the <img> HTML element. | 332 * Extracts an image from the <img> HTML element. |
343 * @param {!Element} element | 333 * @param {!Element} element |
344 * @return {DocumentImage} | 334 * @return {DocumentImage} |
345 * @private | 335 * @private |
346 */ | 336 */ |
347 DocumentImageExtractor.prototype.extractImage_ = function(element) { | 337 DocumentImageExtractor.prototype.extractImage_ = function(element) { |
348 goog.asserts.assert(element.tagName.toLowerCase() == 'img'); | |
349 var url = element.src; | 338 var url = element.src; |
350 // We cannot handle data URIs. | 339 // We cannot handle data URIs. |
351 if (url && !goog.string.startsWith(url, 'data:')) { | 340 if (url && !url.startsWith('data:')) { |
352 var naturalSize = new goog.math.Size( | 341 var naturalSize = new Size(element.naturalWidth, element.naturalHeight); |
353 element.naturalWidth, element.naturalHeight); | 342 var displaySize = DomUtils.getSize(element); |
354 var displaySize = goog.style.getSize(element); | |
355 var size = naturalSize.area() < displaySize.area() ? | 343 var size = naturalSize.area() < displaySize.area() ? |
356 naturalSize : displaySize; | 344 naturalSize : displaySize; |
357 if (size.width && size.height) { | 345 if (size.width && size.height) { |
358 var relevance = this.getElementRelevance_(element); | 346 var relevance = this.getElementRelevance_(element); |
359 return new DocumentImage(relevance, url, naturalSize, displaySize); | 347 return new DocumentImage(relevance, url, naturalSize, displaySize); |
360 } | 348 } |
361 } | 349 } |
362 return null; | 350 return null; |
363 }; | 351 }; |
364 | 352 |
365 | 353 |
366 /** | 354 /** |
367 * Extracts an image specified in 'background-image' property of an element. | 355 * Extracts an image specified in 'background-image' property of an element. |
368 * @param {!Element} element | 356 * @param {!Element} element |
369 * @return {DocumentImage} | 357 * @return {DocumentImage} |
370 * @private | 358 * @private |
| 359 * @suppress {missingProperties} |
371 */ | 360 */ |
372 DocumentImageExtractor.prototype.extractBackgroundImage_ = function(element) { | 361 DocumentImageExtractor.prototype.extractBackgroundImage_ = function(element) { |
373 var backgroundImage = goog.style.getComputedStyle( | 362 var backgroundImage = DomUtils.getComputedStyle( |
374 element, 'background-image'); | 363 element, 'background-image'); |
375 var backgroundRepeat = goog.style.getComputedStyle( | 364 var backgroundRepeat = DomUtils.getComputedStyle( |
376 element, 'background-repeat'); | 365 element, 'background-repeat'); |
377 var backgroundSize = goog.style.getComputedStyle( | 366 var backgroundSize = DomUtils.getComputedStyle( |
378 element, 'background-size'); | 367 element, 'background-size'); |
379 if (backgroundImage && | 368 if (backgroundImage && |
380 (backgroundRepeat == 'no-repeat' || backgroundSize == 'cover') && | 369 (backgroundRepeat == 'no-repeat' || backgroundSize == 'cover') && |
381 goog.string.startsWith(backgroundImage, 'url(') && | 370 backgroundImage.startsWith('url(') && |
382 goog.string.endsWith(backgroundImage, ')')) { | 371 backgroundImage.endsWith(')')) { |
383 var url = backgroundImage.substr(4, backgroundImage.length - 5); | 372 var url = backgroundImage.substr(4, backgroundImage.length - 5); |
384 if (url && !goog.string.startsWith(url, 'data:')) { | 373 if (url && !url.startsWith('data:')) { |
385 var size = goog.style.getSize(element); | 374 var size = DomUtils.getSize(element); |
386 if (size.width && size.height) { | 375 if (size.width && size.height) { |
387 var relevance = this.getElementRelevance_(element); | 376 var relevance = this.getElementRelevance_(element); |
388 var children = goog.dom.getChildren(element); | 377 var children = element.children; |
389 for (var i = 0; i < children.length; ++i) { | 378 for (var i = 0; i < children.length; ++i) { |
390 var child = children[i]; | 379 var child = children[i]; |
391 if (goog.style.getComputedStyle(child, 'display') != 'none' && | 380 if (DomUtils.getComputedStyle(child, 'display') != 'none' && |
392 goog.style.getSize(child).area() > 0.1 * size.area()) { | 381 DomUtils.getSize(child).area() > 0.1 * size.area()) { |
393 relevance *= 0.1; | 382 relevance *= 0.1; |
394 break; | 383 break; |
395 } | 384 } |
396 } | 385 } |
397 return new DocumentImage(relevance, url, | 386 return new DocumentImage(relevance, url, |
398 undefined /* image size is unknown */, size); | 387 undefined /* image size is unknown */, size); |
399 } | 388 } |
400 } | 389 } |
401 } | 390 } |
402 return null; | 391 return null; |
403 }; | 392 }; |
404 }); // goog.scope | 393 }); // goog.scope |
OLD | NEW |