Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(257)

Side by Side Diff: heuristics/distillable/calculate_derived_features.py

Issue 1620043002: Add scripts for distillability modelling (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: set upstream patchset, identical to patch set 2 Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « heuristics/distillable/README.md ('k') | heuristics/distillable/extract_features.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright 2014 The Chromium Authors. All rights reserved. 2 # Copyright 2016 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 import argparse 6 import argparse
7 import csv 7 import csv
8 import json 8 import json
9 import marshal
9 import os 10 import os
10 import shutil 11 import shutil
12 import math
11 import sys 13 import sys
12 import re 14 import re
13 import urlparse 15 import urlparse
14 16
15 def CountMatches(s, p): 17 def CountMatches(s, p):
16 return len(re.findall(p, s)) 18 return len(re.findall(p, s))
17 19
18 def WordCount(s): 20 def WordCount(s):
19 return CountMatches(s, r'\w+') 21 return CountMatches(s, r'\w+')
20 22
21 def GetLastSegment(path): 23 def GetLastSegment(path):
22 return re.search('[^/]*\/?$', path).group(0) 24 return re.search('[^/]*\/?$', path).group(0)
23 25
24 def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML): 26 def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE,
27 innerText, textContent, innerHTML, numText, numPassword, mozScores):
25 path = urlparse.urlparse(url).path 28 path = urlparse.urlparse(url).path
26 29
27 path = path.encode('utf-8') 30 path = path.encode('utf-8')
28 innerText = innerText.encode('utf-8') 31 innerText = innerText.encode('utf-8')
29 textContent = textContent.encode('utf-8') 32 textContent = textContent.encode('utf-8')
30 innerHTML = innerHTML.encode('utf-8') 33 innerHTML = innerHTML.encode('utf-8')
31 34
32 innerTextWords = WordCount(innerText) 35 innerTextWords = WordCount(innerText)
33 textContentWords = WordCount(textContent) 36 textContentWords = WordCount(textContent)
34 innerHTMLWords = WordCount(innerHTML) 37 innerHTMLWords = WordCount(innerHTML)
35 return [ 38 return [
36 'opengraph', opengraph, 39 'openGraph', opengraph,
40
37 'forum', 'forum' in path, 41 'forum', 'forum' in path,
38 'index', 'index' in path, 42 'index', 'index' in path,
43 'search', 'search' in path,
39 'view', 'view' in path, 44 'view', 'view' in path,
45 'archive', 'archive' in path,
40 'asp', '.asp' in path, 46 'asp', '.asp' in path,
41 'phpbb', 'phpbb' in path, 47 'phpbb', 'phpbb' in path,
42 'php', path.endswith('.php'), 48 'php', path.endswith('.php'),
43 'pathlength', len(path), 49 'pathLength', len(path),
44 'domain', len(path) < 2, 50 'domain', len(path) < 2,
45 'pathcomponents', CountMatches(path, r'\/.'), 51 'pathComponents', CountMatches(path, r'\/.'),
46 'slugdetector', CountMatches(path, r'[^\w/]'), 52 'slugDetector', CountMatches(path, r'[^\w/]'),
47 'pathnumbers', CountMatches(path, r'\d+'), 53 'pathNumbers', CountMatches(path, r'\d+'),
48 'lastSegmentLength', len(GetLastSegment(path)), 54 'lastSegmentLength', len(GetLastSegment(path)),
49 'formcount', numForms, 55
50 'anchorcount', numAnchors, 56 'formCount', numForms,
51 'elementcount', numElements, 57 'anchorCount', numAnchors,
52 'anchorratio', float(numAnchors) / max(1, numElements), 58 'elementCount', numElements,
53 'innertextlength', len(innerText), 59 'anchorRatio', float(numAnchors) / max(1, numElements),
54 'textcontentlength', len(textContent), 60
55 'innerhtmllength', len(innerHTML), 61 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)),
56 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), 62 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)),
57 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), 63 'mozScoreAllLinear', min(mozScores[5], 6000),
58 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textContent)),
59 'innertextwordcount', innerTextWords,
60 'textcontentwordcount', textContentWords,
61 'innerhtmlwordcount', innerHTMLWords,
62 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
63 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords),
64 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textContentWords),
65 ] 64 ]
66 65
67 def main(argv): 66 def main(argv):
68 parser = argparse.ArgumentParser() 67 parser = argparse.ArgumentParser()
69 parser.add_argument('--out', required=True) 68 parser.add_argument('--out', required=True)
70 parser.add_argument('--core', required=True) 69 parser.add_argument('--core', required=True)
71 options = parser.parse_args(argv) 70 options = parser.parse_args(argv)
72 71
73 if os.path.exists(options.out): 72 if os.path.exists(options.out):
74 raise Exception('exists: ' + options.out) 73 raise Exception('exists: ' + options.out)
75 74
76 core = None 75 core = None
77 with open(options.core) as core_file: 76 with open(options.core) as core_file:
78 core = json.load(core_file) 77 core = marshal.load(core_file)
79 78
80 for entry in core: 79 for entry in core:
81 features = entry['features'] 80 features = entry['features']
81 print 'processing %d' % (entry['index'])
82 entry['features'] = CalcDerivedFeatures( 82 entry['features'] = CalcDerivedFeatures(
83 entry['index'],
83 features['opengraph'], 84 features['opengraph'],
84 features['url'], 85 features['url'],
86 features['title'],
85 features['numElements'], 87 features['numElements'],
86 features['numAnchors'], 88 features['numAnchors'],
87 features['numForms'], 89 features['numForms'],
90 features['numPPRE'],
91 features['visibleElements'],
92 features['visiblePPRE'],
88 features['innerText'], 93 features['innerText'],
89 features['textContent'], 94 features['textContent'],
90 features['innerHTML']) 95 features['innerHTML'],
96 features['numTextInput'],
97 features['numPasswordInput'],
98 [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllLinear'],
99 features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozScoreFastAllLinear']]
100 )
91 101
92 with open(options.out, 'w') as outfile: 102 with open(options.out, 'w') as outfile:
93 json.dump(core, outfile, indent=1) 103 json.dump(core, outfile, indent=1)
94 104
95 return 0 105 return 0
96 106
97 if __name__ == '__main__': 107 if __name__ == '__main__':
98 sys.exit(main(sys.argv[1:])) 108 sys.exit(main(sys.argv[1:]))
99 109
OLD | NEW
« no previous file with comments | « heuristics/distillable/README.md ('k') | heuristics/distillable/extract_features.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698