Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(118)

Side by Side Diff: heuristics/distillable/check_derived_features.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2016 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 import argparse
7 import csv
8 import json
9 import os
10 import shutil
11 import sys
12 import unittest
13
14 from write_features_csv import filter_fields, getGroups
15
def isAlmostEqual(a, b, header, eps=0.001):
  """Check that two equal-length numeric sequences match element-wise.

  On the first pair whose absolute difference exceeds eps, a one-line
  description of the mismatch is printed and the comparison stops.

  Args:
    a: first sequence of numbers.
    b: second sequence of numbers; must have the same length as a.
    header: sequence of names, indexed in parallel with a and b, used to
        label the mismatching element in the printed message.
    eps: largest absolute difference that still counts as equal.

  Returns:
    True if every pair differs by at most eps, False otherwise.

  Raises:
    AssertionError: if a and b have different lengths.
  """
  assert len(a) == len(b)
  for i, (x, y) in enumerate(zip(a, b)):
    if abs(x - y) > eps:
      # A parenthesized single-argument print behaves the same on Python 2
      # and Python 3, so the script parses under either interpreter.
      print('%s mismatch: a[%d] = %f, b[%d] = %f' % (header[i], i, x, i, y))
      return False
  return True
23
def compareDerivedFeatures(features, from_mhtml):
  """Compare the derived features from the JS vs. native impl.

  Prints a report for every entry whose JS-derived and native-derived
  feature vectors differ, followed by a one-line summary.

  Args:
    features: the JSON dump of features. A list of dicts; each dict has a
        'url', a flat 'features' list alternating name, value, name,
        value, ..., and optionally a 'native' dict holding 'features'
        (with 'elementCount') and 'derived_features'. When from_mhtml is
        set, 'index' is also read for mismatching entries.
    from_mhtml (bool): whether the features are collected from mhtml archive
  """
  summary = '%d/%d have mismatching derived features, %d were skipped.'
  if not features:
    # Nothing to compare; features[0] below would raise IndexError.
    print(summary % (0, 0, 0))
    return

  # Names sit at the even positions of the flat list. Build real lists
  # rather than map() results so that indexing also works on Python 3.
  header = [str(name) for name in features[0]['features'][::2]]
  err = 0
  skipped = 0
  for f in features:
    if 'native' not in f:
      print('Skipped %s' % (f['url']))
      skipped += 1
      continue
    # Values sit at the odd positions of the flat list.
    data = [[float(value) for value in f['features'][1::2]]]
    # NOTE(review): filter_fields/getGroups come from write_features_csv;
    # presumably they restrict (header, rows) to the named field group.
    (h, data) = filter_fields(header, data, getGroups(header)['v1'])
    js = data[0]
    # js is now the derived features from JS aligned with native impl.
    if not from_mhtml and js[17] != f['native']['features']['elementCount']:
      # elementCount is simple enough so assume it's correct.
      # If elementCount doesn't match, the DOM might've changed between JS and
      # native runs.
      # For mhtml, this should not be possible since DOM is static.
      print('Skipped %s' % (f['url']))
      skipped += 1
      continue

    native = [float(value) for value in f['native']['derived_features']]
    data = [js, native]
    # Filter out the features derived from path if it is from mhtml, because
    # the url from native impl would be the file:// one.
    if from_mhtml:
      (h, data) = filter_fields(h, data, getGroups(header)['v1NoPath'])
    if not isAlmostEqual(data[0], data[1], h):
      err += 1
      print(f['url'])
      if from_mhtml:
        print('%s.mhtml' % f['index'])
      print(data[0])
      print(data[1])
      # Blank separator line between mismatch reports.
      print('')
  print(summary % (err, len(features), skipped))
67
def main(argv):
  """Load an aggregated features JSON file and compare its derived features.

  Args:
    argv: command-line arguments without the program name. Accepts
        --features FILE (required) and the optional flag --from-mhtml.

  Returns:
    None, so that sys.exit() reports success once the comparison has run.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--features', required=True,
                      help="filename of aggregated derived features")
  parser.add_argument('--from-mhtml', action='store_true',
                      help="whether the features are from mhtml")
  options = parser.parse_args(argv)

  # Keep the file handle and the parsed JSON under separate names.
  with open(options.features) as features_file:
    features = json.load(features_file)
  compareDerivedFeatures(features, options.from_mhtml)


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))
OLDNEW
« no previous file with comments | « heuristics/distillable/calculate_derived_features.py ('k') | heuristics/distillable/check_distilled_mhtml.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698