| Index: heuristics/distillable/check_derived_features.py
|
| diff --git a/heuristics/distillable/check_derived_features.py b/heuristics/distillable/check_derived_features.py
|
| new file mode 100755
|
| index 0000000000000000000000000000000000000000..98bb96a25b8385dca7e455695e54d099d48f6a6e
|
| --- /dev/null
|
| +++ b/heuristics/distillable/check_derived_features.py
|
| @@ -0,0 +1,79 @@
|
| +#!/usr/bin/env python
|
| +# Copyright 2016 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be
|
| +# found in the LICENSE file.
|
| +
|
| +import argparse
|
| +import csv
|
| +import json
|
| +import os
|
| +import shutil
|
| +import sys
|
| +import unittest
|
| +
|
| +from write_features_csv import filter_fields, getGroups
|
| +
|
def isAlmostEqual(a, b, header, eps=0.001):
    """Return whether two equal-length numeric sequences agree within eps.

    Values are compared pairwise by position. The comparison stops at the
    first pair whose absolute difference exceeds eps, and a one-line report
    naming that column is printed.

    Args:
      a: sequence of numbers.
      b: sequence of numbers; must have the same length as a.
      header: sequence of column names; header[i] labels the pair
        (a[i], b[i]) in the mismatch report.
      eps: largest absolute difference that is still treated as equal.

    Returns:
      True if every pair differs by at most eps, False otherwise.

    Raises:
      AssertionError: if a and b have different lengths.
    """
    # Raised explicitly rather than through an `assert` statement so the
    # check still runs under `python -O`; the exception type is unchanged.
    if len(a) != len(b):
        raise AssertionError(
            'length mismatch: len(a) = %d, len(b) = %d' % (len(a), len(b)))
    for i, (x, y) in enumerate(zip(a, b)):
        if abs(x - y) > eps:
            # A single parenthesized argument is valid both as a Python 2
            # print statement and as a Python 3 print() call.
            print('%s mismatch: a[%d] = %f, b[%d] = %f' % (header[i], i, x, i, y))
            return False
    return True
|
| +
|
def compareDerivedFeatures(features, from_mhtml):
    """Compare the derived features from the JS vs. native impl.

    Prints a report for every entry whose JS-derived and native-derived
    feature vectors disagree, a 'Skipped <url>' line for every entry that is
    not compared, and a final summary line with the mismatch and skip counts.

    Args:
      features: the JSON dump of features, a list of dicts. Each entry is
        read for 'url' and 'features' (a flat list alternating name, value).
        Entries with a 'native' key are also read for
        native['features']['elementCount'] and native['derived_features'].
        'index' is read when from_mhtml is true and a mismatch is reported.
      from_mhtml (bool): whether the features are collected from mhtml archive
    """
    err = 0
    skipped = 0
    # The names from the first entry label the columns of every entry. An
    # empty dump has no first entry, so fall back to an empty header and let
    # the loop below be a no-op instead of raising IndexError.
    # Built as a real list (not map()) because it is reused on every
    # iteration, which a Python 3 map iterator would not survive.
    header = [str(name) for name in features[0]['features'][::2]] if features else []
    for f in features:
        if 'native' not in f:
            print('Skipped %s' % (f['url']))
            skipped += 1
            continue
        data = [[float(value) for value in f['features'][1::2]]]
        (h, data) = filter_fields(header, data, getGroups(header)['v1'])
        js = data[0]
        # js is now the derived features from JS aligned with native impl.
        # NOTE(review): index 17 is taken to be elementCount's position in the
        # filtered 'v1' row -- confirm against write_features_csv.
        if not from_mhtml and js[17] != f['native']['features']['elementCount']:
            # elementCount is simple enough so assume it's correct.
            # If elementCount doesn't match, the DOM might've changed between
            # JS and native runs.
            # For mhtml, this should not be possible since DOM is static.
            print('Skipped %s' % (f['url']))
            skipped += 1
            continue

        native = [float(value) for value in f['native']['derived_features']]
        data = [js, native]
        # Filter out the features derived from path if it is from mhtml,
        # because the url from native impl would be the file:// one.
        if from_mhtml:
            (h, data) = filter_fields(h, data, getGroups(header)['v1NoPath'])
        if not isAlmostEqual(data[0], data[1], h):
            err += 1
            # Every print below takes one parenthesized argument so it behaves
            # the same as a Python 2 print statement and a Python 3 call.
            print(f['url'])
            if from_mhtml:
                print('%s.mhtml' % f['index'])
            print(data[0])
            print(data[1])
            # Blank separator line; a bare `print` does nothing on Python 3.
            print('')
    print('%d/%d have mismatching derived features, %d were skipped.' % (err, len(features), skipped))
|
| +
|
def main(argv):
    """Parse the command line, load the features dump and run the comparison.

    Args:
      argv: list of command-line arguments, without the program name.
        Recognizes --features (required, path to the JSON file) and
        --from-mhtml (flag).

    Returns:
      None.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--features', required=True, help="filename of aggregated derived features")
    parser.add_argument('--from-mhtml', action='store_true', help="whether the features are from mhtml")
    options = parser.parse_args(argv)

    # Read the file as bytes so json detects the UTF-8/16/32 encoding itself
    # rather than depending on the locale's default text encoding on Python 3
    # (binary input to json.load needs Python 3.6+; Python 2 is unaffected).
    # The handle gets its own name so it is not shadowed by the loaded data.
    with open(options.features, 'rb') as features_file:
        features = json.load(features_file)
    compareDerivedFeatures(features, options.from_mhtml)
|
| +
|
if __name__ == '__main__':
    # Script entry point: hand the arguments (minus the program name) to
    # main() and use its return value as the process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|
|
|