| Index: heuristics/distillable/check_distilled_mhtml.py
|
| diff --git a/heuristics/distillable/check_distilled_mhtml.py b/heuristics/distillable/check_distilled_mhtml.py
|
| new file mode 100755
|
| index 0000000000000000000000000000000000000000..80495c746765e91013f9c7eb4786daefa1d61a9f
|
| --- /dev/null
|
| +++ b/heuristics/distillable/check_distilled_mhtml.py
|
| @@ -0,0 +1,65 @@
|
| +#!/usr/bin/env python
|
| +# Copyright 2016 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be
|
| +# found in the LICENSE file.
|
| +
|
| +import argparse
|
| +import json
|
| +import os
|
| +import sys
|
| +
|
def compare_innerText(dfeature, mdfeature):
    """Compare the distilled content from the original page with the one from the mhtml archive.

    Both files are JSON documents; the text compared is
    ``['features']['innerText']``. When the contents differ, a diagnostic is
    printed that names the ``.mhtml`` file (derived from ``mdfeature`` by
    swapping the extension) and the ``'url'`` recorded in ``dfeature``.

    Args:
        dfeature (str): filename of the distilled feature from the original page
        mdfeature (str): filename of the distilled feature from mhtml archive

    Returns:
        True if the content is the same, or if the mhtml content is a
        non-empty substring of the original content (the original might have
        the next page stitched on). False otherwise.

    Raises:
        IOError/OSError: if either file cannot be opened.
        ValueError: if either file is not valid JSON.
        KeyError: if an expected key is missing from the JSON.
    """
    # Binary mode lets json detect the encoding itself instead of depending on
    # the locale's default text encoding.
    with open(dfeature, 'rb') as f:
        d = json.load(f)
    with open(mdfeature, 'rb') as f:
        md = json.load(f)

    original_text = d['features']['innerText']
    mhtml_text = md['features']['innerText']

    if original_text == mhtml_text:
        return True

    # The one from the original might have next page stitched. An empty
    # string is a substring of everything, so it must not count as a match.
    if mhtml_text and mhtml_text in original_text:
        return True

    mhtml = os.path.splitext(mdfeature)[0] + '.mhtml'
    if mhtml_text == 'No data found.':
        print('%s failed to distill, but %s can' % (mhtml, d['url']))
    else:
        print(
            '\n[ERROR] Different distilled content.\n'
            'From original (%s):\n"%s"\n\n\n'
            'From mhtml (%s):\n"%s"\n' % (
                d['url'], original_text,
                mhtml, mhtml_text,
            )
        )
    return False
|
| +
|
def compare_distilled(dir):
    """Compare all the distilled contents from the original pages with those from the mhtml archives.

    Every regular file in ``dir`` ending in ``.mdfeature`` is paired with the
    file of the same base name ending in ``.dfeature`` and the pair is handed
    to ``compare_innerText``. A one-line summary is printed at the end.

    Args:
        dir (str): directory containing all the extracted features

    Returns:
        int: number of pairs for which ``compare_innerText`` reported a
        difference.
    """
    # os.listdir returns entries in arbitrary order; sort so the report is
    # reproducible from run to run.
    paths = [os.path.join(dir, name) for name in sorted(os.listdir(dir))]
    mdfeatures = [
        path for path in paths
        if os.path.isfile(path) and os.path.splitext(path)[1] == '.mdfeature'
    ]

    err = 0
    for mdfeature in mdfeatures:
        # NOTE(review): the matching .dfeature is assumed to exist; it is
        # opened by compare_innerText without an existence check here.
        dfeature = os.path.splitext(mdfeature)[0] + '.dfeature'
        if not compare_innerText(dfeature, mdfeature):
            err += 1

    print('%d/%d have different distilled content from mhtml' % (err, len(mdfeatures)))
    return err
|
| +
|
def main(argv):
    """Parse the command line and compare the features found in ``--dir``.

    Args:
        argv (list): command-line arguments to parse; ``--dir`` is required.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--dir', required=True, help="data directory")
    parsed = arg_parser.parse_args(argv)

    compare_distilled(parsed.dir)
|
| +
|
# Script entry point: hand everything after the program name to main() and
# use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
|
|
|