OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 import argparse |
| 7 import json |
| 8 import os |
| 9 import sys |
| 10 |
def compare_innerText(dfeature, mdfeature):
  """Compare the distilled content from the original page with the one from
  the mhtml archive.

  Both files are parsed as JSON. The text under ['features']['innerText'] is
  compared; on a mismatch a diagnostic is printed that also uses the 'url'
  entry of the original page's file.

  Args:
    dfeature (str): filename of the distilled feature from the original page
    mdfeature (str): filename of the distilled feature from mhtml archive

  Returns:
    True if the content is the same, or if the mhtml content is contained in
    the content of the original page. False otherwise.
  """
  with open(dfeature) as f:
    original = json.load(f)
  with open(mdfeature) as f:
    archived = json.load(f)

  original_text = original['features']['innerText']
  archived_text = archived['features']['innerText']

  if original_text == archived_text:
    return True
  if archived_text in original_text:
    # The one from the original might have next page stitched.
    return True

  # Only needed for the diagnostics below. The archive name is derived from
  # the feature filename by swapping the extension.
  mhtml = os.path.splitext(mdfeature)[0] + '.mhtml'

  # print(...) with a single parenthesized argument produces the same output
  # under Python 2 and is valid syntax under Python 3.
  if archived_text == 'No data found.':
    print('%s failed to distill, but %s can' % (mhtml, original['url']))
  else:
    print('\n[ERROR] Different distilled content.\nFrom original (%s):\n"%s"'
          '\n\n\n\nFrom mhtml (%s):\n"%s"\n' % (
              original['url'], original_text,
              mhtml, archived_text))
  return False
| 40 |
def compare_distilled(dir):
  """Compare all the distilled contents from the original pages with those
  from the mhtml archives.

  Every regular file in `dir` with the '.mdfeature' extension is paired with
  the file of the same base name and the '.dfeature' extension, and the pair
  is passed to compare_innerText(). A summary line with the number of
  mismatching pairs is printed at the end.

  Args:
    dir (str): directory containing all the extracted features
  """
  # os.listdir() returns entries in arbitrary order; sort so that the
  # per-file diagnostics come out in a reproducible order across runs.
  candidates = [os.path.join(dir, name) for name in sorted(os.listdir(dir))]
  mdfeatures = [
      path for path in candidates
      if os.path.isfile(path) and os.path.splitext(path)[1] == '.mdfeature'
  ]

  err = 0
  for mdfeature in mdfeatures:
    dfeature = os.path.splitext(mdfeature)[0] + '.dfeature'
    if not compare_innerText(dfeature, mdfeature):
      err += 1

  # print(...) with a single parenthesized argument produces the same output
  # under Python 2 and is valid syntax under Python 3.
  print('%d/%d have different distilled content from mhtml' % (
      err, len(mdfeatures)))
| 56 |
def main(argv):
  """Parse the command-line arguments and run the comparison.

  Args:
    argv (list of str): command-line arguments; the required --dir option
        names the data directory handed to compare_distilled()
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--dir', required=True, help="data directory")
  parsed = arg_parser.parse_args(argv)

  compare_distilled(parsed.dir)
| 63 |
if __name__ == '__main__':
  # Run only when executed as a script; main()'s return value becomes the
  # process exit status.
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
OLD | NEW |