Index: heuristics/distillable/check_distilled_mhtml.py |
diff --git a/heuristics/distillable/check_distilled_mhtml.py b/heuristics/distillable/check_distilled_mhtml.py |
new file mode 100755 |
index 0000000000000000000000000000000000000000..80495c746765e91013f9c7eb4786daefa1d61a9f |
--- /dev/null |
+++ b/heuristics/distillable/check_distilled_mhtml.py |
@@ -0,0 +1,65 @@ |
+#!/usr/bin/env python |
+# Copyright 2016 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+import argparse |
+import json |
+import os |
+import sys |
+ |
+def compare_innerText(dfeature, mdfeature): |
+ """Compare the distilled content from the original page with the one from the mhtml archive |
+ |
+ Args: |
+ dfeature (str): filename of the distilled feature from the original page |
+ mdfeature (str): filename of the distilled feature from mhtml archive |
+ |
+ Returns: |
+ True if the content is the same. |
+ """ |
+ |
+ with open(dfeature) as f: |
+ d = json.load(f) |
+ with open(mdfeature) as f: |
+ md = json.load(f) |
+ mhtml = os.path.splitext(mdfeature)[0] + '.mhtml' |
+ if d['features']['innerText'] != md['features']['innerText']: |
+ if md['features']['innerText'] in d['features']['innerText']: |
+ # The one from the original might have next page stitched. |
+ return True |
+ if md['features']['innerText'] == 'No data found.': |
+ print '%s failed to distill, but %s can' % (mhtml, d['url']) |
+ else: |
+ print '\n[ERROR] Different distilled content.\nFrom original (%s):\n"%s"\n\n\nFrom mhtml (%s):\n"%s"\n' % ( |
+ d['url'], d['features']['innerText'], |
+ mhtml, md['features']['innerText'] |
+ ) |
+ return False |
+ return True |
+ |
+def compare_distilled(dir): |
+ """Compare all the distilled contents from the original pages with those from the mhtml archives |
+ |
+ Args: |
+ dir (str): directory containing all the extracted features |
+ """ |
+ |
+ files = [os.path.join(dir, f) for f in os.listdir(dir)] |
+ mdfeatures = [f for f in files if os.path.isfile(f) and os.path.splitext(f)[1] == '.mdfeature'] |
+ err = 0 |
+ for mdfeature in mdfeatures: |
+ dfeature = os.path.splitext(mdfeature)[0] + '.dfeature' |
+ if not compare_innerText(dfeature, mdfeature): |
+ err += 1 |
+ print '%d/%d have different distilled content from mhtml' % (err, len(mdfeatures)) |
+ |
+def main(argv): |
+ parser = argparse.ArgumentParser() |
+ parser.add_argument('--dir', required=True, help="data directory") |
+ options = parser.parse_args(argv) |
+ |
+ compare_distilled(options.dir) |
+ |
+if __name__ == '__main__': |
+ sys.exit(main(sys.argv[1:])) |