Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(242)

Unified Diff: heuristics/distillable/check_distilled_mhtml.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « heuristics/distillable/check_derived_features.py ('k') | heuristics/distillable/extract_features.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: heuristics/distillable/check_distilled_mhtml.py
diff --git a/heuristics/distillable/check_distilled_mhtml.py b/heuristics/distillable/check_distilled_mhtml.py
new file mode 100755
index 0000000000000000000000000000000000000000..80495c746765e91013f9c7eb4786daefa1d61a9f
--- /dev/null
+++ b/heuristics/distillable/check_distilled_mhtml.py
@@ -0,0 +1,65 @@
+#!/usr/bin/env python
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import argparse
+import json
+import os
+import sys
+
+def compare_innerText(dfeature, mdfeature):
+ """Compare the distilled content from the original page with the one from the mhtml archive
+
+ Args:
+ dfeature (str): filename of the distilled feature from the original page
+ mdfeature (str): filename of the distilled feature from mhtml archive
+
+ Returns:
+ True if the content is the same.
+ """
+
+ with open(dfeature) as f:
+ d = json.load(f)
+ with open(mdfeature) as f:
+ md = json.load(f)
+ mhtml = os.path.splitext(mdfeature)[0] + '.mhtml'
+ if d['features']['innerText'] != md['features']['innerText']:
+ if md['features']['innerText'] in d['features']['innerText']:
+ # The one from the original might have next page stitched.
+ return True
+ if md['features']['innerText'] == 'No data found.':
+ print '%s failed to distill, but %s can' % (mhtml, d['url'])
+ else:
+ print '\n[ERROR] Different distilled content.\nFrom original (%s):\n"%s"\n\n\nFrom mhtml (%s):\n"%s"\n' % (
+ d['url'], d['features']['innerText'],
+ mhtml, md['features']['innerText']
+ )
+ return False
+ return True
+
+def compare_distilled(dir):
+ """Compare all the distilled contents from the original pages with those from the mhtml archives
+
+ Args:
+ dir (str): directory containing all the extracted features
+ """
+
+ files = [os.path.join(dir, f) for f in os.listdir(dir)]
+ mdfeatures = [f for f in files if os.path.isfile(f) and os.path.splitext(f)[1] == '.mdfeature']
+ err = 0
+ for mdfeature in mdfeatures:
+ dfeature = os.path.splitext(mdfeature)[0] + '.dfeature'
+ if not compare_innerText(dfeature, mdfeature):
+ err += 1
+ print '%d/%d have different distilled content from mhtml' % (err, len(mdfeatures))
+
+def main(argv):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--dir', required=True, help="data directory")
+ options = parser.parse_args(argv)
+
+ compare_distilled(options.dir)
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
« no previous file with comments | « heuristics/distillable/check_derived_features.py ('k') | heuristics/distillable/extract_features.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698