Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(50)

Side by Side Diff: heuristics/distillable/check_distilled_mhtml.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2016 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 import argparse
7 import json
8 import os
9 import sys
10
def compare_innerText(dfeature, mdfeature):
  """Compare the distilled text of a page with that of its mhtml archive.

  Both files are JSON documents. The distilled text is read from
  ``['features']['innerText']``; the ``['url']`` entry of the original page's
  file is only read when a mismatch has to be reported.

  Args:
    dfeature (str): filename of the distilled feature from the original page
    mdfeature (str): filename of the distilled feature from mhtml archive

  Returns:
    True if the archive's text is equal to, or contained in, the text from
    the original page. False otherwise; a diagnostic is printed in that case.
  """
  with open(dfeature) as f:
    original = json.load(f)
  with open(mdfeature) as f:
    archived = json.load(f)

  original_text = original['features']['innerText']
  archived_text = archived['features']['innerText']

  # Containment is accepted as well as equality: the one from the original
  # page might have the next page stitched onto it.
  if original_text == archived_text or archived_text in original_text:
    return True

  # Only needed for the diagnostics below.
  mhtml = os.path.splitext(mdfeature)[0] + '.mhtml'

  # Single-argument print(...) behaves the same on Python 2 and Python 3.
  if archived_text == 'No data found.':
    print('%s failed to distill, but %s can' % (mhtml, original['url']))
  else:
    print('\n[ERROR] Different distilled content.\n'
          'From original (%s):\n"%s"\n\n\n'
          'From mhtml (%s):\n"%s"\n' % (
              original['url'], original_text, mhtml, archived_text))
  return False
40
def compare_distilled(dir):
  """Compare the distilled contents of all original pages with their archives.

  Every regular file directly inside ``dir`` whose extension is
  ``.mdfeature`` is compared, via compare_innerText(), against the file with
  the same base name and a ``.dfeature`` extension. A summary line giving the
  number of mismatches out of the number of ``.mdfeature`` files is printed
  at the end.

  Args:
    dir (str): directory containing all the extracted features
  """
  # Sorted so that diagnostics are printed in the same order on every run;
  # os.listdir() returns the entries in arbitrary order.
  files = [os.path.join(dir, f) for f in sorted(os.listdir(dir))]
  mdfeatures = [
      f for f in files
      if os.path.isfile(f) and os.path.splitext(f)[1] == '.mdfeature'
  ]

  err = 0
  for mdfeature in mdfeatures:
    dfeature = os.path.splitext(mdfeature)[0] + '.dfeature'
    if not compare_innerText(dfeature, mdfeature):
      err += 1

  # Single-argument print(...) behaves the same on Python 2 and Python 3.
  print('%d/%d have different distilled content from mhtml' % (
      err, len(mdfeatures)))
56
def main(argv):
  """Parse the command line and compare the features in the given directory.

  Args:
    argv (list): command-line arguments, without the program name. The
      ``--dir`` option is required and names the data directory.
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--dir', required=True, help="data directory")
  data_dir = arg_parser.parse_args(argv).dir
  compare_distilled(data_dir)
63
# Script entry point: run main() on the arguments that follow the program
# name and use its return value as the process exit status.
if __name__ == '__main__':
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
OLDNEW
« no previous file with comments | « heuristics/distillable/check_derived_features.py ('k') | heuristics/distillable/extract_features.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698