OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file | |
3 # for details. All rights reserved. Use of this source code is governed by a | |
4 # BSD-style license that can be found in the LICENSE file. | |
'''
This script finds all HTML pages in a folder and downloads all images, replacing
the URLs with local ones.
'''
9 import os, sys, optparse, subprocess, multiprocessing | |
10 from os.path import abspath, basename, dirname, join | |
11 | |
# Locations are resolved relative to this script: SWARM_PATH is the directory
# containing this file and CLIENT_PATH is two directory levels above it.
SWARM_PATH = dirname(abspath(__file__))
CLIENT_PATH = dirname(dirname(SWARM_PATH))
CLIENT_TOOLS_PATH = join(CLIENT_PATH, 'tools')

# Add the client tools directory so we can find htmlconverter.py.
sys.path.append(CLIENT_TOOLS_PATH)
import htmlconverter

# Path of the converter script itself; built with join() like the paths above
# instead of concatenating with a hard-coded '/' separator.
converter = join(CLIENT_TOOLS_PATH, 'htmlconverter.py')
20 | |
# This has to be a top level function to use with multiprocessing
def convertImgs(infile):
    """Convert one HTML file in place for offline use.

    Calls htmlconverter.convertForOffline with `infile` as both the input
    and the output path, forwarding the `verbose` and `inline_images`
    settings read from the module-level `options` object.

    Errors are printed instead of being raised, so a failure on one file is
    reported on stdout and does not propagate to the caller.

    Args:
      infile: path of the HTML file to rewrite.
    """
    global options
    try:
        htmlconverter.convertForOffline(
            infile, infile,
            verbose=options.verbose,
            encode_images=options.inline_images)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Converted ' + infile)
    # NOTE(review): BaseException also covers KeyboardInterrupt/SystemExit;
    # presumably intentional so a worker reports the problem instead of
    # dying -- confirm before narrowing to Exception.
    except BaseException as e:
        print('Caught error: %s' % e)
32 | |
def Flags():
    """ Builds the optparse parser that recognizes this script's flags. """
    flag_parser = optparse.OptionParser()
    # Every flag is an off-by-default boolean switch: (name, help text).
    boolean_flags = (
        ("--inline_images",
         "Encode img payloads as data:// URLs rather than local files."),
        ("--verbose",
         "Print verbose output"),
    )
    for flag_name, description in boolean_flags:
        flag_parser.add_option(
            flag_name,
            help=description,
            default=False,
            action="store_true")
    return flag_parser
45 | |
def _initWorker(parsed_options):
    """Pool initializer: publish the parsed flags as the worker's `options`.

    Worker processes that re-import this module instead of inheriting the
    parent's memory would otherwise have no `options` global for
    convertImgs to read.
    """
    global options
    options = parsed_options


def main():
    """Convert every .html file under the directory given on the command line.

    Parses the flags, walks the directory tree collecting files whose name
    ends in '.html', and runs convertImgs on each of them in a pool of
    worker processes.

    Returns:
      1 when no directory was given or the first argument is 'help' (usage
      is printed), 0 after the conversion pass has finished.
    """
    global options
    parser = Flags()
    options, args = parser.parse_args()
    print("args: %s" % args)
    # Exact match: a substring test would reject any directory whose path
    # merely contains "help" (e.g. "helpers/").
    if not args or args[0] == 'help':
        print('Usage: %s DIRECTORY' % basename(sys.argv[0]))
        return 1

    # Not named `dirname`, which would shadow the os.path import.
    search_dir = args[0]
    print('Searching directory ' + search_dir)

    files = [join(root, fname)
             for root, dirs, fnames in os.walk(search_dir)
             for fname in fnames
             if fname.endswith('.html')]

    count = 4 * multiprocessing.cpu_count()
    pool = multiprocessing.Pool(processes=count,
                                initializer=_initWorker,
                                initargs=(options,))
    try:
        # Note: need a timeout to get keyboard interrupt due to a Python bug
        pool.map_async(convertImgs, files).get(3600)  # one hour
    finally:
        # All work is finished (or abandoned on timeout/interrupt) by now,
        # so stop the workers and reap them rather than leaking processes.
        pool.terminate()
        pool.join()
    return 0
68 | |
if __name__ == '__main__':
    # Use main()'s return value as the process exit status so a usage error
    # (return value 1) is visible to the calling shell.
    sys.exit(main())
OLD | NEW |