Chromium Code Reviews

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Move printing from main thread to printer thread Created 7 years, 9 months ago
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Download files from Google Storage based on SHA1 sums."""
7
8
9 import hashlib
10 import optparse
11 import os
12 import Queue
13 import re
14 import sys
15 import threading
16 import time
17
18 import subprocess2
19
20
21 GSUTIL_DEFAULT_PATH = os.path.join(
22 os.path.dirname(os.path.abspath(__file__)),
23 'third_party', 'gsutil', 'gsutil')
24
25
26 # Common utilities
27 class Gsutil(object):
28 """Call gsutil with some predefined settings."""
29 def __init__(self, path, boto_path=None, timeout=None):
30 if not os.path.exists(path):
31 raise OSError('GSUtil not found in %s' % path)
32 self.path = path
33 self.timeout = timeout
34 self.boto_path = boto_path
35
36 def call(self, *args):
37 env = os.environ.copy()
38 if self.boto_path is not None:
39 env['AWS_CREDENTIAL_FILE'] = self.boto_path
40 return subprocess2.call((sys.executable, self.path) + args,
41 env=env,
42 timeout=self.timeout)
43
44 def check_call(self, *args):
45 env = os.environ.copy()
46 if self.boto_path is not None:
47 env['AWS_CREDENTIAL_FILE'] = self.boto_path
48 ((out, err), code) = subprocess2.communicate(
49 (sys.executable, self.path) + args,
50 stdout=subprocess2.PIPE,
51 stderr=subprocess2.PIPE,
52 env=env,
53 timeout=self.timeout)
54
55 # Parse output.
56 status_code_match = re.search('status=([0-9]+)', err)
57 if status_code_match:
58 return (int(status_code_match.group(1)), out, err)
59 elif ('You are attempting to access protected data with '
60 'no configured credentials.' in err):
61 return (403, out, err)
62 elif 'No such object' in err:
63 return (404, out, err)
64 else:
65 return (code, out, err)
66
67 def clone(self):
68 return Gsutil(self.path, self.boto_path, self.timeout)
69
70
71 def CheckBucketPermissions(bucket, gsutil):
M-A Ruel 2013/03/07 16:22:46 Since it's new code; either use CamelCaps for func
Ryan Tseng 2013/03/07 18:51:48 Done.
72 if not bucket:
73 print >> sys.stderr, 'Missing bucket name.'
74 return (None, 1)
75 base_url = 'gs://%s' % bucket
76
77 code, _, ls_err = gsutil.check_call('ls', base_url)
78 if code == 403:
79 code, _, _ = gsutil.call('config')
80 if code != 0:
81 print >> sys.stderr, 'Error while authenticating to %s.' % base_url
82 elif code == 404:
83 print >> sys.stderr, '%s not found.' % base_url
84 elif code != 0:
85 print >> sys.stderr, ls_err
86 return (base_url, code)
87
88
89 def GetSHA1(filename):
90 sha1 = hashlib.sha1()
91 with open(filename, 'rb') as f:
92 while True:
93 # Read in 1mb chunks, so it doesn't all have to be loaded into memory.
94 chunk = f.read(1024*1024)
95 if not chunk:
96 break
97 sha1.update(chunk)
98 return sha1.hexdigest()
99
100
101 # Download-specific code starts here
102
103 def enumerate_work_queue(input_filename, work_queue, directory,
104 recursive, ignore_errors, output, sha1_file):
105 if sha1_file:
106 if not os.path.exists(input_filename):
107 print >> sys.stderr, '%s not found.' % input_filename
108 if not ignore_errors:
109 raise Exception('%s not found.' % input_filename)
M-A Ruel 2013/03/07 16:22:46 Use a proper exception type. You can create a new
Ryan Tseng 2013/03/07 18:51:48 Done.
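For illustration, a minimal sketch of the kind of dedicated exception type being suggested; the class name FileMissingError and the helper require_exists() are assumptions, not part of this patch:

import os
import sys

class FileMissingError(IOError):
  """Hypothetical dedicated exception for missing input files."""

def require_exists(input_filename, ignore_errors):
  # Mirrors the check above, but raises a specific type instead of a bare Exception.
  if not os.path.exists(input_filename):
    print >> sys.stderr, '%s not found.' % input_filename
    if not ignore_errors:
      raise FileMissingError('%s not found.' % input_filename)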
110 with open(input_filename, 'rb') as f:
111 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
112 if sha1_match:
113 work_queue.put(
114 (sha1_match.group(1), input_filename.replace('.sha1', '')))
115 return 1
116 print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename
M-A Ruel 2013/03/07 16:22:46 You want to print after the raise statement so tha
Ryan Tseng 2013/03/07 18:51:48 Done.
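One hedged reading of the (truncated) comment above is that the raise should come before the print, so the message is only printed when errors are being ignored. A sketch under that assumption, using a hypothetical helper covering just the .sha1 branch:

import re
import sys

def queue_sha1_file(input_filename, work_queue, ignore_errors):
  # Hypothetical extraction of the .sha1 branch above, with the raise ahead of the print.
  with open(input_filename, 'rb') as f:
    sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
  if sha1_match:
    work_queue.put((sha1_match.group(1), input_filename.replace('.sha1', '')))
    return 1
  if not ignore_errors:
    raise Exception('No sha1 sum found in %s.' % input_filename)
  print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename
  return 0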
117 if not ignore_errors:
118 raise Exception('No sha1 sum found in %s.' % input_filename)
119 return 0
120
121 if not directory:
122 work_queue.put((input_filename, output))
123 return 1
124
125 work_queue_size = 0
126 for root, dirs, files in os.walk(input_filename):
127 if not recursive:
128 # Clear dirs in place so os.walk() does not descend into subdirectories.
129 dirs[:] = []
130 else:
131 for exclude in ['.svn', '.git']:
132 if exclude in dirs:
133 dirs.remove(exclude)
134 for filename in files:
135 full_path = os.path.join(root, filename)
136 if full_path.endswith('.sha1'):
137 with open(full_path, 'rb') as f:
138 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
139 if sha1_match:
140 work_queue.put(
141 (sha1_match.group(1), full_path.replace('.sha1', '')))
142 work_queue_size += 1
143 else:
144 print >> sys.stderr, 'No sha1 sum found in %s.' % filename
145 if not ignore_errors:
146 raise Exception('No sha1 sum found in %s.' % filename)
147 return work_queue_size
148
149
150 def _downloader_worker_thread(thread_num, q, force, base_url, gsutil, out_q):
151 while True:
152 input_sha1_sum, output_filename = q.get()
153 if input_sha1_sum is None:
154 out_q.put('Thread %d is done' % thread_num)
155 return
156 if os.path.exists(output_filename) and not force:
157 if GetSHA1(output_filename) == input_sha1_sum:
158 out_q.put(
159 'File %s exists and SHA1 sum (%s) matches. Skipping.' % (
160 output_filename, input_sha1_sum))
161 continue
162 # Check if file exists.
163 file_url = '%s/%s' % (base_url, input_sha1_sum)
164 if gsutil.check_call('ls', file_url)[0] != 0:
165 out_q.put('File %s for %s does not exist, skipping.' % (
166 file_url, output_filename))
167 continue
168 # Fetch the file.
169 out_q.put('Downloading %s to %s...' % (file_url, output_filename))
170 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
171 if code != 0:
172 out_q.put(err)
173 continue  # Keep draining the queue so remaining downloads still run.
174
175
176 def printer_worker(output_queue):
177 while True:
178 line = output_queue.get()
179 # It's plausible we want to print empty lines, so only stop on None.
180 if line is None:
181 break
182 print line
183
184
185 def download_from_google_storage(
186 input_filename, base_url, gsutil, num_threads, directory, recursive,
187 force, output, ignore_errors, sha1_file):
188 # Start up all the worker threads.
189 all_threads = []
190 download_timer = time.time()
191 stdout_queue = Queue.Queue()
192 work_queue = Queue.Queue()
193 for thread_num in range(num_threads):
194 t = threading.Thread(
195 target=_downloader_worker_thread,
196 args=[thread_num, work_queue, force, base_url,
197 gsutil.clone(), stdout_queue])
198 t.daemon = True
199 t.start()
200 all_threads.append(t)
201
202 # Enumerate our work queue.
203 work_queue_size = enumerate_work_queue(
204 input_filename, work_queue, directory, recursive,
205 ignore_errors, output, sha1_file)
206 for _ in all_threads:
207 work_queue.put((None, None)) # Used to tell worker threads to stop.
208
209
210
211 # Wait for all downloads to finish.
212 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])
213 printer_thread.daemon = True
214 printer_thread.start()
215 for t in all_threads:
216 t.join()
217 stdout_queue.put(None)
218 printer_thread.join()
219
220 print 'Success.'
221 print 'Downloading %d files took %.1f second(s)' % (
222 work_queue_size, time.time() - download_timer)
223 return 0
224
225
226 def main(args):
227 usage = ('usage: %prog [options] target\nTarget must be one of:\n'
228 '(default) a sha1 sum ([A-Za-z0-9]{40}),\n'
229 '(-s or --sha1_file) a .sha1 file containing a sha1 sum on the first line,\n'
230 'or (-d or --directory) a directory to scan for .sha1 files.')
231 parser = optparse.OptionParser(usage)
232 parser.add_option('-o', '--output',
233 help='Specify the output file name. Defaults to:\n'
234 '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'
235 '(b) Given a .sha1 file or directory, the name will '
236 'match (.*).sha1.')
237 parser.add_option('-b', '--bucket',
238 help='Google Storage bucket to fetch from.')
239 parser.add_option('-e', '--boto',
240 help='Specify a custom boto file.')
241 parser.add_option('-c', '--no_resume', action='store_true',
242 help='Resume download if file is partially downloaded.')
243 parser.add_option('-f', '--force', action='store_true',
244 help='Force download even if local file exists.')
245 parser.add_option('-i', '--ignore_errors', action='store_true',
247 help='Don\'t raise an error if an invalid .sha1 file is found.')
247 parser.add_option('-r', '--recursive', action='store_true',
248 help='Scan folders recursively for .sha1 files. '
249 'Must be used with -d/--directory')
250 parser.add_option('-t', '--num_threads', default=1, type='int',
251 help='Number of downloader threads to run.')
252 parser.add_option('-d', '--directory', action='store_true',
253 help='The target is a directory. '
254 'Cannot be used with -s/--sha1_file.')
255 parser.add_option('-s', '--sha1_file', action='store_true',
256 help='The target is a file containing a sha1 sum. '
257 'Cannot be used with -d/--directory.')
258
259 (options, args) = parser.parse_args(args[1:])  # Skip the program name passed in from sys.argv.
260 if not args:
261 parser.error('Missing target.')
262 if len(args) > 1:
263 parser.error('Too many targets.')
264 if not options.bucket:
265 parser.error('Missing bucket. Specify bucket with --bucket.')
266 if options.sha1_file and options.directory:
267 parser.error('Both --directory and --sha1_file are specified, '
268 'can only specify one.')
269 elif options.recursive and not options.directory:
270 parser.error('--recursive specified but --directory not specified.')
271 elif options.output and options.directory:
272 parser.error('--directory is specified, so --output has no effect.')
273 else:
274 input_filename = args[0]
275
276 # Set output filename if not specified.
277 if not options.output and not options.directory:
278 if not options.sha1_file:
279 # Target is a sha1 sum, so output filename would also be the sha1 sum.
280 options.output = input_filename
281 elif options.sha1_file:
282 # Target is a .sha1 file.
283 if not input_filename.endswith('.sha1'):
284 parser.error('--sha1_file is specified, but the input filename '
285 'does not end with .sha1, and no --output is specified. '
286 'Either make sure the input filename has a .sha1 '
287 'extension, or specify --output.')
288 options.output = input_filename[:-5]
289 else:
290 raise NotImplementedError('Unreachable state.')
M-A Ruel 2013/03/07 16:22:46 parser.error(). It's not useful to raise from main
Ryan Tseng 2013/03/07 18:51:48 Done.
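A minimal sketch of the parser.error() alternative being suggested; the message text is taken from the patch, everything else is illustrative:

import optparse

parser = optparse.OptionParser(usage='usage: %prog [options] target')
parser.parse_args([])
# parser.error() prints the usage string plus the message to stderr and exits
# with status 2, so main() never needs to raise for a bad command line.
parser.error('Unreachable state.')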
291
292 # Check if output file already exists.
293 if not options.directory and not options.force and not options.no_resume:
294 if os.path.exists(options.output):
295 parser.error('Output file %s exists and neither --force nor '
296 '--no_resume was specified.' % options.output)
297
298 # Make sure we can find a working instance of gsutil.
299 if os.path.exists(GSUTIL_DEFAULT_PATH):
300 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)
301 else:
302 print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
303 GSUTIL_DEFAULT_PATH)
304 return 1
305
306 # Check we have a valid bucket with valid permissions.
307 base_url, code = CheckBucketPermissions(options.bucket, gsutil)
308 if code:
309 return code
310
311 return download_from_google_storage(
312 input_filename, base_url, gsutil, options.num_threads, options.directory,
313 options.recursive, options.force, options.output, options.ignore_errors,
314 options.sha1_file)
315
316
317 if __name__ == '__main__':
318 sys.exit(main(sys.argv))
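For context, a few hedged invocation examples assembled only from the options defined in this patch; the bucket name, sha1 sum, and paths are placeholders:

download_from_google_storage.py --bucket example-bucket -o output.bin 0123456789abcdef0123456789abcdef01234567
download_from_google_storage.py --bucket example-bucket -s path/to/output.bin.sha1
download_from_google_storage.py --bucket example-bucket -d -r -t 4 path/to/directory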