Chromium Code Reviews

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Split tests, fixed ret_code seeding Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff |
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Download files from Google Storage based on SHA1 sums."""
7
8
9 import hashlib
10 import optparse
11 import os
12 import Queue
13 import re
14 import sys
15 import threading
16 import time
17
18 import subprocess2
19
20
# Default location of the gsutil tool bundled with depot_tools:
# <this script's directory>/third_party/gsutil/gsutil.
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')
24
25
class FileNotFoundError(IOError):
  """Raised when an expected local file (or gsutil itself) is absent."""
28
29
class InvalidFileError(IOError):
  """Raised when a .sha1 file does not contain a valid sha1 sum."""
32
33
34 # Common utilities
35 class Gsutil(object):
36 """Call gsutil with some predefined settings."""
37 def __init__(self, path, boto_path=None, timeout=None):
38 if not os.path.exists(path):
39 raise FileNotFoundError('GSUtil not found in %s' % path)
40 self.path = path
41 self.timeout = timeout
42 self.boto_path = boto_path
43
44 def call(self, *args):
45 env = os.environ.copy()
46 if self.boto_path:
47 env['AWS_CREDENTIAL_FILE'] = self.boto_path
48 return subprocess2.call((sys.executable, self.path) + args,
49 env=env,
50 timeout=self.timeout)
51
52 def check_call(self, *args):
53 env = os.environ.copy()
54 if self.boto_path:
55 env['AWS_CREDENTIAL_FILE'] = self.boto_path
56 ((out, err), code) = subprocess2.communicate(
57 (sys.executable, self.path) + args,
58 stdout=subprocess2.PIPE,
59 stderr=subprocess2.PIPE,
60 env=env,
61 timeout=self.timeout)
62
63 # Parse output.
64 status_code_match = re.search('status=([0-9]+)', err)
65 if status_code_match:
66 return int(status_code_match.groups(1))
67 if ('You are attempting to access protected data with '
68 'no configured credentials.' in err):
69 return (403, out, err)
70 if 'No such object' in err:
71 return (404, out, err)
72 return (code, out, err)
73
74 def clone(self):
M-A Ruel 2013/03/09 12:41:13 Technically, you don't need that. You can use an o
Ryan Tseng 2013/03/11 17:35:14 Done.
75 return Gsutil(self.path, self.boto_path, self.timeout)
76
77
def check_bucket_permissions(bucket, gsutil):
  """Verify that |bucket| exists and that we can list it.

  Args:
    bucket: bucket name (without the gs:// prefix), may be empty/None.
    gsutil: a Gsutil-like object with check_call()/call() methods.

  Returns:
    (base_url, code): the gs:// URL for the bucket (None when no bucket was
    given) and a status code, 0 on success.
  """
  if not bucket:
    # Bug fix: the message previously contained a dangling '%s' with no
    # argument and printed it literally.
    sys.stderr.write('Missing bucket.\n')
    return (None, 1)
  base_url = 'gs://%s' % bucket

  code, _, ls_err = gsutil.check_call('ls', base_url)
  if code == 403:
    # Permission denied: give the user a chance to authenticate interactively.
    # NOTE(review): Gsutil.call() returns a bare return code, not a
    # (code, out, err) tuple -- confirm this 3-way unpack matches its API.
    code, _, _ = gsutil.call('config')
    if code != 0:
      sys.stderr.write('Error while authenticating to %s.\n' % base_url)
  elif code == 404:
    sys.stderr.write('%s not found.\n' % base_url)
  elif code != 0:
    sys.stderr.write('%s\n' % ls_err)
  return (base_url, code)
94
95
def get_sha1(filename):
  """Return the hex SHA1 digest of the file at |filename|."""
  digest = hashlib.sha1()
  with open(filename, 'rb') as f:
    # Hash in 1 MiB chunks so large files never load fully into memory.
    for chunk in iter(lambda: f.read(1024 * 1024), b''):
      digest.update(chunk)
  return digest.hexdigest()
106
107
108 # Download-specific code starts here
109
def enumerate_work_queue(input_filename, work_queue, directory,
                         recursive, ignore_errors, output, sha1_file):
  """Fill |work_queue| with (sha1, output_filename) pairs to download.

  Args:
    input_filename: a bare sha1 sum, a .sha1 file, or a directory to scan,
        depending on the |sha1_file|/|directory| flags.
    work_queue: queue-like object receiving (sha1, output path) tuples.
    directory: treat input_filename as a directory of .sha1 files.
    recursive: descend into subdirectories (directory mode only).
    ignore_errors: report problems on stderr instead of raising.
    output: output filename, used only when the target is a bare sha1 sum.
    sha1_file: treat input_filename as a .sha1 file.

  Returns:
    The number of items put on the queue.

  Raises:
    FileNotFoundError: the .sha1 file is missing (unless ignore_errors).
    InvalidFileError: a file contains no valid sha1 sum (unless
        ignore_errors).
  """
  if sha1_file:
    if not os.path.exists(input_filename):
      if not ignore_errors:
        raise FileNotFoundError('%s not found.' % input_filename)
      sys.stderr.write('%s not found.\n' % input_filename)
      # Bug fix: previously fell through and tried to open the missing file,
      # raising IOError even with ignore_errors set.
      return 0
    with open(input_filename, 'rb') as f:
      sha1_match = re.match(b'^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
    if sha1_match:
      work_queue.put(
          (sha1_match.group(1), input_filename.replace('.sha1', '')))
      return 1
    if not ignore_errors:
      raise InvalidFileError('No sha1 sum found in %s.' % input_filename)
    sys.stderr.write('No sha1 sum found in %s.\n' % input_filename)
    return 0

  if not directory:
    # The target is a bare sha1 sum; download it to |output|.
    work_queue.put((input_filename, output))
    return 1

  work_queue_size = 0
  for root, dirs, files in os.walk(input_filename):
    if not recursive:
      dirs[:] = []  # Don't descend into subdirectories.
    else:
      # Skip version-control metadata directories.
      dirs[:] = [d for d in dirs if d not in ('.svn', '.git')]
    for filename in files:
      full_path = os.path.join(root, filename)
      if not full_path.endswith('.sha1'):
        continue
      with open(full_path, 'rb') as f:
        sha1_match = re.match(b'^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
      if sha1_match:
        work_queue.put(
            (sha1_match.group(1), full_path.replace('.sha1', '')))
        work_queue_size += 1
      elif not ignore_errors:
        raise InvalidFileError('No sha1 sum found in %s.' % filename)
      else:
        sys.stderr.write('No sha1 sum found in %s.\n' % filename)
  return work_queue_size
155
156
157 def _downloader_worker_thread(thread_num, q, force, base_url,
158 gsutil, out_q, ret_codes):
159 while True:
160 input_sha1_sum, output_filename = q.get()
161 if input_sha1_sum is None:
162 return
163 if os.path.exists(output_filename) and not force:
164 if get_sha1(output_filename) == input_sha1_sum:
165 out_q.put(
166 '%d> File %s exists and SHA1 matches. Skipping.' % (
167 thread_num, output_filename))
168 continue
169 # Check if file exists.
170 file_url = '%s/%s' % (base_url, input_sha1_sum)
171 if gsutil.check_call('ls', file_url)[0] != 0:
172 out_q.put('%d> File %s for %s does not exist, skipping.' % (
173 thread_num, file_url, output_filename))
174 ret_codes.put((1, 'File %s for %s does not exist.' % (
175 file_url, output_filename)))
176 continue
177 # Fetch the file.
178 out_q.put('%d> Downloading %s...' % (
179 thread_num, output_filename))
180 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
181 if code != 0:
182 out_q.put('%d> %s' % (thread_num, err))
183 ret_codes.put((code, err))
184
185
def printer_worker(output_queue):
  """Echo lines from |output_queue| until a None sentinel is received."""
  while True:
    line = output_queue.get()
    if line is None:
      # None is the stop sentinel; empty strings are still printable.
      break
    print(line)
193
194
195 def download_from_google_storage(
196 input_filename, base_url, gsutil, num_threads, directory, recursive,
197 force, output, ignore_errors, sha1_file):
198 # Start up all the worker threads.
199 all_threads = []
200 download_timer = time.time()
M-A Ruel 2013/03/09 12:41:13 download_start
Ryan Tseng 2013/03/11 17:35:14 Done.
201 stdout_queue = Queue.Queue()
202 work_queue = Queue.Queue()
203 ret_codes = Queue.Queue()
204 ret_codes.put((0, None))
M-A Ruel 2013/03/09 12:41:13 Good!
205 for thread_num in range(num_threads):
206 t = threading.Thread(
207 target=_downloader_worker_thread,
208 args=[thread_num, work_queue, force, base_url,
209 gsutil.clone(), stdout_queue, ret_codes])
210 t.daemon = True
211 t.start()
212 all_threads.append(t)
213 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])
214 printer_thread.daemon = True
215 printer_thread.start()
216
217 # Enumerate our work queue.
218 work_queue_size = enumerate_work_queue(
219 input_filename, work_queue, directory, recursive,
220 ignore_errors, output, sha1_file)
221 for _ in all_threads:
222 work_queue.put((None, None)) # Used to tell worker threads to stop.
223
224 # Wait for all downloads to finish.
225 for t in all_threads:
226 t.join()
227 stdout_queue.put(None)
228 printer_thread.join()
229
230 # See if we ran into any errors.
231 max_ret_code = 0
232 for ret_code, message in ret_codes.queue:
233 max_ret_code = max(ret_code, max_ret_code)
234 if message:
235 print >> sys.stderr, message
236 if not max_ret_code:
237 print 'Success!'
238
239 print 'Downloading %d files took %1f second(s)' % (
240 work_queue_size, time.time() - download_timer)
241 return max_ret_code
242
243
def main(args):
  """Parse command-line arguments and run the download.

  Args:
    args: full argv list, including the program name at args[0].

  Returns:
    A process exit code: 0 on success, non-zero on any failure.
  """
  usage = ('usage: %prog [options] target\nTarget must be:\n'
           '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '
           '.sha1 file, containing a sha1 sum on the first line. (-d or '
           '--directory) A directory to scan for .sha1 files. ')
  parser = optparse.OptionParser(usage)
  parser.add_option('-o', '--output',
                    help='Specify the output file name. Defaults to:\n'
                         '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'
                         '(b) Given a .sha1 file or directory, the name will '
                         'match (.*).sha1.')
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to fetch from.')
  parser.add_option('-e', '--boto',
                    help='Specify a custom boto file.')
  parser.add_option('-c', '--no_resume', action='store_true',
                    help='Resume download if file is partially downloaded.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force download even if local file exists.')
  parser.add_option('-i', '--ignore_errors', action='store_true',
                    help='Don\'t throw error if we find an invalid .sha1 file.')
  parser.add_option('-r', '--recursive', action='store_true',
                    help='Scan folders recursively for .sha1 files. '
                         'Must be used with -d/--directory')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of downloader threads to run.')
  parser.add_option('-d', '--directory', action='store_true',
                    help='The target is a directory. '
                         'Cannot be used with -s/--sha1_file.')
  parser.add_option('-s', '--sha1_file', action='store_true',
                    help='The target is a file containing a sha1 sum. '
                         'Cannot be used with -d/--directory.')

  # Bug fix: parse_args() ignored the |args| parameter and read sys.argv
  # directly; args[0] is the program name, so skip it.
  (options, args) = parser.parse_args(args[1:])
  if not args:
    parser.error('Missing target.')
  if len(args) > 1:
    parser.error('Too many targets.')
  if not options.bucket:
    parser.error('Missing bucket. Specify bucket with --bucket.')
  if options.sha1_file and options.directory:
    parser.error('Both --directory and --sha1_file are specified, '
                 'can only specify one.')
  if options.recursive and not options.directory:
    parser.error('--recursive specified but --directory not specified.')
  if options.output and options.directory:
    parser.error('--directory is specified, so --output has no effect.')
  input_filename = args[0]

  # Set output filename if not specified.
  if not options.output and not options.directory:
    if not options.sha1_file:
      # Target is a sha1 sum, so output filename would also be the sha1 sum.
      options.output = input_filename
    elif options.sha1_file:
      # Target is a .sha1 file; strip the '.sha1' suffix for the output name.
      if not input_filename.endswith('.sha1'):
        parser.error('--sha1_file is specified, but the input filename '
                     'does not end with .sha1, and no --output is specified. '
                     'Either make sure the input filename has a .sha1 '
                     'extension, or specify --output.')
      options.output = input_filename[:-5]
    else:
      parser.error('Unreachable state.')

  # Check if output file already exists.
  # NOTE(review): this fires when --no_resume is NOT passed, yet the message
  # claims it was specified -- confirm the intended --no_resume semantics.
  if not options.directory and not options.force and not options.no_resume:
    if os.path.exists(options.output):
      parser.error('Output file %s exists and --no_resume is specified.'
                   % options.output)

  # Make sure we can find a working instance of gsutil.
  if not os.path.exists(GSUTIL_DEFAULT_PATH):
    parser.error('gsutil not found in %s, bad depot_tools checkout?' %
                 GSUTIL_DEFAULT_PATH)
  gsutil = Gsutil(GSUTIL_DEFAULT_PATH)

  # Check we have a valid bucket with valid permissions.
  base_url, code = check_bucket_permissions(options.bucket, gsutil)
  if code:
    return code

  return download_from_google_storage(
      input_filename, base_url, gsutil, options.num_threads, options.directory,
      options.recursive, options.force, options.output, options.ignore_errors,
      options.sha1_file)
330
331
332 if __name__ == '__main__':
333 sys.exit(main(sys.argv))
OLDNEW
« no previous file with comments | « no previous file | tests/download_from_google_storage_unittests.py » ('j') | tests/download_from_google_storage_unittests.py » ('J')

Powered by Google App Engine