OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Download files from Google Storage based on SHA1 sums.""" | 6 """Download files from Google Storage based on SHA1 sums.""" |
7 | 7 |
8 | 8 |
9 import hashlib | 9 import hashlib |
10 import optparse | 10 import optparse |
11 import os | 11 import os |
12 import Queue | 12 import Queue |
13 import re | 13 import re |
14 import shutil | |
15 import stat | 14 import stat |
16 import sys | 15 import sys |
17 import tarfile | |
18 import threading | 16 import threading |
19 import time | 17 import time |
20 | 18 |
21 import subprocess2 | 19 import subprocess2 |
22 | 20 |
23 | 21 |
24 GSUTIL_DEFAULT_PATH = os.path.join( | 22 GSUTIL_DEFAULT_PATH = os.path.join( |
25 os.path.dirname(os.path.abspath(__file__)), 'gsutil.py') | 23 os.path.dirname(os.path.abspath(__file__)), 'gsutil.py') |
26 # Maps sys.platform to what we actually want to call them. | 24 # Maps sys.platform to what we actually want to call them. |
27 PLATFORM_MAPPING = { | 25 PLATFORM_MAPPING = { |
(...skipping 16 matching lines...) Expand all Loading... |
44 pass | 42 pass |
45 | 43 |
46 | 44 |
47 def GetNormalizedPlatform(): | 45 def GetNormalizedPlatform(): |
48 """Returns the result of sys.platform accounting for cygwin. | 46 """Returns the result of sys.platform accounting for cygwin. |
49 Under cygwin, this will always return "win32" like the native Python.""" | 47 Under cygwin, this will always return "win32" like the native Python.""" |
50 if sys.platform == 'cygwin': | 48 if sys.platform == 'cygwin': |
51 return 'win32' | 49 return 'win32' |
52 return sys.platform | 50 return sys.platform |
53 | 51 |
| 52 |
54 # Common utilities | 53 # Common utilities |
55 class Gsutil(object): | 54 class Gsutil(object): |
56 """Call gsutil with some predefined settings. This is a convenience object, | 55 """Call gsutil with some predefined settings. This is a convenience object, |
57 and is also immutable.""" | 56 and is also immutable.""" |
58 def __init__(self, path, boto_path=None, timeout=None, version='4.7'): | 57 def __init__(self, path, boto_path=None, timeout=None, version='4.7'): |
59 if not os.path.exists(path): | 58 if not os.path.exists(path): |
60 raise FileNotFoundError('GSUtil not found in %s' % path) | 59 raise FileNotFoundError('GSUtil not found in %s' % path) |
61 self.path = path | 60 self.path = path |
62 self.timeout = timeout | 61 self.timeout = timeout |
63 self.boto_path = boto_path | 62 self.boto_path = boto_path |
(...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
180 work_queue.put( | 179 work_queue.put( |
181 (sha1_match.groups(1)[0], full_path.replace('.sha1', ''))) | 180 (sha1_match.groups(1)[0], full_path.replace('.sha1', ''))) |
182 work_queue_size += 1 | 181 work_queue_size += 1 |
183 else: | 182 else: |
184 if not ignore_errors: | 183 if not ignore_errors: |
185 raise InvalidFileError('No sha1 sum found in %s.' % filename) | 184 raise InvalidFileError('No sha1 sum found in %s.' % filename) |
186 print >> sys.stderr, 'No sha1 sum found in %s.' % filename | 185 print >> sys.stderr, 'No sha1 sum found in %s.' % filename |
187 return work_queue_size | 186 return work_queue_size |
188 | 187 |
189 | 188 |
190 def _validate_tar_file(tar, prefix): | |
191 def _validate(tarinfo): | |
192 """Returns false if the tarinfo is something we explicitly forbid.""" | |
193 if tarinfo.issym() or tarinfo.islnk(): | |
194 return False | |
195 if '..' in tarinfo.name or not tarinfo.name.startswith(prefix): | |
196 return False | |
197 return True | |
198 return all(map(_validate, tar.getmembers())) | |
199 | |
200 def _downloader_worker_thread(thread_num, q, force, base_url, | 189 def _downloader_worker_thread(thread_num, q, force, base_url, |
201 gsutil, out_q, ret_codes, verbose, extract, | 190 gsutil, out_q, ret_codes, verbose): |
202 delete=True): | |
203 while True: | 191 while True: |
204 input_sha1_sum, output_filename = q.get() | 192 input_sha1_sum, output_filename = q.get() |
205 if input_sha1_sum is None: | 193 if input_sha1_sum is None: |
206 return | 194 return |
207 if os.path.exists(output_filename) and not force: | 195 if os.path.exists(output_filename) and not force: |
208 if get_sha1(output_filename) == input_sha1_sum: | 196 if get_sha1(output_filename) == input_sha1_sum: |
209 if verbose: | 197 if verbose: |
210 out_q.put( | 198 out_q.put( |
211 '%d> File %s exists and SHA1 matches. Skipping.' % ( | 199 '%d> File %s exists and SHA1 matches. Skipping.' % ( |
212 thread_num, output_filename)) | 200 thread_num, output_filename)) |
(...skipping 10 matching lines...) Expand all Loading... |
223 else: | 211 else: |
224 # Other error, probably auth related (bad ~/.boto, etc). | 212 # Other error, probably auth related (bad ~/.boto, etc). |
225 out_q.put('%d> Failed to fetch file %s for %s, skipping. [Err: %s]' % ( | 213 out_q.put('%d> Failed to fetch file %s for %s, skipping. [Err: %s]' % ( |
226 thread_num, file_url, output_filename, err)) | 214 thread_num, file_url, output_filename, err)) |
227 ret_codes.put((1, 'Failed to fetch file %s for %s. [Err: %s]' % ( | 215 ret_codes.put((1, 'Failed to fetch file %s for %s. [Err: %s]' % ( |
228 file_url, output_filename, err))) | 216 file_url, output_filename, err))) |
229 continue | 217 continue |
230 # Fetch the file. | 218 # Fetch the file. |
231 out_q.put('%d> Downloading %s...' % (thread_num, output_filename)) | 219 out_q.put('%d> Downloading %s...' % (thread_num, output_filename)) |
232 try: | 220 try: |
233 if delete: | 221 os.remove(output_filename) # Delete the file if it exists already. |
234 os.remove(output_filename) # Delete the file if it exists already. | |
235 except OSError: | 222 except OSError: |
236 if os.path.exists(output_filename): | 223 if os.path.exists(output_filename): |
237 out_q.put('%d> Warning: deleting %s failed.' % ( | 224 out_q.put('%d> Warning: deleting %s failed.' % ( |
238 thread_num, output_filename)) | 225 thread_num, output_filename)) |
239 code, _, err = gsutil.check_call('cp', file_url, output_filename) | 226 code, _, err = gsutil.check_call('cp', file_url, output_filename) |
240 if code != 0: | 227 if code != 0: |
241 out_q.put('%d> %s' % (thread_num, err)) | 228 out_q.put('%d> %s' % (thread_num, err)) |
242 ret_codes.put((code, err)) | 229 ret_codes.put((code, err)) |
243 | 230 |
244 if extract: | |
245 if (not tarfile.is_tarfile(output_filename) | |
246 or not output_filename.endswith('.tar.gz')): | |
247 out_q.put('%d> Error: %s is not a tar.gz archive.' % ( | |
248 thread_num, output_filename)) | |
249 ret_codes.put((1, '%s is not a tar.gz archive.' % (output_filename))) | |
250 continue | |
251 with tarfile.open(output_filename, 'r:gz') as tar: | |
252 dirname = os.path.dirname(os.path.abspath(output_filename)) | |
253 extract_dir = output_filename[0:len(output_filename)-7] | |
254 if not _validate_tar_file(tar, os.path.basename(extract_dir)): | |
255 out_q.put('%d> Error: %s contains files outside %s.' % ( | |
256 thread_num, output_filename, extract_dir)) | |
257 ret_codes.put((1, '%s contains invalid entries.' % (output_filename))) | |
258 continue | |
259 if os.path.exists(extract_dir): | |
260 try: | |
261 shutil.rmtree(extract_dir) | |
262 out_q.put('%d> Removed %s...' % (thread_num, extract_dir)) | |
263 except OSError: | |
264 out_q.put('%d> Warning: Can\'t delete: %s' % ( | |
265 thread_num, extract_dir)) | |
266 ret_codes.put((1, 'Can\'t delete %s.' % (extract_dir))) | |
267 continue | |
268 out_q.put('%d> Extracting %d entries from %s to %s' % | |
269 (thread_num, len(tar.getmembers()),output_filename, | |
270 extract_dir)) | |
271 tar.extractall(path=dirname) | |
272 # Set executable bit. | 231 # Set executable bit. |
273 if sys.platform == 'cygwin': | 232 if sys.platform == 'cygwin': |
274 # Under cygwin, mark all files as executable. The executable flag in | 233 # Under cygwin, mark all files as executable. The executable flag in |
275 # Google Storage will not be set when uploading from Windows, so if | 234 # Google Storage will not be set when uploading from Windows, so if |
276 # this script is running under cygwin and we're downloading an | 235 # this script is running under cygwin and we're downloading an |
277 # executable, it will be unrunnable from inside cygwin without this. | 236 # executable, it will be unrunnable from inside cygwin without this. |
278 st = os.stat(output_filename) | 237 st = os.stat(output_filename) |
279 os.chmod(output_filename, st.st_mode | stat.S_IEXEC) | 238 os.chmod(output_filename, st.st_mode | stat.S_IEXEC) |
280 elif sys.platform != 'win32': | 239 elif sys.platform != 'win32': |
281 # On non-Windows platforms, key off of the custom header | 240 # On non-Windows platforms, key off of the custom header |
(...skipping 10 matching lines...) Expand all Loading... |
292 while True: | 251 while True: |
293 line = output_queue.get() | 252 line = output_queue.get() |
294 # Its plausible we want to print empty lines. | 253 # Its plausible we want to print empty lines. |
295 if line is None: | 254 if line is None: |
296 break | 255 break |
297 print line | 256 print line |
298 | 257 |
299 | 258 |
300 def download_from_google_storage( | 259 def download_from_google_storage( |
301 input_filename, base_url, gsutil, num_threads, directory, recursive, | 260 input_filename, base_url, gsutil, num_threads, directory, recursive, |
302 force, output, ignore_errors, sha1_file, verbose, auto_platform, extract): | 261 force, output, ignore_errors, sha1_file, verbose, auto_platform): |
303 # Start up all the worker threads. | 262 # Start up all the worker threads. |
304 all_threads = [] | 263 all_threads = [] |
305 download_start = time.time() | 264 download_start = time.time() |
306 stdout_queue = Queue.Queue() | 265 stdout_queue = Queue.Queue() |
307 work_queue = Queue.Queue() | 266 work_queue = Queue.Queue() |
308 ret_codes = Queue.Queue() | 267 ret_codes = Queue.Queue() |
309 ret_codes.put((0, None)) | 268 ret_codes.put((0, None)) |
310 for thread_num in range(num_threads): | 269 for thread_num in range(num_threads): |
311 t = threading.Thread( | 270 t = threading.Thread( |
312 target=_downloader_worker_thread, | 271 target=_downloader_worker_thread, |
313 args=[thread_num, work_queue, force, base_url, | 272 args=[thread_num, work_queue, force, base_url, |
314 gsutil, stdout_queue, ret_codes, verbose, extract]) | 273 gsutil, stdout_queue, ret_codes, verbose]) |
315 t.daemon = True | 274 t.daemon = True |
316 t.start() | 275 t.start() |
317 all_threads.append(t) | 276 all_threads.append(t) |
318 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue]) | 277 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue]) |
319 printer_thread.daemon = True | 278 printer_thread.daemon = True |
320 printer_thread.start() | 279 printer_thread.start() |
321 | 280 |
322 # Enumerate our work queue. | 281 # Enumerate our work queue. |
323 work_queue_size = enumerate_work_queue( | 282 work_queue_size = enumerate_work_queue( |
324 input_filename, work_queue, directory, recursive, | 283 input_filename, work_queue, directory, recursive, |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
392 parser.add_option('-p', '--platform', | 351 parser.add_option('-p', '--platform', |
393 help='A regular expression that is compared against ' | 352 help='A regular expression that is compared against ' |
394 'Python\'s sys.platform. If this option is specified, ' | 353 'Python\'s sys.platform. If this option is specified, ' |
395 'the download will happen only if there is a match.') | 354 'the download will happen only if there is a match.') |
396 parser.add_option('-a', '--auto_platform', | 355 parser.add_option('-a', '--auto_platform', |
397 action='store_true', | 356 action='store_true', |
398 help='Detects if any parent folder of the target matches ' | 357 help='Detects if any parent folder of the target matches ' |
399 '(linux|mac|win). If so, the script will only ' | 358 '(linux|mac|win). If so, the script will only ' |
400 'process files that are in the paths that ' | 359 'process files that are in the paths that ' |
401 'that matches the current platform.') | 360 'that matches the current platform.') |
402 parser.add_option('-u', '--extract', | |
403 action='store_true', | |
404 help='Extract a downloaded tar.gz file. ' | |
405 'Leaves the tar.gz file around for sha1 verification' | |
406 'If a directory with the same name as the tar.gz ' | |
407 'file already exists, is deleted (to get a ' | |
408 'clean state in case of update.)') | |
409 parser.add_option('-v', '--verbose', action='store_true', | 361 parser.add_option('-v', '--verbose', action='store_true', |
410 help='Output extra diagnostic and progress information.') | 362 help='Output extra diagnostic and progress information.') |
411 | 363 |
412 (options, args) = parser.parse_args() | 364 (options, args) = parser.parse_args() |
413 | 365 |
414 # Make sure we should run at all based on platform matching. | 366 # Make sure we should run at all based on platform matching. |
415 if options.platform: | 367 if options.platform: |
416 if options.auto_platform: | 368 if options.auto_platform: |
417 parser.error('--platform can not be specified with --auto_platform') | 369 parser.error('--platform can not be specified with --auto_platform') |
418 if not re.match(options.platform, GetNormalizedPlatform()): | 370 if not re.match(options.platform, GetNormalizedPlatform()): |
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
492 if not options.directory and not options.force and not options.no_resume: | 444 if not options.directory and not options.force and not options.no_resume: |
493 if os.path.exists(options.output): | 445 if os.path.exists(options.output): |
494 parser.error('Output file %s exists and --no_resume is specified.' | 446 parser.error('Output file %s exists and --no_resume is specified.' |
495 % options.output) | 447 % options.output) |
496 | 448 |
497 base_url = 'gs://%s' % options.bucket | 449 base_url = 'gs://%s' % options.bucket |
498 | 450 |
499 return download_from_google_storage( | 451 return download_from_google_storage( |
500 input_filename, base_url, gsutil, options.num_threads, options.directory, | 452 input_filename, base_url, gsutil, options.num_threads, options.directory, |
501 options.recursive, options.force, options.output, options.ignore_errors, | 453 options.recursive, options.force, options.output, options.ignore_errors, |
502 options.sha1_file, options.verbose, options.auto_platform, | 454 options.sha1_file, options.verbose, options.auto_platform) |
503 options.extract) | |
504 | 455 |
505 | 456 |
506 if __name__ == '__main__': | 457 if __name__ == '__main__': |
507 sys.exit(main(sys.argv)) | 458 sys.exit(main(sys.argv)) |
OLD | NEW |