OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | 2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Download files from Google Storage based on SHA1 sums.""" | 6 """Download files from Google Storage based on SHA1 sums.""" |
7 | 7 |
8 | 8 |
9 import hashlib | 9 import hashlib |
10 import optparse | 10 import optparse |
11 import os | 11 import os |
12 import Queue | 12 import Queue |
13 import re | 13 import re |
| 14 import shutil |
14 import stat | 15 import stat |
15 import sys | 16 import sys |
| 17 import tarfile |
16 import threading | 18 import threading |
17 import time | 19 import time |
18 | 20 |
19 import subprocess2 | 21 import subprocess2 |
20 | 22 |
21 | 23 |
22 GSUTIL_DEFAULT_PATH = os.path.join( | 24 GSUTIL_DEFAULT_PATH = os.path.join( |
23 os.path.dirname(os.path.abspath(__file__)), 'gsutil.py') | 25 os.path.dirname(os.path.abspath(__file__)), 'gsutil.py') |
24 # Maps sys.platform to what we actually want to call them. | 26 # Maps sys.platform to what we actually want to call them. |
25 PLATFORM_MAPPING = { | 27 PLATFORM_MAPPING = { |
(...skipping 16 matching lines...) Expand all Loading... |
42 pass | 44 pass |
43 | 45 |
44 | 46 |
45 def GetNormalizedPlatform(): | 47 def GetNormalizedPlatform(): |
46 """Returns the result of sys.platform accounting for cygwin. | 48 """Returns the result of sys.platform accounting for cygwin. |
47 Under cygwin, this will always return "win32" like the native Python.""" | 49 Under cygwin, this will always return "win32" like the native Python.""" |
48 if sys.platform == 'cygwin': | 50 if sys.platform == 'cygwin': |
49 return 'win32' | 51 return 'win32' |
50 return sys.platform | 52 return sys.platform |
51 | 53 |
52 | |
53 # Common utilities | 54 # Common utilities |
54 class Gsutil(object): | 55 class Gsutil(object): |
55 """Call gsutil with some predefined settings. This is a convenience object, | 56 """Call gsutil with some predefined settings. This is a convenience object, |
56 and is also immutable.""" | 57 and is also immutable.""" |
57 def __init__(self, path, boto_path=None, timeout=None, version='4.7'): | 58 def __init__(self, path, boto_path=None, timeout=None, version='4.7'): |
58 if not os.path.exists(path): | 59 if not os.path.exists(path): |
59 raise FileNotFoundError('GSUtil not found in %s' % path) | 60 raise FileNotFoundError('GSUtil not found in %s' % path) |
60 self.path = path | 61 self.path = path |
61 self.timeout = timeout | 62 self.timeout = timeout |
62 self.boto_path = boto_path | 63 self.boto_path = boto_path |
(...skipping 116 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
179 work_queue.put( | 180 work_queue.put( |
180 (sha1_match.groups(1)[0], full_path.replace('.sha1', ''))) | 181 (sha1_match.groups(1)[0], full_path.replace('.sha1', ''))) |
181 work_queue_size += 1 | 182 work_queue_size += 1 |
182 else: | 183 else: |
183 if not ignore_errors: | 184 if not ignore_errors: |
184 raise InvalidFileError('No sha1 sum found in %s.' % filename) | 185 raise InvalidFileError('No sha1 sum found in %s.' % filename) |
185 print >> sys.stderr, 'No sha1 sum found in %s.' % filename | 186 print >> sys.stderr, 'No sha1 sum found in %s.' % filename |
186 return work_queue_size | 187 return work_queue_size |
187 | 188 |
188 | 189 |
| 190 def _validate_tar_file(tar, prefix): |
| 191 def _validate(tarinfo): |
| 192 """Returns false if the tarinfo is something we explicitly forbid.""" |
| 193 if tarinfo.issym() or tarinfo.islnk(): |
| 194 return False |
| 195 if '..' in tarinfo.name or not tarinfo.name.startswith(prefix): |
| 196 return False |
| 197 return True |
| 198 return all(map(_validate, tar.getmembers())) |
| 199 |
189 def _downloader_worker_thread(thread_num, q, force, base_url, | 200 def _downloader_worker_thread(thread_num, q, force, base_url, |
190 gsutil, out_q, ret_codes, verbose): | 201 gsutil, out_q, ret_codes, verbose, extract, |
| 202 delete=True): |
191 while True: | 203 while True: |
192 input_sha1_sum, output_filename = q.get() | 204 input_sha1_sum, output_filename = q.get() |
193 if input_sha1_sum is None: | 205 if input_sha1_sum is None: |
194 return | 206 return |
195 if os.path.exists(output_filename) and not force: | 207 if os.path.exists(output_filename) and not force: |
196 if get_sha1(output_filename) == input_sha1_sum: | 208 if get_sha1(output_filename) == input_sha1_sum: |
197 if verbose: | 209 if verbose: |
198 out_q.put( | 210 out_q.put( |
199 '%d> File %s exists and SHA1 matches. Skipping.' % ( | 211 '%d> File %s exists and SHA1 matches. Skipping.' % ( |
200 thread_num, output_filename)) | 212 thread_num, output_filename)) |
(...skipping 10 matching lines...) Expand all Loading... |
211 else: | 223 else: |
212 # Other error, probably auth related (bad ~/.boto, etc). | 224 # Other error, probably auth related (bad ~/.boto, etc). |
213 out_q.put('%d> Failed to fetch file %s for %s, skipping. [Err: %s]' % ( | 225 out_q.put('%d> Failed to fetch file %s for %s, skipping. [Err: %s]' % ( |
214 thread_num, file_url, output_filename, err)) | 226 thread_num, file_url, output_filename, err)) |
215 ret_codes.put((1, 'Failed to fetch file %s for %s. [Err: %s]' % ( | 227 ret_codes.put((1, 'Failed to fetch file %s for %s. [Err: %s]' % ( |
216 file_url, output_filename, err))) | 228 file_url, output_filename, err))) |
217 continue | 229 continue |
218 # Fetch the file. | 230 # Fetch the file. |
219 out_q.put('%d> Downloading %s...' % (thread_num, output_filename)) | 231 out_q.put('%d> Downloading %s...' % (thread_num, output_filename)) |
220 try: | 232 try: |
221 os.remove(output_filename) # Delete the file if it exists already. | 233 if delete: |
| 234 os.remove(output_filename) # Delete the file if it exists already. |
222 except OSError: | 235 except OSError: |
223 if os.path.exists(output_filename): | 236 if os.path.exists(output_filename): |
224 out_q.put('%d> Warning: deleting %s failed.' % ( | 237 out_q.put('%d> Warning: deleting %s failed.' % ( |
225 thread_num, output_filename)) | 238 thread_num, output_filename)) |
226 code, _, err = gsutil.check_call('cp', file_url, output_filename) | 239 code, _, err = gsutil.check_call('cp', file_url, output_filename) |
227 if code != 0: | 240 if code != 0: |
228 out_q.put('%d> %s' % (thread_num, err)) | 241 out_q.put('%d> %s' % (thread_num, err)) |
229 ret_codes.put((code, err)) | 242 ret_codes.put((code, err)) |
230 continue | 243 continue |
231 | 244 |
232 remote_sha1 = get_sha1(output_filename) | 245 remote_sha1 = get_sha1(output_filename) |
233 if remote_sha1 != input_sha1_sum: | 246 if remote_sha1 != input_sha1_sum: |
234 msg = ('%d> ERROR remote sha1 (%s) does not match expected sha1 (%s).' % | 247 msg = ('%d> ERROR remote sha1 (%s) does not match expected sha1 (%s).' % |
235 (thread_num, remote_sha1, input_sha1_sum)) | 248 (thread_num, remote_sha1, input_sha1_sum)) |
236 out_q.put(msg) | 249 out_q.put(msg) |
237 ret_codes.put((20, msg)) | 250 ret_codes.put((20, msg)) |
238 continue | 251 continue |
239 | 252 |
| 253 if extract: |
| 254 if (not tarfile.is_tarfile(output_filename) |
| 255 or not output_filename.endswith('.tar.gz')): |
| 256 out_q.put('%d> Error: %s is not a tar.gz archive.' % ( |
| 257 thread_num, output_filename)) |
| 258 ret_codes.put((1, '%s is not a tar.gz archive.' % (output_filename))) |
| 259 continue |
| 260 with tarfile.open(output_filename, 'r:gz') as tar: |
| 261 dirname = os.path.dirname(os.path.abspath(output_filename)) |
| 262 extract_dir = output_filename[0:len(output_filename)-7] |
| 263 if not _validate_tar_file(tar, os.path.basename(extract_dir)): |
| 264 out_q.put('%d> Error: %s contains files outside %s.' % ( |
| 265 thread_num, output_filename, extract_dir)) |
| 266 ret_codes.put((1, '%s contains invalid entries.' % (output_filename))) |
| 267 continue |
| 268 if os.path.exists(extract_dir): |
| 269 try: |
| 270 shutil.rmtree(extract_dir) |
| 271 out_q.put('%d> Removed %s...' % (thread_num, extract_dir)) |
| 272 except OSError: |
| 273 out_q.put('%d> Warning: Can\'t delete: %s' % ( |
| 274 thread_num, extract_dir)) |
| 275 ret_codes.put((1, 'Can\'t delete %s.' % (extract_dir))) |
| 276 continue |
| 277 out_q.put('%d> Extracting %d entries from %s to %s' % |
| 278 (thread_num, len(tar.getmembers()),output_filename, |
| 279 extract_dir)) |
| 280 tar.extractall(path=dirname) |
240 # Set executable bit. | 281 # Set executable bit. |
241 if sys.platform == 'cygwin': | 282 if sys.platform == 'cygwin': |
242 # Under cygwin, mark all files as executable. The executable flag in | 283 # Under cygwin, mark all files as executable. The executable flag in |
243 # Google Storage will not be set when uploading from Windows, so if | 284 # Google Storage will not be set when uploading from Windows, so if |
244 # this script is running under cygwin and we're downloading an | 285 # this script is running under cygwin and we're downloading an |
245 # executable, it will be unrunnable from inside cygwin without this. | 286 # executable, it will be unrunnable from inside cygwin without this. |
246 st = os.stat(output_filename) | 287 st = os.stat(output_filename) |
247 os.chmod(output_filename, st.st_mode | stat.S_IEXEC) | 288 os.chmod(output_filename, st.st_mode | stat.S_IEXEC) |
248 elif sys.platform != 'win32': | 289 elif sys.platform != 'win32': |
249 # On non-Windows platforms, key off of the custom header | 290 # On non-Windows platforms, key off of the custom header |
(...skipping 10 matching lines...) Expand all Loading... |
260 while True: | 301 while True: |
261 line = output_queue.get() | 302 line = output_queue.get() |
262 # It's plausible we want to print empty lines. | 303 # It's plausible we want to print empty lines. |
263 if line is None: | 304 if line is None: |
264 break | 305 break |
265 print line | 306 print line |
266 | 307 |
267 | 308 |
268 def download_from_google_storage( | 309 def download_from_google_storage( |
269 input_filename, base_url, gsutil, num_threads, directory, recursive, | 310 input_filename, base_url, gsutil, num_threads, directory, recursive, |
270 force, output, ignore_errors, sha1_file, verbose, auto_platform): | 311 force, output, ignore_errors, sha1_file, verbose, auto_platform, extract): |
271 # Start up all the worker threads. | 312 # Start up all the worker threads. |
272 all_threads = [] | 313 all_threads = [] |
273 download_start = time.time() | 314 download_start = time.time() |
274 stdout_queue = Queue.Queue() | 315 stdout_queue = Queue.Queue() |
275 work_queue = Queue.Queue() | 316 work_queue = Queue.Queue() |
276 ret_codes = Queue.Queue() | 317 ret_codes = Queue.Queue() |
277 ret_codes.put((0, None)) | 318 ret_codes.put((0, None)) |
278 for thread_num in range(num_threads): | 319 for thread_num in range(num_threads): |
279 t = threading.Thread( | 320 t = threading.Thread( |
280 target=_downloader_worker_thread, | 321 target=_downloader_worker_thread, |
281 args=[thread_num, work_queue, force, base_url, | 322 args=[thread_num, work_queue, force, base_url, |
282 gsutil, stdout_queue, ret_codes, verbose]) | 323 gsutil, stdout_queue, ret_codes, verbose, extract]) |
283 t.daemon = True | 324 t.daemon = True |
284 t.start() | 325 t.start() |
285 all_threads.append(t) | 326 all_threads.append(t) |
286 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue]) | 327 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue]) |
287 printer_thread.daemon = True | 328 printer_thread.daemon = True |
288 printer_thread.start() | 329 printer_thread.start() |
289 | 330 |
290 # Enumerate our work queue. | 331 # Enumerate our work queue. |
291 work_queue_size = enumerate_work_queue( | 332 work_queue_size = enumerate_work_queue( |
292 input_filename, work_queue, directory, recursive, | 333 input_filename, work_queue, directory, recursive, |
(...skipping 67 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
360 parser.add_option('-p', '--platform', | 401 parser.add_option('-p', '--platform', |
361 help='A regular expression that is compared against ' | 402 help='A regular expression that is compared against ' |
362 'Python\'s sys.platform. If this option is specified, ' | 403 'Python\'s sys.platform. If this option is specified, ' |
363 'the download will happen only if there is a match.') | 404 'the download will happen only if there is a match.') |
364 parser.add_option('-a', '--auto_platform', | 405 parser.add_option('-a', '--auto_platform', |
365 action='store_true', | 406 action='store_true', |
366 help='Detects if any parent folder of the target matches ' | 407 help='Detects if any parent folder of the target matches ' |
367 '(linux|mac|win). If so, the script will only ' | 408 '(linux|mac|win). If so, the script will only ' |
368 'process files that are in the paths that ' | 409 'process files that are in the paths that ' |
369 'match the current platform.') | 410 'match the current platform.') |
| 411 parser.add_option('-u', '--extract', |
| 412 action='store_true', |
| 413 help='Extract a downloaded tar.gz file. ' |
| 414 'Leaves the tar.gz file around for sha1 verification' |
| 415 'If a directory with the same name as the tar.gz ' |
| 416 'file already exists, is deleted (to get a ' |
| 417 'clean state in case of update.)') |
370 parser.add_option('-v', '--verbose', action='store_true', | 418 parser.add_option('-v', '--verbose', action='store_true', |
371 help='Output extra diagnostic and progress information.') | 419 help='Output extra diagnostic and progress information.') |
372 | 420 |
373 (options, args) = parser.parse_args() | 421 (options, args) = parser.parse_args() |
374 | 422 |
375 # Make sure we should run at all based on platform matching. | 423 # Make sure we should run at all based on platform matching. |
376 if options.platform: | 424 if options.platform: |
377 if options.auto_platform: | 425 if options.auto_platform: |
378 parser.error('--platform can not be specified with --auto_platform') | 426 parser.error('--platform can not be specified with --auto_platform') |
379 if not re.match(options.platform, GetNormalizedPlatform()): | 427 if not re.match(options.platform, GetNormalizedPlatform()): |
(...skipping 73 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
453 if not options.directory and not options.force and not options.no_resume: | 501 if not options.directory and not options.force and not options.no_resume: |
454 if os.path.exists(options.output): | 502 if os.path.exists(options.output): |
455 parser.error('Output file %s exists and --no_resume is specified.' | 503 parser.error('Output file %s exists and --no_resume is specified.' |
456 % options.output) | 504 % options.output) |
457 | 505 |
458 base_url = 'gs://%s' % options.bucket | 506 base_url = 'gs://%s' % options.bucket |
459 | 507 |
460 return download_from_google_storage( | 508 return download_from_google_storage( |
461 input_filename, base_url, gsutil, options.num_threads, options.directory, | 509 input_filename, base_url, gsutil, options.num_threads, options.directory, |
462 options.recursive, options.force, options.output, options.ignore_errors, | 510 options.recursive, options.force, options.output, options.ignore_errors, |
463 options.sha1_file, options.verbose, options.auto_platform) | 511 options.sha1_file, options.verbose, options.auto_platform, |
| 512 options.extract) |
464 | 513 |
465 | 514 |
466 if __name__ == '__main__': | 515 if __name__ == '__main__': |
467 sys.exit(main(sys.argv)) | 516 sys.exit(main(sys.argv)) |
OLD | NEW |