download_from_google_storage.py - Issue 12042069: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: Split tests, fixed ret_code seeding Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments.

Jump to:

View unified diff |

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Download files from Google Storage based on SHA1 sums."""

	7

	8

	9 import hashlib

	10 import optparse

	11 import os

	12 import Queue

	13 import re

	14 import sys

	15 import threading

	16 import time

	17

	18 import subprocess2

	19

	20

	21 GSUTIL_DEFAULT_PATH = os.path.join(

	22 os.path.dirname(os.path.abspath(__file__)),

	23 'third_party', 'gsutil', 'gsutil')

	24

	25

	26 class FileNotFoundError(IOError):

	27 pass

	28

	29

	30 class InvalidFileError(IOError):

	31 pass

	32

	33

	34 # Common utilities

	35 class Gsutil(object):

	36 """Call gsutil with some predefined settings."""

	37 def __init__(self, path, boto_path=None, timeout=None):

	38 if not os.path.exists(path):

	39 raise FileNotFoundError('GSUtil not found in %s' % path)

	40 self.path = path

	41 self.timeout = timeout

	42 self.boto_path = boto_path

	43

	44 def call(self, *args):

	45 env = os.environ.copy()

	46 if self.boto_path:

	47 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	48 return subprocess2.call((sys.executable, self.path) + args,

	49 env=env,

	50 timeout=self.timeout)

	51

	52 def check_call(self, *args):

	53 env = os.environ.copy()

	54 if self.boto_path:

	55 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	56 ((out, err), code) = subprocess2.communicate(

	57 (sys.executable, self.path) + args,

	58 stdout=subprocess2.PIPE,

	59 stderr=subprocess2.PIPE,

	60 env=env,

	61 timeout=self.timeout)

	62

	63 # Parse output.

	64 status_code_match = re.search('status=([0-9]+)', err)

	65 if status_code_match:

	66 return int(status_code_match.groups(1))

	67 if ('You are attempting to access protected data with '

	68 'no configured credentials.' in err):

	69 return (403, out, err)

	70 if 'No such object' in err:

	71 return (404, out, err)

	72 return (code, out, err)

	73

	74 def clone(self):
	M-A Ruel 2013/03/09 12:41:13 Technically, you don't need that. You can use an o Technically, you don't need that. You can use an object across threads in python if you use it in a read-only way, e.g. you never assign to a member of self. So you can: - remove this function. - add a note in the docstring that this object is immutable. - remove clone in the mock. FYI only since I don't think it's a good idea to use it here, Gsutil is simple enough to not need any special enforcement. This is possible to enforce that an object is immutable, I had written a class to enforce that: http://git.chromium.org/gitweb/?p=chromium/tools/commit-queue.git;a=blob;f=mo... Ryan Tseng 2013/03/11 17:35:14 Done. Show quoted text On 2013/03/09 12:41:13, Marc-Antoine Ruel wrote: > Technically, you don't need that. You can use an object across threads in python > if you use it in a read-only way, e.g. you never assign to a member of self. So > you can: > - remove this function. > - add a note in the docstring that this object is immutable. > - remove clone in the mock. > > FYI only since I don't think it's a good idea to use it here, Gsutil is simple > enough to not need any special enforcement. This is possible to enforce that an > object is immutable, I had written a class to enforce that: > http://git.chromium.org/gitweb/?p=chromium/tools/commit-queue.git;a=blob;f=mo... > Done.
	75 return Gsutil(self.path, self.boto_path, self.timeout)

	76

	77

	78 def check_bucket_permissions(bucket, gsutil):

	79 if not bucket:

	80 print >> sys.stderr, 'Missing bucket %s.'

	81 return (None, 1)

	82 base_url = 'gs://%s' % bucket

	83

	84 code, _, ls_err = gsutil.check_call('ls', base_url)

	85 if code == 403:

	86 code, _, _ = gsutil.call('config')

	87 if code != 0:

	88 print >> sys.stderr, 'Error while authenticating to %s.' % base_url

	89 elif code == 404:

	90 print >> sys.stderr, '%s not found.' % base_url

	91 elif code != 0:

	92 print >> sys.stderr, ls_err

	93 return (base_url, code)

	94

	95

	96 def get_sha1(filename):

	97 sha1 = hashlib.sha1()

	98 with open(filename, 'rb') as f:

	99 while True:

	100 # Read in 1mb chunks, so it doesn't all have to be loaded into memory.

	101 chunk = f.read(1024*1024)

	102 if not chunk:

	103 break

	104 sha1.update(chunk)

	105 return sha1.hexdigest()

	106

	107

	108 # Download-specific code starts here

	109

	110 def enumerate_work_queue(input_filename, work_queue, directory,

	111 recursive, ignore_errors, output, sha1_file):

	112 if sha1_file:

	113 if not os.path.exists(input_filename):

	114 if not ignore_errors:

	115 raise FileNotFoundError('%s not found.' % input_filename)

	116 print >> sys.stderr, '%s not found.' % input_filename

	117 with open(input_filename, 'rb') as f:

	118 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())

	119 if sha1_match:

	120 work_queue.put(

	121 (sha1_match.groups(1)[0], input_filename.replace('.sha1', '')))

	122 return 1

	123 if not ignore_errors:

	124 raise InvalidFileError('No sha1 sum found in %s.' % input_filename)

	125 print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename

	126 return 0

	127

	128 if not directory:

	129 work_queue.put((input_filename, output))

	130 return 1

	131

	132 work_queue_size = 0

	133 for root, dirs, files in os.walk(input_filename):

	134 if not recursive:

	135 for item in dirs[:]:

	136 dirs.remove(item)

	137 else:

	138 for exclude in ['.svn', '.git']:

	139 if exclude in dirs:

	140 dirs.remove(exclude)

	141 for filename in files:

	142 full_path = os.path.join(root, filename)

	143 if full_path.endswith('.sha1'):

	144 with open(full_path, 'rb') as f:

	145 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())

	146 if sha1_match:

	147 work_queue.put(

	148 (sha1_match.groups(1)[0], full_path.replace('.sha1', '')))

	149 work_queue_size += 1

	150 else:

	151 if not ignore_errors:

	152 raise InvalidFileError('No sha1 sum found in %s.' % filename)

	153 print >> sys.stderr, 'No sha1 sum found in %s.' % filename

	154 return work_queue_size

	155

	156

	157 def _downloader_worker_thread(thread_num, q, force, base_url,

	158 gsutil, out_q, ret_codes):

	159 while True:

	160 input_sha1_sum, output_filename = q.get()

	161 if input_sha1_sum is None:

	162 return

	163 if os.path.exists(output_filename) and not force:

	164 if get_sha1(output_filename) == input_sha1_sum:

	165 out_q.put(

	166 '%d> File %s exists and SHA1 matches. Skipping.' % (

	167 thread_num, output_filename))

	168 continue

	169 # Check if file exists.

	170 file_url = '%s/%s' % (base_url, input_sha1_sum)

	171 if gsutil.check_call('ls', file_url)[0] != 0:

	172 out_q.put('%d> File %s for %s does not exist, skipping.' % (

	173 thread_num, file_url, output_filename))

	174 ret_codes.put((1, 'File %s for %s does not exist.' % (

	175 file_url, output_filename)))

	176 continue

	177 # Fetch the file.

	178 out_q.put('%d> Downloading %s...' % (

	179 thread_num, output_filename))

	180 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)

	181 if code != 0:

	182 out_q.put('%d> %s' % (thread_num, err))

	183 ret_codes.put((code, err))

	184

	185

	186 def printer_worker(output_queue):

	187 while True:

	188 line = output_queue.get()

	189 # Its plausible we want to print empty lines.

	190 if line is None:

	191 break

	192 print line

	193

	194

	195 def download_from_google_storage(

	196 input_filename, base_url, gsutil, num_threads, directory, recursive,

	197 force, output, ignore_errors, sha1_file):

	198 # Start up all the worker threads.

	199 all_threads = []

	200 download_timer = time.time()
	M-A Ruel 2013/03/09 12:41:13 download_start download_start Ryan Tseng 2013/03/11 17:35:14 Done. Show quoted text On 2013/03/09 12:41:13, Marc-Antoine Ruel wrote: > download_start Done.
	201 stdout_queue = Queue.Queue()

	202 work_queue = Queue.Queue()

	203 ret_codes = Queue.Queue()

	204 ret_codes.put((0, None))
	M-A Ruel 2013/03/09 12:41:13 Good! Good!
	205 for thread_num in range(num_threads):

	206 t = threading.Thread(

	207 target=_downloader_worker_thread,

	208 args=[thread_num, work_queue, force, base_url,

	209 gsutil.clone(), stdout_queue, ret_codes])

	210 t.daemon = True

	211 t.start()

	212 all_threads.append(t)

	213 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])

	214 printer_thread.daemon = True

	215 printer_thread.start()

	216

	217 # Enumerate our work queue.

	218 work_queue_size = enumerate_work_queue(

	219 input_filename, work_queue, directory, recursive,

	220 ignore_errors, output, sha1_file)

	221 for _ in all_threads:

	222 work_queue.put((None, None)) # Used to tell worker threads to stop.

	223

	224 # Wait for all downloads to finish.

	225 for t in all_threads:

	226 t.join()

	227 stdout_queue.put(None)

	228 printer_thread.join()

	229

	230 # See if we ran into any errors.

	231 max_ret_code = 0

	232 for ret_code, message in ret_codes.queue:

	233 max_ret_code = max(ret_code, max_ret_code)

	234 if message:

	235 print >> sys.stderr, message

	236 if not max_ret_code:

	237 print 'Success!'

	238

	239 print 'Downloading %d files took %1f second(s)' % (

	240 work_queue_size, time.time() - download_timer)

	241 return max_ret_code

	242

	243

	244 def main(args):

	245 usage = ('usage: %prog [options] target\nTarget must be:\n'

	246 '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '

	247 '.sha1 file, containing a sha1 sum on the first line. (-d or '

	248 '--directory) A directory to scan for .sha1 files. ')

	249 parser = optparse.OptionParser(usage)

	250 parser.add_option('-o', '--output',

	251 help='Specify the output file name. Defaults to:\n'

	252 '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'

	253 '(b) Given a .sha1 file or directory, the name will '

	254 'match (.*).sha1.')

	255 parser.add_option('-b', '--bucket',

	256 help='Google Storage bucket to fetch from.')

	257 parser.add_option('-e', '--boto',

	258 help='Specify a custom boto file.')

	259 parser.add_option('-c', '--no_resume', action='store_true',

	260 help='Resume download if file is partially downloaded.')

	261 parser.add_option('-f', '--force', action='store_true',

	262 help='Force download even if local file exists.')

	263 parser.add_option('-i', '--ignore_errors', action='store_true',

	264 help='Don\'t throw error if we find an invalid .sha1 file.')

	265 parser.add_option('-r', '--recursive', action='store_true',

	266 help='Scan folders recursively for .sha1 files. '

	267 'Must be used with -d/--directory')

	268 parser.add_option('-t', '--num_threads', default=1, type='int',

	269 help='Number of downloader threads to run.')

	270 parser.add_option('-d', '--directory', action='store_true',

	271 help='The target is a directory. '

	272 'Cannot be used with -s/--sha1_file.')

	273 parser.add_option('-s', '--sha1_file', action='store_true',

	274 help='The target is a file containing a sha1 sum. '

	275 'Cannot be used with -d/--directory.')

	276

	277 (options, args) = parser.parse_args()

	278 if not args:

	279 parser.error('Missing target.')

	280 if len(args) > 1:

	281 parser.error('Too many targets.')

	282 if not options.bucket:

	283 parser.error('Missing bucket. Specify bucket with --bucket.')

	284 if options.sha1_file and options.directory:

	285 parser.error('Both --directory and --sha1_file are specified, '

	286 'can only specify one.')

	287 if options.recursive and not options.directory:

	288 parser.error('--recursive specified but --directory not specified.')

	289 if options.output and options.directory:

	290 parser.error('--directory is specified, so --output has no effect.')

	291 input_filename = args[0]

	292

	293 # Set output filename if not specified.

	294 if not options.output and not options.directory:

	295 if not options.sha1_file:

	296 # Target is a sha1 sum, so output filename would also be the sha1 sum.

	297 options.output = input_filename

	298 elif options.sha1_file:

	299 # Target is a .sha1 file.

	300 if not input_filename.endswith('.sha1'):

	301 parser.error('--sha1_file is specified, but the input filename '

	302 'does not end with .sha1, and no --output is specified. '

	303 'Either make sure the input filename has a .sha1 '

	304 'extension, or specify --output.')

	305 options.output = input_filename[:-5]

	306 else:

	307 parser.error('Unreachable state.')

	308

	309 # Check if output file already exists.

	310 if not options.directory and not options.force and not options.no_resume:

	311 if os.path.exists(options.output):

	312 parser.error('Output file %s exists and --no_resume is specified.'

	313 % options.output)

	314

	315 # Make sure we can find a working instance of gsutil.

	316 if not os.path.exists(GSUTIL_DEFAULT_PATH):

	317 parser.error('gsutil not found in %s, bad depot_tools checkout?' %

	318 GSUTIL_DEFAULT_PATH)

	319 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)

	320

	321 # Check we have a valid bucket with valid permissions.

	322 base_url, code = check_bucket_permissions(options.bucket, gsutil)

	323 if code:

	324 return code

	325

	326 return download_from_google_storage(

	327 input_filename, base_url, gsutil, options.num_threads, options.directory,

	328 options.recursive, options.force, options.output, options.ignore_errors,

	329 options.sha1_file)

	330

	331

	332 if __name__ == '__main__':

	333 sys.exit(main(sys.argv))

OLD	NEW

« no previous file with comments | « no previous file | tests/download_from_google_storage_unittests.py » ('j') | tests/download_from_google_storage_unittests.py » ('J')