download_from_google_storage.py - Issue 12042069: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: Added exception types, renamed variables Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Download files from Google Storage based on SHA1 sums."""

	7

	8

	9 import hashlib

	10 import optparse

	11 import os

	12 import Queue

	13 import re

	14 import sys

	15 import threading

	16 import time

	17

	18 import subprocess2

	19

	20

	21 GSUTIL_DEFAULT_PATH = os.path.join(

	22 os.path.dirname(os.path.abspath(__file__)),

	23 'third_party', 'gsutil', 'gsutil')

	24

	25

	26 class FileNotFoundError(IOError):

	27 pass

	28

	29

	30 class InvalidFileError(IOError):

	31 pass

	32

	33

	34 # Common utilities

	35 class Gsutil(object):

	36 """Call gsutil with some predefined settings."""

	37 def __init__(self, path, boto_path=None, timeout=None):

	38 if not os.path.exists(path):

	39 raise FileNotFoundError('GSUtil not found in %s' % path)

	40 self.path = path

	41 self.timeout = timeout

	42 self.boto_path = boto_path

	43

	44 def call(self, *args):

	45 env = os.environ.copy()

	46 if self.boto_path is not None:

	47 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	48 return subprocess2.call((sys.executable, self.path) + args,

	49 env=env,

	50 timeout=self.timeout)

	51

	52 def check_call(self, *args):

	53 env = os.environ.copy()

	54 if self.boto_path is not None:

	55 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	56 ((out, err), code) = subprocess2.communicate(

	57 (sys.executable, self.path) + args,

	58 stdout=subprocess2.PIPE,

	59 stderr=subprocess2.PIPE,

	60 env=env,

	61 timeout=self.timeout)

	62

	63 # Parse output.

	64 status_code_match = re.search('status=([0-9]+)', err)

	65 if status_code_match:

	66 return int(status_code_match.groups(1))

	67 elif ('You are attempting to access protected data with '

	68 'no configured credentials.' in err):

	69 return (403, out, err)

	70 elif 'No such object' in err:

	71 return (404, out, err)

	72 else:

	73 return (code, out, err)

	74

	75 def clone(self):

	76 return Gsutil(self.path, self.boto_path, self.timeout)

	77

	78

	79 def check_bucket_permissions(bucket, gsutil):

	80 if not bucket:

	81 print >> sys.stderr, 'Missing bucket %s.'

	82 return (None, 1)

	83 base_url = 'gs://%s' % bucket

	84

	85 code, _, ls_err = gsutil.check_call('ls', base_url)

	86 if code == 403:

	87 code, _, _ = gsutil.call('config')

	88 if code != 0:

	89 print >> sys.stderr, 'Error while authenticating to %s.' % base_url

	90 elif code == 404:

	91 print >> sys.stderr, '%s not found.' % base_url

	92 elif code != 0:

	93 print >> sys.stderr, ls_err

	94 return (base_url, code)

	95

	96

	97 def get_sha1(filename):

	98 sha1 = hashlib.sha1()

	99 with open(filename, 'rb') as f:

	100 while True:

	101 # Read in 1mb chunks, so it doesn't all have to be loaded into memory.

	102 chunk = f.read(1024*1024)

	103 if not chunk:

	104 break

	105 sha1.update(chunk)

	106 return sha1.hexdigest()

	107

	108

	109 # Download-specific code starts here

	110

	111 def enumerate_work_queue(input_filename, work_queue, directory,

	112 recursive, ignore_errors, output, sha1_file):

	113 if sha1_file:

	114 if not os.path.exists(input_filename):

	115 if not ignore_errors:

	116 raise FileNotFoundError('%s not found.' % input_filename)

	117 print >> sys.stderr, '%s not found.' % input_filename

	118 with open(input_filename, 'rb') as f:

	119 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())

	120 if sha1_match:

	121 work_queue.put(

	122 (sha1_match.groups(1)[0], input_filename.replace('.sha1', '')))

	123 return 1

	124 if not ignore_errors:

	125 raise InvalidFileError('No sha1 sum found in %s.' % input_filename)

	126 print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename

	127 return 0

	128

	129 if not directory:

	130 work_queue.put((input_filename, output))

	131 return 1

	132

	133 work_queue_size = 0

	134 for root, dirs, files in os.walk(input_filename):

	135 if not recursive:

	136 for item in dirs[:]:

	137 dirs.remove(item)

	138 else:

	139 for exclude in ['.svn', '.git']:

	140 if exclude in dirs:

	141 dirs.remove(exclude)

	142 for filename in files:

	143 full_path = os.path.join(root, filename)

	144 if full_path.endswith('.sha1'):

	145 with open(full_path, 'rb') as f:

	146 sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())

	147 if sha1_match:

	148 work_queue.put(

	149 (sha1_match.groups(1)[0], full_path.replace('.sha1', '')))

	150 work_queue_size += 1

	151 else:

	152 if not ignore_errors:

	153 raise InvalidFileError('No sha1 sum found in %s.' % filename)

	154 print >> sys.stderr, 'No sha1 sum found in %s.' % filename

	155 return work_queue_size

	156

	157

	158 def _downloader_worker_thread(thread_num, q, force, base_url, gsutil, out_q):

	159 while True:

	160 input_sha1_sum, output_filename = q.get()

	161 if input_sha1_sum is None:

	162 out_q.put('Thread %d is done' % thread_num)
	M-A Ruel 2013/03/07 19:41:22 I'd prefer you to prefix all the messages with '%d I'd prefer you to prefix all the messages with '%d>' % thread_num and then use a constant formatting, e.g. the file name then the message, so something like: '%d> %s: file existed' % (thread_num, output_filename) I don't think it's useful to print when a thread is done, the user doesn't care. Ryan Tseng 2013/03/07 20:35:18 Removed then Show quoted text On 2013/03/07 19:41:22, Marc-Antoine Ruel wrote: > I'd prefer you to prefix all the messages with > '%d>' % thread_num > and then use a constant formatting, e.g. the file name then the message, so > something like: > '%d> %s: file existed' % (thread_num, output_filename) > > I don't think it's useful to print when a thread is done, the user doesn't care. Removed then
	163 return

	164 if os.path.exists(output_filename) and not force:

	165 if get_sha1(output_filename) == input_sha1_sum:

	166 out_q.put(

	167 'File %s exists and SHA1 sum (%s) matches. Skipping.' % (
	M-A Ruel 2013/03/07 19:41:22 I don't think it's useful to print the hash, it's I don't think it's useful to print the hash, it's a 40 characters string.. Ryan Tseng 2013/03/07 20:35:18 Done, an error code is queued in the return code q Show quoted text On 2013/03/07 19:41:22, Marc-Antoine Ruel wrote: > I don't think it's useful to print the hash, it's a 40 characters string.. Done, an error code is queued in the return code queue so it fails.
	168 output_filename , input_sha1_sum))

	169 continue

	170 # Check if file exists.

	171 file_url = '%s/%s' % (base_url, input_sha1_sum)

	172 if gsutil.check_call('ls', file_url)[0] != 0:

	173 out_q.put('File %s for %s does not exist, skipping.' % (

	174 file_url, output_filename))

	175 continue

	176 # Fetch the file.

	177 out_q.put('Downloading %s to %s...' % (file_url, output_filename))

	178 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
	M-A Ruel 2013/03/07 19:41:22 Will it fail if the file was already present? Will it fail if the file was already present? Ryan Tseng 2013/03/07 20:35:18 It won't fail, "gsutil cp" will just overwrite the Show quoted text On 2013/03/07 19:41:22, Marc-Antoine Ruel wrote: > Will it fail if the file was already present? It won't fail, "gsutil cp" will just overwrite the file. If the file is already present and correct, line 167 would skip the file. M-A Ruel 2013/03/07 22:26:56 Perfect, I just wanted to make sure you asserted t Show quoted text On 2013/03/07 20:35:18, Ryan T. wrote: > On 2013/03/07 19:41:22, Marc-Antoine Ruel wrote: > > Will it fail if the file was already present? > > It won't fail, "gsutil cp" will just overwrite the file. > > If the file is already present and correct, line 167 would skip the file. Perfect, I just wanted to make sure you asserted that.
	179 if code != 0:

	180 out_q.put(err)

	181 return code

	182

	183

	184 def printer_worker(output_queue):

	185 while True:

	186 line = output_queue.get()

	187 # Its pausible we want to print empty lines.
	M-A Ruel 2013/03/07 19:41:22 plausible plausible Ryan Tseng 2013/03/07 20:35:18 Done. Show quoted text On 2013/03/07 19:41:22, Marc-Antoine Ruel wrote: > plausible Done.
	188 if line is None:

	189 break

	190 print line

	191

	192

	193 def download_from_google_storage(

	194 input_filename, base_url, gsutil, num_threads, directory, recursive,

	195 force, output, ignore_errors, sha1_file):

	196 # Start up all the worker threads.

	197 all_threads = []

	198 download_timer = time.time()

	199 stdout_queue = Queue.Queue()

	200 work_queue = Queue.Queue()

	201 for thread_num in range(num_threads):

	202 t = threading.Thread(

	203 target=_downloader_worker_thread,

	204 args=[thread_num, work_queue, force, base_url,

	205 gsutil.clone(), stdout_queue])

	206 t.daemon = True

	207 t.start()

	208 all_threads.append(t)

	209

	210 # Enumerate our work queue.

	211 work_queue_size = enumerate_work_queue(

	212 input_filename, work_queue, directory, recursive,

	213 ignore_errors, output, sha1_file)

	214 for _ in all_threads:

	215 work_queue.put((None, None)) # Used to tell worker threads to stop.

	216

	217
	M-A Ruel 2013/03/07 19:41:22 3 lines -> 1 line 3 lines -> 1 line Ryan Tseng 2013/03/07 20:35:18 Done. Show quoted text On 2013/03/07 19:41:22, Marc-Antoine Ruel wrote: > 3 lines -> 1 line Done.
	218

	219 # Wait for all downloads to finish.
	M-A Ruel 2013/03/07 19:41:22 You should start this thread because starting to e You should start this thread because starting to enumerate. Ryan Tseng 2013/03/07 20:35:18 Done. Show quoted text On 2013/03/07 19:41:22, Marc-Antoine Ruel wrote: > You should start this thread because starting to enumerate. Done.
	220 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])

	221 printer_thread.daemon = True

	222 printer_thread.start()

	223 for t in all_threads:

	224 t.join()

	225 stdout_queue.put(None)

	226 printer_thread.join()

	227

	228 print 'Success.'

	229 print 'Downloading %d files took %1f second(s)' % (

	230 work_queue_size, time.time() - download_timer)

	231 return 0

	232

	233

	234 def main(args):

	235 usage = ('usage: %prog [options] target\nTarget must be:\n'

	236 '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '

	237 '.sha1 file, containing a sha1 sum on the first line. (-d or '

	238 '--directory) A directory to scan for .sha1 files. ')

	239 parser = optparse.OptionParser(usage)

	240 parser.add_option('-o', '--output',

	241 help='Specify the output file name. Defaults to:\n'

	242 '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'

	243 '(b) Given a .sha1 file or directory, the name will '

	244 'match (.*).sha1.')

	245 parser.add_option('-b', '--bucket',

	246 help='Google Storage bucket to fetch from.')

	247 parser.add_option('-e', '--boto',

	248 help='Specify a custom boto file.')

	249 parser.add_option('-c', '--no_resume', action='store_true',

	250 help='Resume download if file is partially downloaded.')

	251 parser.add_option('-f', '--force', action='store_true',

	252 help='Force download even if local file exists.')

	253 parser.add_option('-i', '--ignore_errors', action='store_true',

	254 help='Don\'t throw error if we find an invalid .sha1 file.')

	255 parser.add_option('-r', '--recursive', action='store_true',

	256 help='Scan folders recursively for .sha1 files. '

	257 'Must be used with -d/--directory')

	258 parser.add_option('-t', '--num_threads', default=1, type='int',

	259 help='Number of downloader threads to run.')

	260 parser.add_option('-d', '--directory', action='store_true',

	261 help='The target is a directory. '

	262 'Cannot be used with -s/--sha1_file.')

	263 parser.add_option('-s', '--sha1_file', action='store_true',

	264 help='The target is a file containing a sha1 sum. '

	265 'Cannot be used with -d/--directory.')

	266

	267 (options, args) = parser.parse_args()

	268 if not args:

	269 parser.error('Missing target.')

	270 if len(args) > 1:

	271 parser.error('Too many targets.')

	272 if not options.bucket:

	273 parser.error('Missing bucket. Specify bucket with --bucket.')

	274 if options.sha1_file and options.directory:

	275 parser.error('Both --directory and --sha1_file are specified, '

	276 'can only specify one.')

	277 elif options.recursive and not options.directory:

	278 parser.error('--recursive specified but --directory not specified.')

	279 elif options.output and options.directory:

	280 parser.error('--directory is specified, so --output has no effect.')

	281 else:

	282 input_filename = args[0]

	283

	284 # Set output filename if not specified.

	285 if not options.output and not options.directory:

	286 if not options.sha1_file:

	287 # Target is a sha1 sum, so output filename would also be the sha1 sum.

	288 options.output = input_filename

	289 elif options.sha1_file:

	290 # Target is a .sha1 file.

	291 if not input_filename.endswith('.sha1'):

	292 parser.error('--sha1_file is specified, but the input filename '

	293 'does not end with .sha1, and no --output is specified. '

	294 'Either make sure the input filename has a .sha1 '

	295 'extension, or specify --output.')

	296 options.output = input_filename[:-5]

	297 else:

	298 raise parser.error('Unreachable state.')

	299

	300 # Check if output file already exists.

	301 if not options.directory and not options.force and not options.no_resume:

	302 if os.path.exists(options.output):

	303 parser.error('Output file %s exists and --no_resume is specified.'

	304 % options.output)

	305

	306 # Make sure we can find a working instance of gsutil.

	307 if os.path.exists(GSUTIL_DEFAULT_PATH):

	308 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)

	309 else:

	310 print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %

	311 GSUTIL_DEFAULT_PATH)

	312 return 1

	313

	314 # Check we have a valid bucket with valid permissions.

	315 base_url, code = check_bucket_permissions(options.bucket, gsutil)

	316 if code:

	317 return code

	318

	319 return download_from_google_storage(

	320 input_filename, base_url, gsutil, options.num_threads, options.directory,

	321 options.recursive, options.force, options.output, options.ignore_errors,

	322 options.sha1_file)

	323

	324

	325 if __name__ == '__main__':

	326 sys.exit(main(sys.argv))

OLD	NEW

« no previous file with comments | « no previous file | tests/gstools/download_test_data/rootfolder_text.txt » ('j') | upload_to_google_storage.py » ('J')