upload_to_google_storage.py - Issue 12042069: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: More review fixes Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Uploads files to Google Storage content addressed."""

	7

	8 import hashlib

	9 import optparse

	10 import os

	11 import Queue

	12 import re

	13 import sys

	14 import threading

	15 import time

	16

	17 from download_from_google_storage import check_bucket_permissions

	18 from download_from_google_storage import get_sha1

	19 from download_from_google_storage import Gsutil

	20 from download_from_google_storage import printer_worker

	21

	22 GSUTIL_DEFAULT_PATH = os.path.join(

	23 os.path.dirname(os.path.abspath(__file__)),

	24 'third_party', 'gsutil', 'gsutil')

	25

	26 USAGE_STRING = """%prog [options] target [target2 ...].

	27 Target is the file intended to be uploaded to Google Storage.

	28 If target is "-", then a list of files will be taken from standard input

	29

	30 This script will generate a file (original filename).sha1 containing the

	31 sha1 sum of the uploaded file.

	32 It is recommended that the .sha1 file is checked into the repository,

	33 the original file removed from the repository, and a hook added to the

	34 DEPS file to call download_from_google_storage.py.

	35

	36 Example usages

	37 --------------

	38

	39 Scan the current directory and upload all files larger than 1MB:

	40 find . -name .svn -prune -o -size +1000k -type f -print0 \| %prog -0 -

	41 """

	42

	43

	44 def get_md5(filename):

	45 md5_calculator = hashlib.md5()

	46 with open(filename, 'rb') as f:

	47 while True:

	48 chunk = f.read(1024*1024)

	49 if not chunk:

	50 break

	51 md5_calculator.update(chunk)

	52 return md5_calculator.hexdigest()

	53

	54

	55 def get_md5_cached(filename):

	56 """Don't calculate the MD5 if we can find a .md5 file."""

	57 # See if we can find an existing MD5 sum stored in a file.

	58 if os.path.exists('%s.md5' % filename):

	59 with open('%s.md5' % filename) as f:
	M-A Ruel 2013/03/08 02:07:00 I prefer 'rb' for consistency with the other calls I prefer 'rb' for consistency with the other calls Ryan Tseng 2013/03/08 02:34:37 Done. Show quoted text On 2013/03/08 02:07:00, Marc-Antoine Ruel wrote: > I prefer 'rb' for consistency with the other calls Done.
	60 md5_match = re.search('([a-z0-9]{32})', f.read())

	61 if md5_match:

	62 return md5_match.group(1)

	63 else:

	64 md5_hash = get_md5(filename)

	65 with open('%s.md5' % filename, 'w') as f:

	66 f.write(md5_hash)

	67 return md5_hash

	68

	69

	70 def _upload_worker(

	71 thread_num, q, base_url, gsutil, md5_lock, force,
	M-A Ruel 2013/03/08 02:07:00 s/q/upload_queue/ s/q/upload_queue/ Ryan Tseng 2013/03/08 02:34:37 Done. Show quoted text On 2013/03/08 02:07:00, Marc-Antoine Ruel wrote: > s/q/upload_queue/ Done.
	72 use_md5, stdout_queue, ret_codes):

	73 while True:

	74 filename, sha1_sum = q.get()

	75 if not filename:

	76 break

	77 file_url = '%s/%s' % (base_url, sha1_sum)

	78 if gsutil.check_call('ls', file_url)[0] == 0 and not force:

	79 # File exists, check MD5 hash.

	80 _, out, _ = gsutil.check_call('ls', '-L', file_url)

	81 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)

	82 if etag_match:

	83 remote_md5 = etag_match.group(1)

	84 # Calculate the MD5 checksum to match it to Google Storage's ETag.

	85 with md5_lock:

	86 if use_md5:

	87 local_md5 = get_md5_cached(filename)

	88 else:

	89 local_md5 = get_md5(filename)

	90 if local_md5 == remote_md5:

	91 stdout_queue.put(

	92 '%d> File %s already exists at %s and MD5 matches, exiting' %

	93 (thread_num, filename, file_url))

	94 continue

	95 stdout_queue.put('%d> Uploading %s to %s' % (

	96 thread_num, filename, file_url))

	97 code, _, err = gsutil.check_call('cp', '-q', filename, file_url)

	98 if code != 0:

	99 ret_codes.put(

	100 (code,

	101 'Encountered error on uploading %s to %s\n%s' %

	102 (filename, file_url, err)))

	103 continue

	104

	105

	106 def get_targets(args, parser, use_null_terminator):

	107 if not args:

	108 parser.error('Missing target.')

	109

	110 if len(args) == 1 and args[0] == '-':

	111 # Take stdin as a newline or null seperated list of files.

	112 if use_null_terminator:

	113 return sys.stdin.read().split('\0')

	114 else:

	115 return sys.stdin.read().splitlines()

	116 else:

	117 return args

	118

	119

	120 def upload_to_google_storage(

	121 input_filenames, base_url, gsutil, force,

	122 use_md5, num_threads, skip_hashing):

	123 # We only want one MD5 calculation happening at a time to avoid HD thrashing.

	124 md5_lock = threading.Lock()

	125

	126 # Start up all the worker threads.

	127 all_threads = []

	128 ret_codes = Queue.Queue()

	129 ret_codes.put((0, None))

	130 upload_queue = Queue.Queue()

	131 upload_timer = time.time()

	132 stdout_queue = Queue.Queue()

	133 for thread_num in range(num_threads):

	134 t = threading.Thread(

	135 target=_upload_worker,

	136 args=[thread_num, upload_queue, base_url, gsutil.clone(), md5_lock,

	137 force, use_md5, stdout_queue, ret_codes])

	138 t.daemon = True

	139 t.start()

	140 all_threads.append(t)

	141

	142 # We want to hash everything in a single thread since its faster.

	143 # The bottleneck is in disk IO, not CPU.

	144 hashing_start = time.time()

	145 for filename in input_filenames:

	146 if not os.path.exists(filename):

	147 print 'Error: %s not found, skipping.' % filename

	148 continue

	149 if os.path.exists('%s.sha1' % filename) and skip_hashing:

	150 print 'Found hash for %s, skipping.' % filename

	151 with open(filename + '.sha1', 'rb') as f:

	152 upload_queue.put((filename, f.read()))
	M-A Ruel 2013/03/08 02:07:00 you should limit the read size so the script doesn you should limit the read size so the script doesn't end up reading a 1mb .sha1 file by error. Not that it'll likely happen but still. And by "verification", I mean ensuring it's 40 hex characters. Ryan Tseng 2013/03/08 02:34:37 Oops, yes it should definitely verify that its a 4 Show quoted text On 2013/03/08 02:07:00, Marc-Antoine Ruel wrote: > you should limit the read size so the script doesn't end up reading a 1mb .sha1 > file by error. Not that it'll likely happen but still. > > And by "verification", I mean ensuring it's 40 hex characters. Oops, yes it should definitely verify that its a 40 char hex string. It should also throw an error and exit since something is almost certainly wrong.
	153 continue

	154 print 'Calculating hash for %s...' % filename,

	155 sha1_sum = get_sha1(filename)

	156 with open(filename + '.sha1', 'wb') as f:

	157 f.write(sha1_sum)

	158 print 'done'

	159 upload_queue.put((filename, sha1_sum))

	160 hashing_duration = time.time() - hashing_start

	161

	162 # Wait for everything to finish.

	163 for _ in all_threads:

	164 upload_queue.put((None, None)) # To mark the end of the work queue.

	165 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])
	M-A Ruel 2013/03/08 02:07:00 Start it earlier. Start it earlier. Ryan Tseng 2013/03/08 02:34:37 Done. Show quoted text On 2013/03/08 02:07:00, Marc-Antoine Ruel wrote: > Start it earlier. Done.
	166 printer_thread.daemon = True

	167 printer_thread.start()

	168 for t in all_threads:

	169 t.join()

	170 stdout_queue.put(None)

	171 printer_thread.join()

	172

	173 # Print timing information.

	174 print 'Hashing %s files took %1f seconds' % (

	175 len(input_filenames), hashing_duration)

	176 print 'Uploading took %1f seconds' % (time.time() - upload_timer)

	177

	178 # See if we ran into any errors.

	179 max_ret_code = 0

	180 for ret_code, message in ret_codes.queue:

	181 max_ret_code = max(ret_code, max_ret_code)

	182 if message:

	183 print >> sys.stderr, message

	184

	185 if not max_ret_code:

	186 print 'Success!'

	187

	188 return max_ret_code

	189

	190

	191 def main(args):

	192 parser = optparse.OptionParser(USAGE_STRING)

	193 parser.add_option('-b', '--bucket',

	194 help='Google Storage bucket to upload to.')

	195 parser.add_option('-e', '--boto', help='Specify a custom boto file.')

	196 parser.add_option('-f', '--force', action='store_true',

	197 help='Force upload even if remote file exists.')

	198 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,

	199 help='Path to the gsutil script.')

	200 parser.add_option('-m', '--use_md5', action='store_true',

	201 help='Generate MD5 files when scanning, and don\'t check '

	202 'the MD5 checksum if a .md5 file is found.')

	203 parser.add_option('-t', '--num_threads', default=1, type='int',

	204 help='Number of uploader threads to run.')

	205 parser.add_option('-s', '--skip_hashing', action='store_true',

	206 help='Skip hashing if .sha1 file exists.')

	207 parser.add_option('-0', '--use_null_terminator', action='store_true',

	208 help='Use \\0 instead of \\n when parsing '

	209 'the file list from stdin. This is useful if the input '

	210 'is coming from "find ... -print0".')

	211 (options, args) = parser.parse_args()

	212

	213 # Enumerate our inputs.

	214 input_filenames = get_targets(args, parser, options.use_null_terminator)

	215

	216 # Make sure we can find a working instance of gsutil.

	217 if os.path.exists(GSUTIL_DEFAULT_PATH):

	218 gsutil = Gsutil(GSUTIL_DEFAULT_PATH)

	219 else:

	220 print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %

	221 GSUTIL_DEFAULT_PATH)

	222 return 1

	223

	224 # Check we have a valid bucket with valid permissions.

	225 base_url, code = check_bucket_permissions(options.bucket, gsutil)

	226 if code:

	227 return code

	228

	229 return upload_to_google_storage(

	230 input_filenames, base_url, gsutil, options.force, options.use_md5,

	231 options.num_threads, options.skip_hashing)

	232

	233

	234 if __name__ == '__main__':

	235 sys.exit(main(sys.argv))

OLD	NEW

« download_from_google_storage.py ('K') | « tests/gstools_unittest.py ('k') | no next file » | no next file with comments »