build/google_storage_tools/upload_to_google_storage.py - Issue 11664024: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: build/google_storage_tools/upload_to_google_storage.py

Issue 11664024: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Moved files into new folder Created 7 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Script to upload files to Google Storage."""

	7

	8 import hashlib

	9 import optparse

	10 import os

	11 import Queue

	12 import re

	13 import subprocess

	14 import sys

	15 import tempfile

	16 import threading

	17 import time

	18 import zipfile

	19

	20 # TODO(hinoka): This is currently incorrect. Should find a better default.

	21 GSUTIL_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.normpath(__file__)),

	22 '..', '..', 'third_party', 'gsutil', 'gsutil')

	23

	24 USAGE_STRING = """%prog [options] target [target2 ...].

	25 Target is the file intended to be uploaded to Google Storage.

	26 If target is "-", then a list of files will be taken from standard input

	27

	28 This script will generate a file (original filename).sha1 containing the

	29 sha1 sum of the uploaded file.

	30 It is recommended that the .sha1 file is checked into the repository,

	31 the original file removed from the repository, and a hook added to the

	32 DEPS file to call download_from_google_storage.py.

	33

	34 Example usages

	35 --------------

	36

	37 Scan the current directory and upload all files larger than 1MB:

	38 find . -name .svn -prune -o -size +1000k -type f -print0 \| %prog -0 -

	39 """

	40

	41

	42 class Gsutil():

	43 def __init__(self, path, boto_path=None, timeout=None):

	44 if not os.path.exists(path):

	45 raise OSError('GSUtil not found in %s' % path)

	46 self.path = path

	47

	48 self.timeout = timeout

	49 self.boto_path = boto_path

	50

	51 def call(self, *args):

	52 def _thread_main():

	53 thr = threading.current_thread()

	54 env = os.environ.copy()

	55 if self.boto_path is not None:

	56 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	57 thr.status = subprocess.call((sys.executable, self.path) + args, env=env)

	58 thr = threading.Thread(target=_thread_main)

	59 thr.start()

	60 thr.join(self.timeout)

	61 if thr.isAlive():

	62 raise RuntimeError('%s %s timed out after %d seconds.' % (

	63 self.path, ' '.join(args), self.timeout))

	64 return thr.status

	65

	66 def check_call(self, *args):

	67 def _thread_main():

	68 thr = threading.current_thread()

	69 env = os.environ.copy()

	70 if self.boto_path is not None:

	71 env['AWS_CREDENTIAL_FILE'] = self.boto_path

	72 p = subprocess.Popen((sys.executable, self.path) + args,

	73 stdout=subprocess.PIPE,

	74 stderr=subprocess.PIPE,

	75 env=env)

	76 code = p.wait()

	77 out, err = p.communicate()

	78 thr.status = (code, out, err)

	79

	80 thr = threading.Thread(target=_thread_main)

	81 thr.start()

	82 thr.join(self.timeout)

	83 if thr.isAlive():

	84 raise RuntimeError('%s %s timed out after %d seconds.' % (

	85 self.path, ' '.join(args), self.timeout))

	86 code, out, err = thr.status

	87 status_code_match = re.search('status=([0-9]+)', err)

	88 if status_code_match:

	89 return int(status_code_match.groups(1))

	90 elif ('You are attempting to access protected data with '

	91 'no configured credentials.' in err):

	92 return (403, out, err)

	93 elif 'No such object' in err:

	94 return (404, out, err)

	95 else:

	96 return (code, out, err)

	97

	98 def clone(self):

	99 return Gsutil(self.path, self.boto_path, self.timeout)

	100

	101

	102 def GetSHA1(filename):

	103 sha1 = hashlib.sha1()

	104 with open(filename, 'rb') as f:

	105 while True:

	106 # Read in 1mb chunks, so it doesn't all have to be loaded into memory.

	107 chunk = f.read(1024*1024)

	108 if not chunk:

	109 break

	110 sha1.update(chunk)

	111 return sha1.hexdigest()

	112

	113

	114 def CheckSHA1(sha1_sum, filename):

	115 return sha1_sum == GetSHA1(filename)

	116

	117

	118 def GetMD5(filename, lock, use_md5):

	119 # See if we can find an existing MD5 sum stored in a file.

	120 if use_md5 and os.path.exists('%s.md5' % filename):

	121 with open('%s.md5' % filename) as f:

	122 md5_match = re.search('([a-z0-9]{32})', f.read())

	123 if md5_match:

	124 return md5_match.groups()[0]

	125

	126 # Calculate the MD5 checksum of the file.

	127 md5_calculator = hashlib.md5()

	128 with lock:

	129 with open(filename, 'rb') as f:

	130 while True:

	131 chunk = f.read(1024*1024)

	132 if not chunk:

	133 break

	134 md5_calculator.update(chunk)

	135 local_md5 = md5_calculator.hexdigest()

	136 if use_md5:

	137 with open('%s.md5' % filename, 'w') as f:

	138 f.write(local_md5)

	139 return local_md5

	140

	141

	142 def _upload_worker(thread_num, q, base_url, gsutil, options, md5_lock):

	143 while True:

	144 try:

	145 filename, sha1_sum = q.get_nowait()

	146 file_url = '%s/%s' % (base_url, sha1_sum)

	147 if gsutil.check_call('ls', file_url)[0] == 0 and not options.force:

	148 # File exists, check MD5 hash.

	149 _, out, _ = gsutil.check_call('ls', '-L', file_url)

	150 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)

	151 if etag_match:

	152 remote_md5 = etag_match.groups()[0]

	153 # Calculate the MD5 checksum to match it to Google Storage's ETag.

	154 local_md5 = GetMD5(filename, md5_lock, options.use_md5)

	155 if local_md5 == remote_md5:

	156 print ('File %s already exists at %s and MD5 matches, exiting' %

	157 (filename, file_url))

	158 continue

	159 print 'Uploading %s to %s' % (filename, file_url)

	160 code = gsutil.call('cp', '-q', filename, file_url)

	161 if code != 0:

	162 print >>sys.stderr, gsutil.stderr

	163 continue

	164 except Queue.Empty:

	165 return

	166

	167 def main(args):

	168 parser = optparse.OptionParser(USAGE_STRING)

	169 parser.add_option('-b', '--bucket', default='chrome-artifacts',

	170 help='Google Storage bucket to upload to.')

	171 parser.add_option('-e', '--boto', default=None,

	172 help='Specify a custom boto file.')

	173 parser.add_option('-f', '--force', action='store_true', default=False,

	174 help='Force upload even if remote file exists.')

	175 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,

	176 help='Path to the gsutil script.')

	177 parser.add_option('-m', '--use_md5', action='store_true', default=False,

	178 help='Generate MD5 files when scanning, and don\'t check '

	179 'the MD5 checksum if a .md5 file is found.')

	180 parser.add_option('-t', '--num_threads', default=1, type='int',

	181 help='Number of uploader threads to run.')

	182 parser.add_option('-s', '--skip_hashing', action='store_true', default=False,

	183 help='Skip hashing if .sha1 file exists.')

	184 parser.add_option('-0', '--use_null_terminator', action='store_true',

	185 default=False, help='Use \\0 instead of \\n when parsing '

	186 'the file list from stdin. This is useful if the input '

	187 'is coming from "find ... -print0".')

	188 (options, args) = parser.parse_args()

	189

	190 if len(args) < 1:

	191 parser.error('Missing target.')

	192 elif len(args) == 1 and args[0] == '-':

	193 # Take stdin as a newline or null seperated list of files.

	194 if options.use_null_terminator:

	195 input_filenames = [line for line in sys.stdin.read().split('\0')]

	196 else:

	197 input_filenames = [line.strip() for line in sys.stdin.readlines()]

	198 else:

	199 input_filenames = args

	200 base_url = 'gs://%s' % options.bucket

	201

	202 # Make sure we can find a working instance of gsutil.

	203 if os.path.exists(options.gsutil_path):

	204 gsutil = Gsutil(options.gsutil_path)

	205 else:

	206 for path in os.environ["PATH"].split(os.pathsep):

	207 if os.path.exists(path) and 'gsutil' in os.listdir(path):

	208 gsutil = Gsutil(os.path.join(path, 'gsutil'))

	209

	210 # Check if we have permissions to the Google Storage bucket.

	211 code, ls_out, ls_err = gsutil.check_call('ls', base_url)

	212 if code == 403:

	213 code, _, _ = gsutil.call('config')

	214 if code != 0:

	215 print >>sys.stderr, 'Error while authenticating to %s, exiting' % base_url

	216 return 403

	217 elif code == 404:

	218 print >>sys.stderr, '%s not found.' % base_url

	219 return 404

	220 elif code != 0:

	221 print >>sys.stderr, ls_err

	222 return code

	223

	224 # We want to hash everything in a single thread since its faster.

	225 # The bottleneck is in disk IO, not CPU.

	226 upload_queue = Queue.Queue()

	227 hash_timer = time.time()

	228 for filename in input_filenames:

	229 if not os.path.exists(filename):

	230 print 'Error: %s not found, skipping.' % filename

	231 continue

	232 if os.path.exists('%s.sha1' % filename) and options.skip_hashing:

	233 print 'Found hash for %s, skipping.' % filename

	234 upload_queue.put((filename, open('%s.sha1' % filename).read()))

	235 continue

	236 print 'Calculating hash for %s...' % filename,

	237 sha1_sum = GetSHA1(filename)

	238 with open(filename + '.sha1', 'w') as f:

	239 f.write(sha1_sum)

	240 print 'done'

	241 upload_queue.put((filename, sha1_sum))

	242 hash_time = time.time() - hash_timer

	243

	244 # Start up all the worker threads.

	245 all_threads = []

	246

	247 # We only want one MD5 calculation happening at a time.

	248 md5_lock = threading.Lock()

	249 upload_timer = time.time()

	250

	251 for thread_num in range(options.num_threads):

	252 t = threading.Thread(target=_upload_worker, args=[thread_num,

	253 upload_queue, base_url, gsutil.clone(), options, md5_lock])

	254 t.daemon = True

	255 t.start()

	256 all_threads.append(t)

	257

	258 # Wait for everything to finish.

	259 for t in all_threads:

	260 t.join()

	261

	262 print 'Success.'

	263 print 'Hashing %s files took %1f seconds' % (len(input_filenames), hash_time)

	264 print 'Uploading took %1f seconds' % (time.time() - upload_timer)

	265 return 0

	266

	267 if __name__ == '__main__':

	268 sys.exit(main(sys.argv))

OLD	NEW

« no previous file with comments | « build/google_storage_tools/download_from_google_storage.py ('k') | no next file » | no next file with comments »