Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(122)

Side by Side Diff: build/google_storage_tools/upload_to_google_storage.py

Issue 11664024: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Moved files into new folder Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « build/google_storage_tools/download_from_google_storage.py ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Script to upload files to Google Storage."""
7
8 import hashlib
9 import optparse
10 import os
11 import Queue
12 import re
13 import subprocess
14 import sys
15 import tempfile
16 import threading
17 import time
18 import zipfile
19
20 # TODO(hinoka): This is currently incorrect. Should find a better default.
21 GSUTIL_DEFAULT_PATH = os.path.join(os.path.dirname(os.path.normpath(__file__)),
22 '..', '..', 'third_party', 'gsutil', 'gsutil')
23
24 USAGE_STRING = """%prog [options] target [target2 ...].
25 Target is the file intended to be uploaded to Google Storage.
26 If target is "-", then a list of files will be taken from standard input
27
28 This script will generate a file (original filename).sha1 containing the
29 sha1 sum of the uploaded file.
30 It is recommended that the .sha1 file is checked into the repository,
31 the original file removed from the repository, and a hook added to the
32 DEPS file to call download_from_google_storage.py.
33
34 Example usages
35 --------------
36
37 Scan the current directory and upload all files larger than 1MB:
38 find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
39 """
40
41
42 class Gsutil():
43 def __init__(self, path, boto_path=None, timeout=None):
44 if not os.path.exists(path):
45 raise OSError('GSUtil not found in %s' % path)
46 self.path = path
47
48 self.timeout = timeout
49 self.boto_path = boto_path
50
51 def call(self, *args):
52 def _thread_main():
53 thr = threading.current_thread()
54 env = os.environ.copy()
55 if self.boto_path is not None:
56 env['AWS_CREDENTIAL_FILE'] = self.boto_path
57 thr.status = subprocess.call((sys.executable, self.path) + args, env=env)
58 thr = threading.Thread(target=_thread_main)
59 thr.start()
60 thr.join(self.timeout)
61 if thr.isAlive():
62 raise RuntimeError('%s %s timed out after %d seconds.' % (
63 self.path, ' '.join(args), self.timeout))
64 return thr.status
65
66 def check_call(self, *args):
67 def _thread_main():
68 thr = threading.current_thread()
69 env = os.environ.copy()
70 if self.boto_path is not None:
71 env['AWS_CREDENTIAL_FILE'] = self.boto_path
72 p = subprocess.Popen((sys.executable, self.path) + args,
73 stdout=subprocess.PIPE,
74 stderr=subprocess.PIPE,
75 env=env)
76 code = p.wait()
77 out, err = p.communicate()
78 thr.status = (code, out, err)
79
80 thr = threading.Thread(target=_thread_main)
81 thr.start()
82 thr.join(self.timeout)
83 if thr.isAlive():
84 raise RuntimeError('%s %s timed out after %d seconds.' % (
85 self.path, ' '.join(args), self.timeout))
86 code, out, err = thr.status
87 status_code_match = re.search('status=([0-9]+)', err)
88 if status_code_match:
89 return int(status_code_match.groups(1))
90 elif ('You are attempting to access protected data with '
91 'no configured credentials.' in err):
92 return (403, out, err)
93 elif 'No such object' in err:
94 return (404, out, err)
95 else:
96 return (code, out, err)
97
98 def clone(self):
99 return Gsutil(self.path, self.boto_path, self.timeout)
100
101
102 def GetSHA1(filename):
103 sha1 = hashlib.sha1()
104 with open(filename, 'rb') as f:
105 while True:
106 # Read in 1mb chunks, so it doesn't all have to be loaded into memory.
107 chunk = f.read(1024*1024)
108 if not chunk:
109 break
110 sha1.update(chunk)
111 return sha1.hexdigest()
112
113
114 def CheckSHA1(sha1_sum, filename):
115 return sha1_sum == GetSHA1(filename)
116
117
118 def GetMD5(filename, lock, use_md5):
119 # See if we can find an existing MD5 sum stored in a file.
120 if use_md5 and os.path.exists('%s.md5' % filename):
121 with open('%s.md5' % filename) as f:
122 md5_match = re.search('([a-z0-9]{32})', f.read())
123 if md5_match:
124 return md5_match.groups()[0]
125
126 # Calculate the MD5 checksum of the file.
127 md5_calculator = hashlib.md5()
128 with lock:
129 with open(filename, 'rb') as f:
130 while True:
131 chunk = f.read(1024*1024)
132 if not chunk:
133 break
134 md5_calculator.update(chunk)
135 local_md5 = md5_calculator.hexdigest()
136 if use_md5:
137 with open('%s.md5' % filename, 'w') as f:
138 f.write(local_md5)
139 return local_md5
140
141
142 def _upload_worker(thread_num, q, base_url, gsutil, options, md5_lock):
143 while True:
144 try:
145 filename, sha1_sum = q.get_nowait()
146 file_url = '%s/%s' % (base_url, sha1_sum)
147 if gsutil.check_call('ls', file_url)[0] == 0 and not options.force:
148 # File exists, check MD5 hash.
149 _, out, _ = gsutil.check_call('ls', '-L', file_url)
150 etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)
151 if etag_match:
152 remote_md5 = etag_match.groups()[0]
153 # Calculate the MD5 checksum to match it to Google Storage's ETag.
154 local_md5 = GetMD5(filename, md5_lock, options.use_md5)
155 if local_md5 == remote_md5:
156 print ('File %s already exists at %s and MD5 matches, exiting' %
157 (filename, file_url))
158 continue
159 print 'Uploading %s to %s' % (filename, file_url)
160 code = gsutil.call('cp', '-q', filename, file_url)
161 if code != 0:
162 print >>sys.stderr, gsutil.stderr
163 continue
164 except Queue.Empty:
165 return
166
167 def main(args):
168 parser = optparse.OptionParser(USAGE_STRING)
169 parser.add_option('-b', '--bucket', default='chrome-artifacts',
170 help='Google Storage bucket to upload to.')
171 parser.add_option('-e', '--boto', default=None,
172 help='Specify a custom boto file.')
173 parser.add_option('-f', '--force', action='store_true', default=False,
174 help='Force upload even if remote file exists.')
175 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
176 help='Path to the gsutil script.')
177 parser.add_option('-m', '--use_md5', action='store_true', default=False,
178 help='Generate MD5 files when scanning, and don\'t check '
179 'the MD5 checksum if a .md5 file is found.')
180 parser.add_option('-t', '--num_threads', default=1, type='int',
181 help='Number of uploader threads to run.')
182 parser.add_option('-s', '--skip_hashing', action='store_true', default=False,
183 help='Skip hashing if .sha1 file exists.')
184 parser.add_option('-0', '--use_null_terminator', action='store_true',
185 default=False, help='Use \\0 instead of \\n when parsing '
186 'the file list from stdin. This is useful if the input '
187 'is coming from "find ... -print0".')
188 (options, args) = parser.parse_args()
189
190 if len(args) < 1:
191 parser.error('Missing target.')
192 elif len(args) == 1 and args[0] == '-':
193 # Take stdin as a newline or null seperated list of files.
194 if options.use_null_terminator:
195 input_filenames = [line for line in sys.stdin.read().split('\0')]
196 else:
197 input_filenames = [line.strip() for line in sys.stdin.readlines()]
198 else:
199 input_filenames = args
200 base_url = 'gs://%s' % options.bucket
201
202 # Make sure we can find a working instance of gsutil.
203 if os.path.exists(options.gsutil_path):
204 gsutil = Gsutil(options.gsutil_path)
205 else:
206 for path in os.environ["PATH"].split(os.pathsep):
207 if os.path.exists(path) and 'gsutil' in os.listdir(path):
208 gsutil = Gsutil(os.path.join(path, 'gsutil'))
209
210 # Check if we have permissions to the Google Storage bucket.
211 code, ls_out, ls_err = gsutil.check_call('ls', base_url)
212 if code == 403:
213 code, _, _ = gsutil.call('config')
214 if code != 0:
215 print >>sys.stderr, 'Error while authenticating to %s, exiting' % base_url
216 return 403
217 elif code == 404:
218 print >>sys.stderr, '%s not found.' % base_url
219 return 404
220 elif code != 0:
221 print >>sys.stderr, ls_err
222 return code
223
224 # We want to hash everything in a single thread since its faster.
225 # The bottleneck is in disk IO, not CPU.
226 upload_queue = Queue.Queue()
227 hash_timer = time.time()
228 for filename in input_filenames:
229 if not os.path.exists(filename):
230 print 'Error: %s not found, skipping.' % filename
231 continue
232 if os.path.exists('%s.sha1' % filename) and options.skip_hashing:
233 print 'Found hash for %s, skipping.' % filename
234 upload_queue.put((filename, open('%s.sha1' % filename).read()))
235 continue
236 print 'Calculating hash for %s...' % filename,
237 sha1_sum = GetSHA1(filename)
238 with open(filename + '.sha1', 'w') as f:
239 f.write(sha1_sum)
240 print 'done'
241 upload_queue.put((filename, sha1_sum))
242 hash_time = time.time() - hash_timer
243
244 # Start up all the worker threads.
245 all_threads = []
246
247 # We only want one MD5 calculation happening at a time.
248 md5_lock = threading.Lock()
249 upload_timer = time.time()
250
251 for thread_num in range(options.num_threads):
252 t = threading.Thread(target=_upload_worker, args=[thread_num,
253 upload_queue, base_url, gsutil.clone(), options, md5_lock])
254 t.daemon = True
255 t.start()
256 all_threads.append(t)
257
258 # Wait for everything to finish.
259 for t in all_threads:
260 t.join()
261
262 print 'Success.'
263 print 'Hashing %s files took %1f seconds' % (len(input_filenames), hash_time)
264 print 'Uploading took %1f seconds' % (time.time() - upload_timer)
265 return 0
266
267 if __name__ == '__main__':
268 sys.exit(main(sys.argv))
OLDNEW
« no previous file with comments | « build/google_storage_tools/download_from_google_storage.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698