Chromium Code Reviews

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Review changes, fixed race condition Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments.
Jump to:
View unified diff |
« download_from_google_storage.py ('K') | « tests/gstools_unittest.py ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Uploads files to Google Storage content addressed."""
7
8 import hashlib
9 import optparse
10 import os
11 import Queue
12 import re
13 import sys
14 import threading
15 import time
16
17 from download_from_google_storage import CheckBucketPermissions
18 from download_from_google_storage import GetSHA1
19 from download_from_google_storage import Gsutil
20
# Absolute path to the gsutil binary bundled with depot_tools; used as the
# default value of the --gsutil_path flag.
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')

# optparse usage text; optparse substitutes the program name for %prog.
USAGE_STRING = """%prog [options] target [target2 ...].
Target is the file intended to be uploaded to Google Storage.
If target is "-", then a list of files will be taken from standard input

This script will generate a file (original filename).sha1 containing the
sha1 sum of the uploaded file.
It is recommended that the .sha1 file is checked into the repository,
the original file removed from the repository, and a hook added to the
DEPS file to call download_from_google_storage.py.

Example usages
--------------

Scan the current directory and upload all files larger than 1MB:
find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
"""
41
42
def GetMD5(filename, lock=None):
  """Calculate the MD5 hex digest of a file's contents.

  Args:
    filename: Path of the file to hash.
    lock: Optional threading.Lock held for the duration of the read, so
        callers can serialize disk access across threads (avoids HD
        thrashing). May be None, in which case no locking is done.

  Returns:
    The 32-character lowercase hexadecimal MD5 digest of the file.
  """
  md5_calculator = hashlib.md5()
  if lock:
    lock.acquire()
  try:
    # Read in 1MB chunks so arbitrarily large files don't need to fit
    # in memory all at once.
    with open(filename, 'rb') as f:
      while True:
        chunk = f.read(1024*1024)
        if not chunk:
          break
        md5_calculator.update(chunk)
  finally:
    if lock:
      lock.release()
  return md5_calculator.hexdigest()
53
54
def GetMD5Cached(filename, lock=None):
  """Don't calculate the MD5 if we can find a .md5 file.

  Looks for an existing '<filename>.md5' cache file first; if it holds a
  well-formed MD5 digest, that digest is returned. Otherwise (no cache
  file, or the cache content is corrupt) the hash is computed via GetMD5
  and written back to the cache file.

  Args:
    filename: Path of the file whose MD5 is wanted.
    lock: Optional lock forwarded to GetMD5 to serialize disk reads.

  Returns:
    The 32-character lowercase hexadecimal MD5 digest of the file.
  """
  md5_filename = '%s.md5' % filename
  # See if we can find an existing MD5 sum stored in a file.
  if os.path.exists(md5_filename):
    with open(md5_filename) as f:
      md5_match = re.search('([a-z0-9]{32})', f.read())
    if md5_match:
      return md5_match.group(1)
  # Cache missing or corrupt: compute the hash and (re)write the cache.
  # (The original code implicitly returned None when the cache file
  # existed but didn't contain a valid digest.)
  md5_hash = GetMD5(filename, lock)
  with open(md5_filename, 'w') as f:
    f.write(md5_hash)
  return md5_hash
69
def _upload_worker(
    thread_num, q, base_url, gsutil, md5_lock, force, use_md5, ret_codes):
  """Worker thread body: upload queued files to Google Storage.

  Args:
    thread_num: Index of this worker thread (identification only).
    q: Queue of (filename, sha1_sum) tuples to upload; a (None, None)
        entry is the sentinel telling this worker to exit.
    base_url: Bucket URL; each file is stored as <base_url>/<sha1_sum>.
    gsutil: Per-thread Gsutil instance used to run gsutil commands.
    md5_lock: Lock serializing local MD5 computation across threads.
    force: If True, upload even when the remote object already exists.
    use_md5: If True, consult/create local .md5 cache files.
    ret_codes: Queue collecting (code, message) tuples for failed uploads.
  """
  while True:
    filename, sha1_sum = q.get()
    if not filename:
      # (None, None) sentinel: no more work for this thread.
      break
    file_url = '%s/%s' % (base_url, sha1_sum)
    if gsutil.check_call('ls', file_url)[0] == 0 and not force:
      # File exists, check MD5 hash.
      _, out, _ = gsutil.check_call('ls', '-L', file_url)
      # gsutil 'ls -L' prints the object's ETag, which for non-composite
      # objects is its MD5 digest.
      etag_match = re.search('ETag:\s+([a-z0-9]{32})', out)
      if etag_match:
        remote_md5 = etag_match.group(1)
        # Calculate the MD5 checksum to match it to Google Storage's ETag.
        if use_md5:
          local_md5 = GetMD5Cached(filename, md5_lock)
        else:
          local_md5 = GetMD5(filename, md5_lock)
        if local_md5 == remote_md5:
          # NOTE(review): message says 'exiting' but only this file is
          # skipped; the worker continues with the next queue item.
          print ('File %s already exists at %s and MD5 matches, exiting' %
                 (filename, file_url))
          continue
    print 'Uploading %s to %s' % (filename, file_url)
    code, _, err = gsutil.check_call('cp', '-q', filename, file_url)
    if code != 0:
      # Report the failure; the main thread aggregates these after join().
      ret_codes.put(
          (code,
           'Encountered error on uploading %s to %s\n%s' %
           (filename, file_url, err)))
      continue
100
101
def get_targets(args, parser, use_null_terminator):
  """Resolve the positional arguments into the list of files to upload.

  A single '-' argument means the file list is read from standard input,
  split on NUL bytes when use_null_terminator is set (for use with
  'find ... -print0'), otherwise on newlines.
  """
  if not args:
    parser.error('Missing target.')

  if args != ['-']:
    return args

  # Take stdin as a newline or null separated list of files.
  raw_file_list = sys.stdin.read()
  if use_null_terminator:
    return raw_file_list.split('\0')
  return raw_file_list.splitlines()
114
115
116 def upload_to_google_storage(
117 input_filenames, base_url, gsutil, force,
118 use_md5, num_threads, skip_hashing):
119 # We only want one MD5 calculation happening at a time to avoid HD thrashing.
120 md5_lock = threading.Lock()
121
122 # Start up all the worker threads.
123 all_threads = []
124 ret_codes = Queue.Queue()
125 ret_codes.put((0, None))
126 upload_queue = Queue.Queue()
127 upload_timer = time.time()
128 for thread_num in range(num_threads):
129 t = threading.Thread(
130 target=_upload_worker,
131 args=[thread_num, upload_queue, base_url,
132 gsutil.clone(), md5_lock, force, use_md5, ret_codes])
133 t.daemon = True
134 t.start()
135 all_threads.append(t)
136
137 # We want to hash everything in a single thread since its faster.
138 # The bottleneck is in disk IO, not CPU.
139 hash_timer = time.time() # For timing statistics.
140 for filename in input_filenames:
141 if not os.path.exists(filename):
142 print 'Error: %s not found, skipping.' % filename
143 continue
144 if os.path.exists('%s.sha1' % filename) and skip_hashing:
145 print 'Found hash for %s, skipping.' % filename
146 upload_queue.put((filename, open('%s.sha1' % filename).read()))
147 continue
148 print 'Calculating hash for %s...' % filename,
149 sha1_sum = GetSHA1(filename)
150 with open(filename + '.sha1', 'wb') as f:
151 f.write(sha1_sum)
152 print 'done'
153 upload_queue.put((filename, sha1_sum))
154 hash_time = time.time() - hash_timer
155
156 # Wait for everything to finish.
157 for _ in all_threads:
158 upload_queue.put((None, None)) # To mark the end of the work queue.
159 for t in all_threads:
160 t.join()
161
162 # Print timing information.
163 print 'Hashing %s files took %1f seconds' % (len(input_filenames), hash_time)
164 print 'Uploading took %1f seconds' % (time.time() - upload_timer)
165
166 # See if we ran into any errors.
167 max_ret_code = 0
168 for ret_code, message in ret_codes.queue:
169 max_ret_code = max(ret_code, max_ret_code)
170 if message:
171 print >> sys.stderr, message
172
173 if not max_ret_code:
174 print 'Success.'
175 else:
176 print 'We encountered some error(s).'
177
178 return max_ret_code
179
180
def main(args):
  """Parse the command line and upload the requested files.

  Args:
    args: Full argv, including the program name in args[0]
        (as passed by the __main__ guard: main(sys.argv)).

  Returns:
    Process exit code: 0 on success, non-zero on failure.
  """
  parser = optparse.OptionParser(USAGE_STRING)
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to upload to.')
  parser.add_option('-e', '--boto', help='Specify a custom boto file.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force upload even if remote file exists.')
  parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
                    help='Path to the gsutil script.')
  parser.add_option('-m', '--use_md5', action='store_true',
                    help='Generate MD5 files when scanning, and don\'t check '
                    'the MD5 checksum if a .md5 file is found.')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of uploader threads to run.')
  parser.add_option('-s', '--skip_hashing', action='store_true',
                    help='Skip hashing if .sha1 file exists.')
  parser.add_option('-0', '--use_null_terminator', action='store_true',
                    help='Use \\0 instead of \\n when parsing '
                    'the file list from stdin. This is useful if the input '
                    'is coming from "find ... -print0".')
  # Parse the argv we were handed (minus the program name) instead of
  # implicitly re-reading sys.argv -- the original ignored 'args' entirely.
  (options, args) = parser.parse_args(args[1:])

  # Enumerate our inputs.
  input_filenames = get_targets(args, parser, options.use_null_terminator)

  # Make sure we can find a working instance of gsutil.
  # Honor --gsutil_path; the original hard-coded GSUTIL_DEFAULT_PATH here,
  # making the flag a no-op.
  if os.path.exists(options.gsutil_path):
    gsutil = Gsutil(options.gsutil_path)
  else:
    print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
                          options.gsutil_path)
    return 1

  # Check we have a valid bucket with valid permissions.
  base_url, code = CheckBucketPermissions(options.bucket, gsutil)
  if code:
    return code

  return upload_to_google_storage(
      input_filenames, base_url, gsutil, options.force, options.use_md5,
      options.num_threads, options.skip_hashing)
222
223
if __name__ == '__main__':
  # Script entry point: propagate main()'s return code as the exit status.
  sys.exit(main(sys.argv))
OLDNEW
« download_from_google_storage.py ('K') | « tests/gstools_unittest.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine