Chromium Code Reviews

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Added exception types, renamed variables. Created 7 years, 9 months ago.
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Uploads files to Google Storage content addressed."""
7
8 import hashlib
9 import optparse
10 import os
11 import Queue
12 import re
13 import sys
14 import threading
15 import time
16
17 from download_from_google_storage import check_bucket_permissions
18 from download_from_google_storage import get_sha1
19 from download_from_google_storage import Gsutil
20 from download_from_google_storage import printer_worker
21
22 GSUTIL_DEFAULT_PATH = os.path.join(
23 os.path.dirname(os.path.abspath(__file__)),
24 'third_party', 'gsutil', 'gsutil')
25
26 USAGE_STRING = """%prog [options] target [target2 ...].
27 Target is the file intended to be uploaded to Google Storage.
28 If target is "-", then the list of files is taken from standard input.
29
30 This script will generate a file (original filename).sha1 containing the
31 sha1 sum of the uploaded file.
32 It is recommended that the .sha1 file be checked into the repository,
33 the original file removed from the repository, and a hook added to the
34 DEPS file to call download_from_google_storage.py.
35
36 Example usages
37 --------------
38
39 Scan the current directory and upload all files larger than 1MB:
40 find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
41 """
42
43
44 def get_md5(filename):
45 md5_calculator = hashlib.md5()
46 with open(filename, 'rb') as f:
47 while True:
48 chunk = f.read(1024*1024)
49 if not chunk:
50 break
51 md5_calculator.update(chunk)
52 return md5_calculator.hexdigest()
53
54
55 def get_md5_cached(filename):
M-A Ruel 2013/03/07 19:41:22 BTW, I'm fine with using a lock here if you prefer
Ryan Tseng 2013/03/07 20:35:18 Either's okay. This actually looks cleaner.
56 """Don't calculate the MD5 if we can find a .md5 file."""
57 # See if we can find an existing MD5 sum stored in a file.
58 if os.path.exists('%s.md5' % filename):
59 with open('%s.md5' % filename) as f:
60 md5_match = re.search('([a-z0-9]{32})', f.read())
61 if md5_match:
62 return md5_match.group(1)
63 else:
64 md5_hash = get_md5(filename)
65 with open('%s.md5' % filename, 'w') as f:
66 f.write(md5_hash)
67 return md5_hash
68
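[Editorial note: the review thread on get_md5_cached above weighs the .md5 side file against plain locking. For comparison, a lock-only sketch that memoizes in memory instead of on disk; the _md5_cache names are hypothetical, not part of this change:

# Sketch of the lock-based alternative discussed above: serialize MD5
# computation and cache results in memory rather than in .md5 files.
_md5_cache = {}
_md5_cache_lock = threading.Lock()

def get_md5_locked(filename):
  with _md5_cache_lock:
    if filename not in _md5_cache:
      _md5_cache[filename] = get_md5(filename)
    return _md5_cache[filename]

Holding the lock across get_md5() also keeps hashing single-threaded, matching the disk-thrashing concern noted in upload_to_google_storage() below.]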
69
70 def _upload_worker(
71 thread_num, q, base_url, gsutil, md5_lock, force,
72 use_md5, stdout_queue, ret_codes):
73 while True:
74 filename, sha1_sum = q.get()
75 if not filename:
76 break
77 file_url = '%s/%s' % (base_url, sha1_sum)
78 if gsutil.check_call('ls', file_url)[0] == 0 and not force:
79 # File exists, check MD5 hash.
80 _, out, _ = gsutil.check_call('ls', '-L', file_url)
M-A Ruel 2013/03/07 19:41:22 If the error is 403, it will still try uploading?
Ryan Tseng 2013/03/07 20:35:18 Hm.. if it 403s, it'll still try to upload, and fail.
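[Editorial note: as the exchange above says, a failed 'ls' (e.g. a 403) falls through to an upload attempt that fails again with a less direct error. One possible early-out, reusing the (returncode, stdout, stderr) convention check_call already has in this file; treating '403' as a substring of stderr is an assumption about gsutil's error text:

# Hypothetical loop-body variant for _upload_worker: report permission
# errors from 'ls' instead of falling through to a doomed upload.
code, _, err = gsutil.check_call('ls', file_url)
if code != 0 and '403' in err:
  ret_codes.put((code, 'Permission denied on %s\n%s' % (file_url, err)))
  continue
if code == 0 and not force:
  # ... existing MD5 comparison ...
]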
81 etag_match = re.search(r'ETag:\s+([a-z0-9]{32})', out)
82 if etag_match:
83 remote_md5 = etag_match.group(1)
84 # Calculate the MD5 checksum to match it to Google Storage's ETag.
85 with md5_lock:
86 if use_md5:
87 local_md5 = get_md5_cached(filename)
88 else:
89 local_md5 = get_md5(filename)
90 if local_md5 == remote_md5:
91 stdout_queue.put(
92 'File %s already exists at %s and MD5 matches, skipping' %
93 (filename, file_url))
94 continue
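[Editorial note: the ETag comparison above works because Google Storage reports the MD5 digest of a non-composite object as its ETag in 'ls -L' output. A self-contained check of the regex against a mocked-up listing; the exact output formatting is an assumption:

import re

sample = ('gs://example-bucket/0123abcd:\n'
          '        Creation time:  Thu, 07 Mar 2013 19:41:22 GMT\n'
          '        ETag:           d41d8cd98f00b204e9800998ecf8427e\n')
match = re.search(r'ETag:\s+([a-z0-9]{32})', sample)
print match.group(1)  # d41d8cd98f00b204e9800998ecf8427e
]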
95 stdout_queue.put('Uploading %s to %s' % (filename, file_url))
96 code, _, err = gsutil.check_call('cp', '-q', filename, file_url)
97 if code != 0:
98 ret_codes.put(
99 (code,
100 'Encountered error while uploading %s to %s\n%s' %
101 (filename, file_url, err)))
102 continue
103
104
105 def get_targets(args, parser, use_null_terminator):
106 if not args:
107 parser.error('Missing target.')
108
109 if len(args) == 1 and args[0] == '-':
110 # Take stdin as a newline- or null-separated list of files.
111 if use_null_terminator:
112 return sys.stdin.read().split('\0')
113 else:
114 return sys.stdin.read().splitlines()
115 else:
116 return args
117
118
119 def upload_to_google_storage(
120 input_filenames, base_url, gsutil, force,
121 use_md5, num_threads, skip_hashing):
122 # We only want one MD5 calculation happening at a time to avoid HD thrashing.
123 md5_lock = threading.Lock()
124
125 # Start up all the worker threads.
126 all_threads = []
127 ret_codes = Queue.Queue()
128 ret_codes.put((0, None))
129 upload_queue = Queue.Queue()
130 upload_timer = time.time()
131 stdout_queue = Queue.Queue()
132 for thread_num in range(num_threads):
133 t = threading.Thread(
134 target=_upload_worker,
135 args=[thread_num, upload_queue, base_url, gsutil.clone(), md5_lock,
136 force, use_md5, stdout_queue, ret_codes])
137 t.daemon = True
138 t.start()
139 all_threads.append(t)
140
141 # We want to hash everything in a single thread since it's faster.
142 # The bottleneck is in disk IO, not CPU.
143 hashing_start = time.time()
144 for filename in input_filenames:
145 if not os.path.exists(filename):
146 print 'Error: %s not found, skipping.' % filename
147 continue
148 if os.path.exists('%s.sha1' % filename) and skip_hashing:
149 print 'Found hash for %s, skipping.' % filename
150 upload_queue.put((filename, open('%s.sha1' % filename).read()))
151 continue
152 print 'Calculating hash for %s...' % filename,
153 sha1_sum = get_sha1(filename)
154 with open(filename + '.sha1', 'wb') as f:
155 f.write(sha1_sum)
156 print 'done'
157 upload_queue.put((filename, sha1_sum))
158 hashing_duration = time.time() - hashing_start
159
160 # Wait for everything to finish.
161 for _ in all_threads:
162 upload_queue.put((None, None)) # To mark the end of the work queue.
163 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])
164 printer_thread.daemon = True
165 printer_thread.start()
166 for t in all_threads:
167 t.join()
168 stdout_queue.put(None)
169 printer_thread.join()
170
171 # Print timing information.
172 print 'Hashing %s files took %.1f seconds' % (
173 len(input_filenames), hashing_duration)
174 print 'Uploading took %.1f seconds' % (time.time() - upload_timer)
175
176 # See if we ran into any errors.
177 max_ret_code = 0
178 for ret_code, message in ret_codes.queue:
M-A Ruel 2013/03/07 19:41:22 That works?
Ryan Tseng 2013/03/07 20:35:18 ret_codes.queue returns an iterable queue. I believe it works.
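[Editorial note: the answer above checks out. In Python 2, Queue.Queue keeps its items in a collections.deque exposed as the undocumented but long-standing .queue attribute, and iterating it does not consume items the way get() does. It is only safe here because every worker thread has been joined first, so nothing mutates the deque during iteration. A minimal demonstration:

import Queue

ret_codes = Queue.Queue()
ret_codes.put((0, None))
ret_codes.put((1, 'upload failed'))
# Iterating .queue reads the underlying deque in place.
for code, message in ret_codes.queue:
  print code, message
assert ret_codes.qsize() == 2  # nothing was consumed
]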
179 max_ret_code = max(ret_code, max_ret_code)
180 if message:
181 print >> sys.stderr, message
182
183 if not max_ret_code:
M-A Ruel 2013/03/07 19:41:22 I don't think this block is useful, the error messages are already printed above.
Ryan Tseng 2013/03/07 20:35:18 Done: Removed the else clause, and made success sound happier.
184 print 'Success.'
185 else:
186 print 'We encountered some error(s).'
187
188 return max_ret_code
189
190
191 def main(args):
192 parser = optparse.OptionParser(USAGE_STRING)
193 parser.add_option('-b', '--bucket',
194 help='Google Storage bucket to upload to.')
195 parser.add_option('-e', '--boto', help='Specify a custom boto file.')
196 parser.add_option('-f', '--force', action='store_true',
197 help='Force upload even if remote file exists.')
198 parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
199 help='Path to the gsutil script.')
200 parser.add_option('-m', '--use_md5', action='store_true',
201 help='Generate MD5 files when scanning, and don\'t check '
202 'the MD5 checksum if a .md5 file is found.')
203 parser.add_option('-t', '--num_threads', default=1, type='int',
204 help='Number of uploader threads to run.')
205 parser.add_option('-s', '--skip_hashing', action='store_true',
206 help='Skip hashing if .sha1 file exists.')
207 parser.add_option('-0', '--use_null_terminator', action='store_true',
208 help='Use \\0 instead of \\n when parsing '
209 'the file list from stdin. This is useful if the input '
210 'is coming from "find ... -print0".')
211 (options, args) = parser.parse_args(args[1:])
212
213 # Enumerate our inputs.
214 input_filenames = get_targets(args, parser, options.use_null_terminator)
215
216 # Make sure we can find a working instance of gsutil.
217 if os.path.exists(options.gsutil_path):
218 gsutil = Gsutil(options.gsutil_path)
219 else:
220 print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
221 options.gsutil_path)
222 return 1
223
224 # Check we have a valid bucket with valid permissions.
225 base_url, code = check_bucket_permissions(options.bucket, gsutil)
226 if code:
227 return code
228
229 return upload_to_google_storage(
230 input_filenames, base_url, gsutil, options.force, options.use_md5,
231 options.num_threads, options.skip_hashing)
232
233
234 if __name__ == '__main__':
235 sys.exit(main(sys.argv))