Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(457)

Side by Side Diff: upload_to_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Review fixes Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« tests/gstools_unittest.py ('K') | « tests/gstools_unittest.py ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Uploads files to Google Storage content addressed."""
7
8 import hashlib
9 import optparse
10 import os
11 import Queue
12 import re
13 import sys
14 import threading
15 import time
16
17 from download_from_google_storage import check_bucket_permissions
18 from download_from_google_storage import get_sha1
19 from download_from_google_storage import Gsutil
20 from download_from_google_storage import printer_worker
21
# Location of the gsutil tool bundled with depot_tools, relative to this
# script: <depot_tools>/third_party/gsutil/gsutil.
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')

# optparse usage text; '%prog' is replaced with the program name.
USAGE_STRING = """%prog [options] target [target2 ...].
Target is the file intended to be uploaded to Google Storage.
If target is "-", then a list of files will be taken from standard input

This script will generate a file (original filename).sha1 containing the
sha1 sum of the uploaded file.
It is recommended that the .sha1 file is checked into the repository,
the original file removed from the repository, and a hook added to the
DEPS file to call download_from_google_storage.py.

Example usages
--------------

Scan the current directory and upload all files larger than 1MB:
find . -name .svn -prune -o -size +1000k -type f -print0 | %prog -0 -
"""
42
43
def get_md5(filename):
  """Return the hex MD5 digest of the given file, read in 1MB chunks."""
  digest = hashlib.md5()
  with open(filename, 'rb') as f:
    # Stream the file through the hash so arbitrarily large files fit
    # in constant memory.
    chunk = f.read(1024 * 1024)
    while chunk:
      digest.update(chunk)
      chunk = f.read(1024 * 1024)
  return digest.hexdigest()
53
54
def get_md5_cached(filename):
  """Return the MD5 of a file, using a <filename>.md5 sidecar as a cache.

  If <filename>.md5 exists and contains a well-formed 32-character hex
  digest, that digest is returned without rehashing the file.  Otherwise
  the MD5 is computed, written to <filename>.md5, and returned.
  """
  md5_filename = '%s.md5' % filename
  if os.path.exists(md5_filename):
    # The sidecar file holds an ASCII hex digest, so read it as text.
    with open(md5_filename) as f:
      md5_match = re.search('([a-z0-9]{32})', f.read())
    if md5_match:
      return md5_match.group(1)
  # Cache file missing OR corrupt: compute the hash and (re)write the
  # sidecar.  (Previously a corrupt .md5 file made this function return
  # None, which could never match a real remote MD5.)
  md5_hash = get_md5(filename)
  with open(md5_filename, 'w') as f:
    f.write(md5_hash)
  return md5_hash
68
69
def _upload_worker(
    thread_num, upload_queue, base_url, gsutil, md5_lock, force,
    use_md5, stdout_queue, ret_codes):
  """Consumes (filename, sha1) pairs from upload_queue and pushes each file
  to <base_url>/<sha1> in Google Storage.

  An already-present object whose ETag (MD5) matches the local file is
  skipped unless `force` is set.  A (None, None) pair terminates the
  worker.  Progress messages go to stdout_queue; (code, message) failures
  go to ret_codes.
  """
  while True:
    target, sha1_sum = upload_queue.get()
    if not target:
      break  # Sentinel from the producer: no more work.
    file_url = '%s/%s' % (base_url, sha1_sum)
    remote_exists = gsutil.check_call('ls', file_url)[0] == 0
    if remote_exists and not force:
      # The object is already there; compare MD5s before re-uploading.
      _, listing, _ = gsutil.check_call('ls', '-L', file_url)
      etag_match = re.search('ETag:\s+([a-z0-9]{32})', listing)
      if etag_match:
        # Serialize hashing so only one thread hits the disk at a time.
        with md5_lock:
          hasher = get_md5_cached if use_md5 else get_md5
          local_md5 = hasher(target)
        if local_md5 == etag_match.group(1):
          stdout_queue.put(
              '%d> File %s already exists at %s and MD5 matches, exiting' %
              (thread_num, target, file_url))
          continue
    stdout_queue.put('%d> Uploading %s to %s' % (
        thread_num, target, file_url))
    code, _, err = gsutil.check_call('cp', '-q', target, file_url)
    if code != 0:
      ret_codes.put(
          (code,
           'Encountered error on uploading %s to %s\n%s' %
           (target, file_url, err)))
      continue
104
105
def get_targets(args, parser, use_null_terminator):
  """Resolve positional arguments into the list of files to upload.

  A lone '-' argument means the file list comes from stdin, split on NUL
  bytes (when use_null_terminator is set) or on newlines otherwise.
  """
  if not args:
    parser.error('Missing target.')
  if args != ['-']:
    return args
  # Take stdin as a newline- or null-separated list of files.
  raw = sys.stdin.read()
  return raw.split('\0') if use_null_terminator else raw.splitlines()
118
119
120 def upload_to_google_storage(
121 input_filenames, base_url, gsutil, force,
122 use_md5, num_threads, skip_hashing):
123 # We only want one MD5 calculation happening at a time to avoid HD thrashing.
124 md5_lock = threading.Lock()
125
126 # Start up all the worker threads plus the printer thread.
127 all_threads = []
128 ret_codes = Queue.Queue()
129 ret_codes.put((0, None))
130 upload_queue = Queue.Queue()
131 upload_timer = time.time()
132 stdout_queue = Queue.Queue()
133 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])
134 printer_thread.daemon = True
135 printer_thread.start()
136 for thread_num in range(num_threads):
137 t = threading.Thread(
138 target=_upload_worker,
139 args=[thread_num, upload_queue, base_url, gsutil.clone(), md5_lock,
140 force, use_md5, stdout_queue, ret_codes])
141 t.daemon = True
142 t.start()
143 all_threads.append(t)
144
145 # We want to hash everything in a single thread since its faster.
146 # The bottleneck is in disk IO, not CPU.
147 hashing_start = time.time()
148 for filename in input_filenames:
149 if not os.path.exists(filename):
150 print 'Error: %s not found, skipping.' % filename
151 continue
152 if os.path.exists('%s.sha1' % filename) and skip_hashing:
153 print 'Found hash for %s, skipping.' % filename
154 with open(filename + '.sha1', 'rb') as f:
155 sha1_file = f.read(1024)
156 if not re.match('^([a-z0-9]{40})$', sha1_file):
157 print >> sys.stderr, 'Invalid sha1 hash file %s.sha1' % filename
158 return 1
159 upload_queue.put((filename, sha1_file))
160 continue
161 print 'Calculating hash for %s...' % filename,
162 sha1_sum = get_sha1(filename)
163 with open(filename + '.sha1', 'wb') as f:
164 f.write(sha1_sum)
165 print 'done'
166 upload_queue.put((filename, sha1_sum))
167 hashing_duration = time.time() - hashing_start
168
169 # Wait for everything to finish.
170 for _ in all_threads:
171 upload_queue.put((None, None)) # To mark the end of the work queue.
172 for t in all_threads:
173 t.join()
174 stdout_queue.put(None)
175 printer_thread.join()
176
177 # Print timing information.
178 print 'Hashing %s files took %1f seconds' % (
179 len(input_filenames), hashing_duration)
180 print 'Uploading took %1f seconds' % (time.time() - upload_timer)
181
182 # See if we ran into any errors.
183 max_ret_code = 0
184 for ret_code, message in ret_codes.queue:
185 max_ret_code = max(ret_code, max_ret_code)
186 if message:
187 print >> sys.stderr, message
188
189 if not max_ret_code:
190 print 'Success!'
191
192 return max_ret_code
193
194
def main(args):
  """Command-line entry point.  `args` is a full argv (program name first)."""
  parser = optparse.OptionParser(USAGE_STRING)
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to upload to.')
  parser.add_option('-e', '--boto', help='Specify a custom boto file.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force upload even if remote file exists.')
  parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
                    help='Path to the gsutil script.')
  parser.add_option('-m', '--use_md5', action='store_true',
                    help='Generate MD5 files when scanning, and don\'t check '
                    'the MD5 checksum if a .md5 file is found.')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of uploader threads to run.')
  parser.add_option('-s', '--skip_hashing', action='store_true',
                    help='Skip hashing if .sha1 file exists.')
  parser.add_option('-0', '--use_null_terminator', action='store_true',
                    help='Use \\0 instead of \\n when parsing '
                    'the file list from stdin. This is useful if the input '
                    'is coming from "find ... -print0".')
  # Parse the argv we were handed (minus the program name) instead of
  # implicitly re-reading sys.argv, so the `args` parameter is honored.
  (options, args) = parser.parse_args(args[1:])

  # Enumerate our inputs.
  input_filenames = get_targets(args, parser, options.use_null_terminator)

  # Make sure we can find a working instance of gsutil.  Honor the
  # -g/--gsutil_path flag (previously GSUTIL_DEFAULT_PATH was always used,
  # silently ignoring the flag); its default is still GSUTIL_DEFAULT_PATH.
  if os.path.exists(options.gsutil_path):
    gsutil = Gsutil(options.gsutil_path)
  else:
    print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
                          options.gsutil_path)
    return 1

  # Check we have a valid bucket with valid permissions.
  base_url, code = check_bucket_permissions(options.bucket, gsutil)
  if code:
    return code

  return upload_to_google_storage(
      input_filenames, base_url, gsutil, options.force, options.use_md5,
      options.num_threads, options.skip_hashing)


if __name__ == '__main__':
  sys.exit(main(sys.argv))
OLDNEW
« tests/gstools_unittest.py ('K') | « tests/gstools_unittest.py ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698