Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(72)

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Added exception types, renamed variables Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Download files from Google Storage based on SHA1 sums."""
7
8
9 import hashlib
10 import optparse
11 import os
12 import Queue
13 import re
14 import sys
15 import threading
16 import time
17
18 import subprocess2
19
20
# Location of the bundled gsutil in a depot_tools checkout, relative to
# this script's own directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
GSUTIL_DEFAULT_PATH = os.path.join(_SCRIPT_DIR, 'third_party', 'gsutil', 'gsutil')
24
25
class FileNotFoundError(IOError):
  """Raised when a required file (gsutil itself, or a .sha1 input) is missing."""
28
29
class InvalidFileError(IOError):
  """Raised when an input file does not contain a valid sha1 sum."""
32
33
34 # Common utilities
class Gsutil(object):
  """Call gsutil with some predefined settings."""

  def __init__(self, path, boto_path=None, timeout=None):
    """
    Args:
      path: Path to the gsutil script; must exist.
      boto_path: Optional .boto credential file, handed to gsutil via the
          AWS_CREDENTIAL_FILE environment variable.
      timeout: Optional per-invocation timeout, forwarded to subprocess2.

    Raises:
      FileNotFoundError: if |path| does not exist.
    """
    if not os.path.exists(path):
      raise FileNotFoundError('GSUtil not found in %s' % path)
    self.path = path
    self.timeout = timeout
    self.boto_path = boto_path

  def _environ(self):
    # Build the child environment once; both call() and check_call() need
    # the same boto credential override.
    env = os.environ.copy()
    if self.boto_path is not None:
      env['AWS_CREDENTIAL_FILE'] = self.boto_path
    return env

  def call(self, *args):
    """Run gsutil with |args| without capturing output; returns its result."""
    return subprocess2.call((sys.executable, self.path) + args,
                            env=self._environ(),
                            timeout=self.timeout)

  def check_call(self, *args):
    """Run gsutil with |args|, capturing stdout/stderr.

    Returns:
      A (code, out, err) tuple.  An HTTP-style status parsed out of
      gsutil's stderr (e.g. 403, 404) takes precedence over the process
      exit code.
    """
    ((out, err), code) = subprocess2.communicate(
        (sys.executable, self.path) + args,
        stdout=subprocess2.PIPE,
        stderr=subprocess2.PIPE,
        env=self._environ(),
        timeout=self.timeout)

    # Parse output.  gsutil sometimes embeds 'status=NNN' in stderr.
    status_code_match = re.search('status=([0-9]+)', err)
    if status_code_match:
      # Bug fix: the original returned a bare int here (and used
      # .groups(1), which yields a tuple rather than the matched text),
      # while every caller unpacks a (code, out, err) 3-tuple.
      return (int(status_code_match.group(1)), out, err)
    elif ('You are attempting to access protected data with '
          'no configured credentials.' in err):
      return (403, out, err)
    elif 'No such object' in err:
      return (404, out, err)
    else:
      return (code, out, err)

  def clone(self):
    """Return a fresh Gsutil with the same path/boto/timeout settings."""
    return Gsutil(self.path, self.boto_path, self.timeout)
77
78
def check_bucket_permissions(bucket, gsutil):
  """Verify that |bucket| exists and is readable through |gsutil|.

  Args:
    bucket: Google Storage bucket name, without the gs:// prefix.
    gsutil: A Gsutil instance used to probe the bucket.

  Returns:
    A (base_url, code) tuple: base_url is 'gs://<bucket>' (None when no
    bucket was supplied) and code is 0 on success, non-zero otherwise.
  """
  if not bucket:
    # Bug fix: the original printed the literal string 'Missing bucket
    # %s.' -- the %s had no format argument.
    sys.stderr.write('Missing bucket.\n')
    return (None, 1)
  base_url = 'gs://%s' % bucket

  code, _, ls_err = gsutil.check_call('ls', base_url)
  if code == 403:
    # Access denied: give the user a chance to configure credentials.
    code, _, _ = gsutil.call('config')
    if code != 0:
      sys.stderr.write('Error while authenticating to %s.\n' % base_url)
  elif code == 404:
    sys.stderr.write('%s not found.\n' % base_url)
  elif code != 0:
    sys.stderr.write('%s\n' % ls_err)
  return (base_url, code)
95
96
def get_sha1(filename):
  """Return the hex SHA1 digest of the file at |filename|."""
  digest = hashlib.sha1()
  with open(filename, 'rb') as f:
    # Hash in 1MB chunks so large files never have to fit in memory.
    chunk = f.read(1024 * 1024)
    while chunk:
      digest.update(chunk)
      chunk = f.read(1024 * 1024)
  return digest.hexdigest()
107
108
109 # Download-specific code starts here
110
def enumerate_work_queue(input_filename, work_queue, directory,
                         recursive, ignore_errors, output, sha1_file):
  """Fill |work_queue| with (sha1, output_filename) download tasks.

  Args:
    input_filename: A bare sha1 sum, a .sha1 file, or a directory,
        depending on the |sha1_file| / |directory| flags.
    work_queue: Queue that receives (sha1, output_filename) tuples.
    directory: Treat input_filename as a directory to scan.
    recursive: When scanning a directory, descend into subdirectories
        (always skipping .svn and .git).
    ignore_errors: Print to stderr instead of raising on missing or
        invalid .sha1 files.
    output: Output filename for the bare-sha1 case.
    sha1_file: Treat input_filename as a .sha1 file.

  Returns:
    The number of tasks placed on |work_queue|.

  Raises:
    FileNotFoundError: the .sha1 file is missing and not ignore_errors.
    InvalidFileError: no sha1 sum found and not ignore_errors.
  """
  if sha1_file:
    if not os.path.exists(input_filename):
      if not ignore_errors:
        raise FileNotFoundError('%s not found.' % input_filename)
      sys.stderr.write('%s not found.\n' % input_filename)
      # Bug fix: the original fell through and crashed opening the
      # missing file even when ignore_errors was set.
      return 0
    with open(input_filename, 'r') as f:
      sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
    if sha1_match:
      work_queue.put(
          (sha1_match.group(1), input_filename.replace('.sha1', '')))
      return 1
    if not ignore_errors:
      raise InvalidFileError('No sha1 sum found in %s.' % input_filename)
    sys.stderr.write('No sha1 sum found in %s.\n' % input_filename)
    return 0

  if not directory:
    # Bare sha1 sum: a single download to |output|.
    work_queue.put((input_filename, output))
    return 1

  work_queue_size = 0
  for root, dirs, files in os.walk(input_filename):
    if not recursive:
      # Clearing dirs in place stops os.walk from descending.
      del dirs[:]
    else:
      for exclude in ('.svn', '.git'):
        if exclude in dirs:
          dirs.remove(exclude)
    for filename in files:
      full_path = os.path.join(root, filename)
      if not full_path.endswith('.sha1'):
        continue
      with open(full_path, 'r') as f:
        sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
      if sha1_match:
        work_queue.put(
            (sha1_match.group(1), full_path.replace('.sha1', '')))
        work_queue_size += 1
      else:
        if not ignore_errors:
          raise InvalidFileError('No sha1 sum found in %s.' % filename)
        sys.stderr.write('No sha1 sum found in %s.\n' % filename)
  return work_queue_size
156
157
158 def _downloader_worker_thread(thread_num, q, force, base_url, gsutil, out_q):
159 while True:
160 input_sha1_sum, output_filename = q.get()
161 if input_sha1_sum is None:
162 out_q.put('Thread %d is done' % thread_num)
M-A Ruel 2013/03/07 19:41:22 I'd prefer you to prefix all the messages with '%d
Ryan Tseng 2013/03/07 20:35:18 Removed then
163 return
164 if os.path.exists(output_filename) and not force:
165 if get_sha1(output_filename) == input_sha1_sum:
166 out_q.put(
167 'File %s exists and SHA1 sum (%s) matches. Skipping.' % (
M-A Ruel 2013/03/07 19:41:22 I don't think it's useful to print the hash, it's
Ryan Tseng 2013/03/07 20:35:18 Done, an error code is queued in the return code q
168 output_filename , input_sha1_sum))
169 continue
170 # Check if file exists.
171 file_url = '%s/%s' % (base_url, input_sha1_sum)
172 if gsutil.check_call('ls', file_url)[0] != 0:
173 out_q.put('File %s for %s does not exist, skipping.' % (
174 file_url, output_filename))
175 continue
176 # Fetch the file.
177 out_q.put('Downloading %s to %s...' % (file_url, output_filename))
178 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
M-A Ruel 2013/03/07 19:41:22 Will it fail if the file was already present?
Ryan Tseng 2013/03/07 20:35:18 It won't fail, "gsutil cp" will just overwrite the
M-A Ruel 2013/03/07 22:26:56 Perfect, I just wanted to make sure you asserted t
179 if code != 0:
180 out_q.put(err)
181 return code
182
183
184 def printer_worker(output_queue):
185 while True:
186 line = output_queue.get()
187 # Its pausible we want to print empty lines.
M-A Ruel 2013/03/07 19:41:22 plausible
Ryan Tseng 2013/03/07 20:35:18 Done.
188 if line is None:
189 break
190 print line
191
192
193 def download_from_google_storage(
194 input_filename, base_url, gsutil, num_threads, directory, recursive,
195 force, output, ignore_errors, sha1_file):
196 # Start up all the worker threads.
197 all_threads = []
198 download_timer = time.time()
199 stdout_queue = Queue.Queue()
200 work_queue = Queue.Queue()
201 for thread_num in range(num_threads):
202 t = threading.Thread(
203 target=_downloader_worker_thread,
204 args=[thread_num, work_queue, force, base_url,
205 gsutil.clone(), stdout_queue])
206 t.daemon = True
207 t.start()
208 all_threads.append(t)
209
210 # Enumerate our work queue.
211 work_queue_size = enumerate_work_queue(
212 input_filename, work_queue, directory, recursive,
213 ignore_errors, output, sha1_file)
214 for _ in all_threads:
215 work_queue.put((None, None)) # Used to tell worker threads to stop.
216
217
M-A Ruel 2013/03/07 19:41:22 3 lines -> 1 line
Ryan Tseng 2013/03/07 20:35:18 Done.
218
219 # Wait for all downloads to finish.
M-A Ruel 2013/03/07 19:41:22 You should start this thread because starting to e
Ryan Tseng 2013/03/07 20:35:18 Done.
220 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])
221 printer_thread.daemon = True
222 printer_thread.start()
223 for t in all_threads:
224 t.join()
225 stdout_queue.put(None)
226 printer_thread.join()
227
228 print 'Success.'
229 print 'Downloading %d files took %1f second(s)' % (
230 work_queue_size, time.time() - download_timer)
231 return 0
232
233
def main(args):
  """Parse flags, validate them, and kick off the downloads.

  Returns:
    0 on success, a non-zero error code otherwise.
  """
  usage = ('usage: %prog [options] target\nTarget must be:\n'
           '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '
           '.sha1 file, containing a sha1 sum on the first line. (-d or '
           '--directory) A directory to scan for .sha1 files. ')
  parser = optparse.OptionParser(usage)
  parser.add_option('-o', '--output',
                    help='Specify the output file name. Defaults to:\n'
                         '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'
                         '(b) Given a .sha1 file or directory, the name will '
                         'match (.*).sha1.')
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to fetch from.')
  parser.add_option('-e', '--boto',
                    help='Specify a custom boto file.')
  # Bug fix: the old help text ('Resume download if file is partially
  # downloaded.') described the opposite of what --no_resume means.
  parser.add_option('-c', '--no_resume', action='store_true',
                    help='Error out if the output file already exists, '
                         'instead of resuming/overwriting it.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force download even if local file exists.')
  parser.add_option('-i', '--ignore_errors', action='store_true',
                    help='Don\'t throw error if we find an invalid .sha1 file.')
  parser.add_option('-r', '--recursive', action='store_true',
                    help='Scan folders recursively for .sha1 files. '
                         'Must be used with -d/--directory')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of downloader threads to run.')
  parser.add_option('-d', '--directory', action='store_true',
                    help='The target is a directory. '
                         'Cannot be used with -s/--sha1_file.')
  parser.add_option('-s', '--sha1_file', action='store_true',
                    help='The target is a file containing a sha1 sum. '
                         'Cannot be used with -d/--directory.')

  (options, args) = parser.parse_args()
  # Validate flag combinations; parser.error() prints and exits.
  if not args:
    parser.error('Missing target.')
  if len(args) > 1:
    parser.error('Too many targets.')
  if not options.bucket:
    parser.error('Missing bucket. Specify bucket with --bucket.')
  if options.sha1_file and options.directory:
    parser.error('Both --directory and --sha1_file are specified, '
                 'can only specify one.')
  elif options.recursive and not options.directory:
    parser.error('--recursive specified but --directory not specified.')
  elif options.output and options.directory:
    parser.error('--directory is specified, so --output has no effect.')
  else:
    input_filename = args[0]

  # Set output filename if not specified.
  if not options.output and not options.directory:
    if not options.sha1_file:
      # Target is a sha1 sum, so output filename would also be the sha1 sum.
      options.output = input_filename
    elif options.sha1_file:
      # Target is a .sha1 file.
      if not input_filename.endswith('.sha1'):
        parser.error('--sha1_file is specified, but the input filename '
                     'does not end with .sha1, and no --output is specified. '
                     'Either make sure the input filename has a .sha1 '
                     'extension, or specify --output.')
      options.output = input_filename[:-5]
    else:
      # Bug fix: parser.error() exits the process itself; the original
      # 'raise parser.error(...)' could never execute its raise.
      parser.error('Unreachable state.')

  # Check if output file already exists.
  # Bug fix: the original tested 'not options.no_resume', erroring out in
  # the default case and never when --no_resume was actually passed --
  # the opposite of its own error message.
  if not options.directory and not options.force and options.no_resume:
    if os.path.exists(options.output):
      parser.error('Output file %s exists and --no_resume is specified.'
                   % options.output)

  # Make sure we can find a working instance of gsutil.
  if os.path.exists(GSUTIL_DEFAULT_PATH):
    gsutil = Gsutil(GSUTIL_DEFAULT_PATH)
  else:
    print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
                          GSUTIL_DEFAULT_PATH)
    return 1

  # Check we have a valid bucket with valid permissions.
  base_url, code = check_bucket_permissions(options.bucket, gsutil)
  if code:
    return code

  return download_from_google_storage(
      input_filename, base_url, gsutil, options.num_threads, options.directory,
      options.recursive, options.force, options.output, options.ignore_errors,
      options.sha1_file)
323
324
if __name__ == '__main__':
  # Propagate main()'s return value as the process exit code.
  sys.exit(main(sys.argv))
OLDNEW
« no previous file with comments | « no previous file | tests/gstools/download_test_data/rootfolder_text.txt » ('j') | upload_to_google_storage.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698