Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(472)

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Review fixes Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Download files from Google Storage based on SHA1 sums."""
7
8
9 import hashlib
10 import optparse
11 import os
12 import Queue
13 import re
14 import sys
15 import threading
16 import time
17
18 import subprocess2
19
20
# Location of the gsutil script bundled with depot_tools:
# <directory of this file>/third_party/gsutil/gsutil
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')
24
25
class FileNotFoundError(IOError):
  """Raised when a required local file (e.g. gsutil or a .sha1 file) is missing."""
  pass
28
29
class InvalidFileError(IOError):
  """Raised when a .sha1 file does not contain a valid sha1 sum."""
  pass
32
33
# Common utilities
class Gsutil(object):
  """Call gsutil with some predefined settings.

  Wraps the bundled gsutil script so every invocation runs with a
  consistent environment (optional custom .boto credential file) and an
  optional timeout.
  """

  def __init__(self, path, boto_path=None, timeout=None):
    """Initialize the wrapper.

    Args:
      path: Filesystem path of the gsutil script.
      boto_path: Optional path to a .boto credential file.
      timeout: Optional per-call timeout in seconds.

    Raises:
      FileNotFoundError: If |path| does not exist.
    """
    if not os.path.exists(path):
      raise FileNotFoundError('GSUtil not found in %s' % path)
    self.path = path
    self.timeout = timeout
    self.boto_path = boto_path

  def _env(self):
    """Return a copy of os.environ, with the custom boto file if set."""
    env = os.environ.copy()
    if self.boto_path:
      env['AWS_CREDENTIAL_FILE'] = self.boto_path
    return env

  def call(self, *args):
    """Run gsutil with |args|; output is inherited. Returns the exit code."""
    return subprocess2.call((sys.executable, self.path) + args,
                            env=self._env(),
                            timeout=self.timeout)

  def check_call(self, *args):
    """Run gsutil with |args|, capturing output.

    Returns:
      A (code, out, err) tuple. |code| is an HTTP-like status parsed from
      stderr when available (403 for missing credentials, 404 for a missing
      object), otherwise the process exit code.
    """
    ((out, err), code) = subprocess2.communicate(
        (sys.executable, self.path) + args,
        stdout=subprocess2.PIPE,
        stderr=subprocess2.PIPE,
        env=self._env(),
        timeout=self.timeout)

    # Parse the HTTP status code out of gsutil's stderr, if present.
    status_code_match = re.search('status=([0-9]+)', err)
    if status_code_match:
      # Bug fix: the original called groups(1) (which returns a tuple, so
      # int() raised TypeError) and returned a bare int, while every caller
      # unpacks a 3-tuple. Return (code, out, err) consistently.
      return (int(status_code_match.group(1)), out, err)
    elif ('You are attempting to access protected data with '
          'no configured credentials.' in err):
      return (403, out, err)
    elif 'No such object' in err:
      return (404, out, err)
    else:
      return (code, out, err)

  def clone(self):
    """Return a new Gsutil instance with identical configuration."""
    return Gsutil(self.path, self.boto_path, self.timeout)
77
78
def check_bucket_permissions(bucket, gsutil):
  """Verify that |bucket| exists and is reachable through |gsutil|.

  Args:
    bucket: Google Storage bucket name (without the gs:// prefix).
    gsutil: A Gsutil instance used to probe the bucket.

  Returns:
    (base_url, code): the gs:// URL for the bucket (or None if no bucket
    was given) and 0 on success, otherwise a non-zero error code
    (403 = auth failure, 404 = bucket not found).
  """
  if not bucket:
    # Bug fix: the original message contained a stray '%s' with no argument.
    print >> sys.stderr, 'Missing bucket name.'
    return (None, 1)
  base_url = 'gs://%s' % bucket

  code, _, ls_err = gsutil.check_call('ls', base_url)
  if code == 403:
    # Permission denied: give the user a chance to (re)configure credentials.
    code, _, _ = gsutil.call('config')
    if code != 0:
      print >> sys.stderr, 'Error while authenticating to %s.' % base_url
  elif code == 404:
    print >> sys.stderr, '%s not found.' % base_url
  elif code != 0:
    print >> sys.stderr, ls_err
  return (base_url, code)
95
96
def get_sha1(filename):
  """Return the hex SHA1 digest of the file at |filename|."""
  digest = hashlib.sha1()
  with open(filename, 'rb') as f:
    # Stream the file in 1MB chunks so large files never sit fully in memory.
    for chunk in iter(lambda: f.read(1024 * 1024), b''):
      digest.update(chunk)
  return digest.hexdigest()
107
108
109 # Download-specific code starts here
110
def enumerate_work_queue(input_filename, work_queue, directory,
                         recursive, ignore_errors, output, sha1_file):
  """Fill |work_queue| with (sha1_sum, output_filename) download pairs.

  Depending on the flags, |input_filename| is interpreted as a bare sha1
  sum, a .sha1 file, or a directory to scan for .sha1 files.

  Returns:
    The number of items put on |work_queue|.

  Raises:
    FileNotFoundError: A .sha1 file is missing and |ignore_errors| is False.
    InvalidFileError: A file contains no valid sha1 sum and |ignore_errors|
      is False.
  """
  if sha1_file:
    if not os.path.exists(input_filename):
      if not ignore_errors:
        raise FileNotFoundError('%s not found.' % input_filename)
      print >> sys.stderr, '%s not found.' % input_filename
      # Bug fix: the original fell through and crashed opening the missing
      # file; with --ignore_errors we should just skip it.
      return 0
    with open(input_filename, 'rb') as f:
      sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
    if sha1_match:
      work_queue.put(
          (sha1_match.group(1), input_filename.replace('.sha1', '')))
      return 1
    if not ignore_errors:
      raise InvalidFileError('No sha1 sum found in %s.' % input_filename)
    print >> sys.stderr, 'No sha1 sum found in %s.' % input_filename
    return 0

  if not directory:
    # Target is a bare sha1 sum; download it to |output|.
    work_queue.put((input_filename, output))
    return 1

  work_queue_size = 0
  for root, dirs, files in os.walk(input_filename):
    if not recursive:
      # Prune all subdirectories in place so os.walk stays at the top level.
      del dirs[:]
    else:
      # Never descend into VCS metadata directories.
      for exclude in ['.svn', '.git']:
        if exclude in dirs:
          dirs.remove(exclude)
    for filename in files:
      full_path = os.path.join(root, filename)
      if full_path.endswith('.sha1'):
        with open(full_path, 'rb') as f:
          sha1_match = re.match('^([A-Za-z0-9]{40})$', f.read(1024).rstrip())
        if sha1_match:
          work_queue.put(
              (sha1_match.group(1), full_path.replace('.sha1', '')))
          work_queue_size += 1
        else:
          if not ignore_errors:
            raise InvalidFileError('No sha1 sum found in %s.' % filename)
          print >> sys.stderr, 'No sha1 sum found in %s.' % filename
  return work_queue_size
156
157
158 def _downloader_worker_thread(thread_num, q, force, base_url,
159 gsutil, out_q, ret_codes):
160 while True:
161 input_sha1_sum, output_filename = q.get()
162 if input_sha1_sum is None:
163 return
164 if os.path.exists(output_filename) and not force:
165 if get_sha1(output_filename) == input_sha1_sum:
166 out_q.put(
167 '%d> File %s exists and SHA1 matches. Skipping.' % (
168 thread_num, output_filename))
169 continue
170 # Check if file exists.
171 file_url = '%s/%s' % (base_url, input_sha1_sum)
172 if gsutil.check_call('ls', file_url)[0] != 0:
173 out_q.put('%d> File %s for %s does not exist, skipping.' % (
174 thread_num, file_url, output_filename))
175 ret_codes.put((1, 'File %s for %s does not exist.' % (
176 file_url, output_filename)))
177 continue
178 # Fetch the file.
179 out_q.put('%d> Downloading %s...' % (
180 thread_num, output_filename))
181 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
182 if code != 0:
183 out_q.put('%d> %s' % (thread_num, err))
184 ret_codes.put((code, err))
185
186
187 def printer_worker(output_queue):
188 while True:
189 line = output_queue.get()
190 # Its plausible we want to print empty lines.
191 if line is None:
192 break
193 print line
194
195
196 def download_from_google_storage(
197 input_filename, base_url, gsutil, num_threads, directory, recursive,
198 force, output, ignore_errors, sha1_file):
199 # Start up all the worker threads.
200 all_threads = []
201 download_timer = time.time()
202 stdout_queue = Queue.Queue()
203 work_queue = Queue.Queue()
204 ret_codes = Queue.Queue()
205 ret_codes.put((1, None))
206 for thread_num in range(num_threads):
207 t = threading.Thread(
208 target=_downloader_worker_thread,
209 args=[thread_num, work_queue, force, base_url,
210 gsutil.clone(), stdout_queue, ret_codes])
211 t.daemon = True
212 t.start()
213 all_threads.append(t)
214 printer_thread = threading.Thread(target=printer_worker, args=[stdout_queue])
215 printer_thread.daemon = True
216 printer_thread.start()
217
218 # Enumerate our work queue.
219 work_queue_size = enumerate_work_queue(
220 input_filename, work_queue, directory, recursive,
221 ignore_errors, output, sha1_file)
222 for _ in all_threads:
223 work_queue.put((None, None)) # Used to tell worker threads to stop.
224
225 # Wait for all downloads to finish.
226 for t in all_threads:
227 t.join()
228 stdout_queue.put(None)
229 printer_thread.join()
230
231 # See if we ran into any errors.
232 max_ret_code = 0
233 for ret_code, message in ret_codes.queue:
234 max_ret_code = max(ret_code, max_ret_code)
235 if message:
236 print >> sys.stderr, message
237 if not max_ret_code:
238 print 'Success!'
239
240 print 'Downloading %d files took %1f second(s)' % (
241 work_queue_size, time.time() - download_timer)
242 return max_ret_code
243
244
def main(args):
  """Command-line entry point.

  Note: |args| is accepted for symmetry with sys.argv, but optparse
  re-reads sys.argv itself in parse_args() below.

  Returns:
    0 on success, non-zero process exit code on failure.
  """
  usage = ('usage: %prog [options] target\nTarget must be:\n'
           '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '
           '.sha1 file, containing a sha1 sum on the first line. (-d or '
           '--directory) A directory to scan for .sha1 files. ')
  parser = optparse.OptionParser(usage)
  parser.add_option('-o', '--output',
                    help='Specify the output file name. Defaults to:\n'
                         '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'
                         '(b) Given a .sha1 file or directory, the name will '
                         'match (.*).sha1.')
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to fetch from.')
  parser.add_option('-e', '--boto',
                    help='Specify a custom boto file.')
  parser.add_option('-c', '--no_resume', action='store_true',
                    help='Resume download if file is partially downloaded.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force download even if local file exists.')
  parser.add_option('-i', '--ignore_errors', action='store_true',
                    help='Don\'t throw error if we find an invalid .sha1 file.')
  parser.add_option('-r', '--recursive', action='store_true',
                    help='Scan folders recursively for .sha1 files. '
                         'Must be used with -d/--directory')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of downloader threads to run.')
  parser.add_option('-d', '--directory', action='store_true',
                    help='The target is a directory. '
                         'Cannot be used with -s/--sha1_file.')
  parser.add_option('-s', '--sha1_file', action='store_true',
                    help='The target is a file containing a sha1 sum. '
                         'Cannot be used with -d/--directory.')

  (options, args) = parser.parse_args()
  if not args:
    parser.error('Missing target.')
  if len(args) > 1:
    parser.error('Too many targets.')
  if not options.bucket:
    parser.error('Missing bucket. Specify bucket with --bucket.')
  if options.sha1_file and options.directory:
    parser.error('Both --directory and --sha1_file are specified, '
                 'can only specify one.')
  elif options.recursive and not options.directory:
    parser.error('--recursive specified but --directory not specified.')
  elif options.output and options.directory:
    parser.error('--directory is specified, so --output has no effect.')
  else:
    input_filename = args[0]

  # Set output filename if not specified.
  if not options.output and not options.directory:
    if not options.sha1_file:
      # Target is a sha1 sum, so output filename would also be the sha1 sum.
      options.output = input_filename
    elif options.sha1_file:
      # Target is a .sha1 file.
      if not input_filename.endswith('.sha1'):
        parser.error('--sha1_file is specified, but the input filename '
                     'does not end with .sha1, and no --output is specified. '
                     'Either make sure the input filename has a .sha1 '
                     'extension, or specify --output.')
      options.output = input_filename[:-5]
    else:
      # Bug fix: parser.error() raises SystemExit itself; the original
      # 'raise parser.error(...)' would have raised a TypeError on the
      # (None) return value. (This branch is logically unreachable.)
      parser.error('Unreachable state.')

  # Check if output file already exists.
  if not options.directory and not options.force and not options.no_resume:
    if os.path.exists(options.output):
      # Bug fix: this branch runs when --no_resume was NOT passed; the
      # original error message claimed the opposite.
      parser.error('Output file %s exists and --no_resume is not specified.'
                   % options.output)

  # Make sure we can find a working instance of gsutil.
  if os.path.exists(GSUTIL_DEFAULT_PATH):
    # Bug fix: the -e/--boto option was parsed but never used; pass it
    # through so custom credentials actually take effect.
    gsutil = Gsutil(GSUTIL_DEFAULT_PATH, boto_path=options.boto)
  else:
    print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
                          GSUTIL_DEFAULT_PATH)
    return 1

  # Check we have a valid bucket with valid permissions.
  base_url, code = check_bucket_permissions(options.bucket, gsutil)
  if code:
    return code

  return download_from_google_storage(
      input_filename, base_url, gsutil, options.num_threads, options.directory,
      options.recursive, options.force, options.output, options.ignore_errors,
      options.sha1_file)
334
335
if __name__ == '__main__':
  # main() ignores its argument; optparse re-reads sys.argv in parse_args().
  sys.exit(main(sys.argv))
OLDNEW
« no previous file with comments | « no previous file | tests/gstools/download_test_data/rootfolder_text.txt » ('j') | upload_to_google_storage.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698