Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(52)

Side by Side Diff: download_from_google_storage.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master
Patch Set: Review changes, fixed race condition Created 7 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Download files from Google Storage based on SHA1 sums."""
7
8
9 import hashlib
10 import optparse
11 import os
12 import Queue
13 import re
14 import sys
15 import threading
16 import time
17
18 import subprocess2
19
20
# Location of the gsutil binary bundled with depot_tools:
# <directory containing this script>/third_party/gsutil/gsutil.
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    'third_party', 'gsutil', 'gsutil')
24
25
# Common utilities
class Gsutil(object):
  """Call gsutil with some predefined settings."""

  def __init__(self, path, boto_path=None, timeout=None):
    """Configure a gsutil wrapper.

    Args:
      path: Filesystem path to the gsutil script.
      boto_path: Optional path to a .boto credentials file, passed to
          gsutil through the AWS_CREDENTIAL_FILE environment variable.
      timeout: Optional per-invocation timeout in seconds.

    Raises:
      OSError: if gsutil is not found at |path|.
    """
    if not os.path.exists(path):
      raise OSError('GSUtil not found in %s' % path)
    self.path = path
    self.timeout = timeout
    self.boto_path = boto_path

  def call(self, *args):
    """Run gsutil with |args|, returning only its exit code."""
    env = os.environ.copy()
    if self.boto_path is not None:
      env['AWS_CREDENTIAL_FILE'] = self.boto_path
    return subprocess2.call((sys.executable, self.path) + args,
                            env=env,
                            timeout=self.timeout)

  def check_call(self, *args):
    """Run gsutil with |args| and capture its output.

    Returns:
      A (code, stdout, stderr) tuple.  gsutil reports some HTTP failures
      as text on stderr, so stderr is parsed to recover an HTTP-like
      status code (e.g. 403, 404) when possible; otherwise the process
      exit code is returned.
    """
    env = os.environ.copy()
    if self.boto_path is not None:
      env['AWS_CREDENTIAL_FILE'] = self.boto_path
    ((out, err), code) = subprocess2.communicate(
        (sys.executable, self.path) + args,
        stdout=subprocess2.PIPE,
        stderr=subprocess2.PIPE,
        env=env,
        timeout=self.timeout)

    # Parse the stderr output for an embedded status code.
    status_code_match = re.search('status=([0-9]+)', err)
    if status_code_match:
      # Bug fix: the original did int(status_code_match.groups(1)), which
      # raises TypeError (groups() returns a tuple), and returned a bare
      # int while every other branch returns a (code, out, err) 3-tuple.
      return (int(status_code_match.group(1)), out, err)
    elif ('You are attempting to access protected data with '
          'no configured credentials.' in err):
      return (403, out, err)
    elif 'No such object' in err:
      return (404, out, err)
    else:
      return (code, out, err)

  def clone(self):
    """Return a new Gsutil configured identically to this one."""
    return Gsutil(self.path, self.boto_path, self.timeout)
69
70
def CheckBucketPermissions(bucket, gsutil):
  """Check that |bucket| exists and is readable through |gsutil|.

  Args:
    bucket: Name of the Google Storage bucket (no gs:// prefix).
    gsutil: A Gsutil-like object exposing call() and check_call().

  Returns:
    (base_url, code): base_url is 'gs://<bucket>' (None when no bucket
    was given); code is 0 on success, non-zero on error.
  """
  if not bucket:
    # Bug fix: the original printed a literal 'Missing bucket %s.' with
    # nothing substituted into the placeholder.
    sys.stderr.write('Missing bucket.\n')
    return (None, 1)
  base_url = 'gs://%s' % bucket

  code, _, ls_err = gsutil.check_call('ls', base_url)
  if code == 403:
    # Let gsutil prompt the user for credentials interactively.
    # NOTE(review): Gsutil.call() appears to return a bare exit code, so
    # this 3-way unpack looks like it would raise TypeError -- confirm
    # subprocess2.call's return type before relying on this path.
    code, _, _ = gsutil.call('config')
    if code != 0:
      sys.stderr.write('Error while authenticating to %s.\n' % base_url)
  elif code == 404:
    sys.stderr.write('%s not found.\n' % base_url)
  elif code != 0:
    sys.stderr.write('%s\n' % ls_err)
  return (base_url, code)
88
89
def GetSHA1(filename):
  """Return the hex SHA1 digest of the file at |filename|."""
  digest = hashlib.sha1()
  with open(filename, 'rb') as f:
    # Hash in 1MB chunks so large files never have to fit in memory.
    chunk = f.read(1024 * 1024)
    while chunk:
      digest.update(chunk)
      chunk = f.read(1024 * 1024)
  return digest.hexdigest()
100
101
102 # Download-specific code starts here
103
def _parse_sha1_file(path):
  """Return the sha1 sum on the first line of |path|, or None if absent."""
  with open(path, 'rb') as f:
    data = f.read(1024).rstrip()
  if not isinstance(data, str):
    data = data.decode('utf-8')  # No-op on Python 2 (bytes is str).
  match = re.match('^([A-Za-z0-9]{40})$', data)
  if match:
    return match.group(1)
  return None


def enumerate_work_queue(input_filename, work_queue, directory,
                         recursive, ignore_errors, output, sha1_file):
  """Fill |work_queue| with (sha1, output_path) download pairs.

  Args:
    input_filename: A bare sha1 sum, a .sha1 file, or a directory to
        scan, depending on |sha1_file|/|directory|.
    work_queue: Queue-like object accepting put((sha1, output_path)).
    directory: Treat |input_filename| as a directory of .sha1 files.
    recursive: Descend into subdirectories (skipping .svn/.git).
    ignore_errors: Report bad/missing .sha1 files instead of raising.
    output: Explicit output filename (non-directory modes).
    sha1_file: Treat |input_filename| as a .sha1 file.

  Returns:
    The number of items put on |work_queue|.

  Raises:
    Exception: on a missing or invalid .sha1 file unless |ignore_errors|.
  """
  if sha1_file:
    if not os.path.exists(input_filename):
      sys.stderr.write('%s not found.\n' % input_filename)
      if not ignore_errors:
        raise Exception('%s not found.' % input_filename)
      # Bug fix: previously fell through and crashed opening the missing
      # file even when errors were being ignored.
      return 0
    sha1 = _parse_sha1_file(input_filename)
    if sha1:
      # Bug fix: honor an explicit |output| target; the original ignored
      # it for .sha1-file inputs and always derived the name.
      target = output
      if not target:
        target = input_filename
        if target.endswith('.sha1'):
          target = target[:-len('.sha1')]
      work_queue.put((sha1, target))
      return 1
    sys.stderr.write('No sha1 sum found in %s.\n' % input_filename)
    if not ignore_errors:
      raise Exception('No sha1 sum found in %s.' % input_filename)
    return 0

  if not directory:
    # The target is a bare sha1 sum passed on the command line.
    work_queue.put((input_filename, output))
    return 1

  work_queue_size = 0
  for root, dirs, files in os.walk(input_filename):
    if not recursive:
      # Clearing the directory list stops os.walk from descending.
      del dirs[:]
    else:
      for exclude in ('.svn', '.git'):
        if exclude in dirs:
          dirs.remove(exclude)
    for filename in files:
      full_path = os.path.join(root, filename)
      if not full_path.endswith('.sha1'):
        continue
      sha1 = _parse_sha1_file(full_path)
      if sha1:
        # Bug fix: strip only the trailing '.sha1' extension; the
        # original replace() removed the first occurrence anywhere in
        # the path.
        work_queue.put((sha1, full_path[:-len('.sha1')]))
        work_queue_size += 1
      else:
        sys.stderr.write('No sha1 sum found in %s.\n' % filename)
        if not ignore_errors:
          raise Exception('No sha1 sum found in %s.' % filename)
  return work_queue_size
149
150
151 def _downloader_worker_thread(thread_num, q, force, base_url, gsutil, out_q):
152 while True:
153 input_sha1_sum, output_filename = q.get()
154 if input_sha1_sum is None:
155 out_q.put('Thread %d is done' % thread_num)
156 return
157 if os.path.exists(output_filename) and not force:
158 if GetSHA1(output_filename) == input_sha1_sum:
159 out_q.put(
160 'File %s exists and SHA1 sum (%s) matches. Skipping.' % (
161 output_filename , input_sha1_sum))
162 continue
163 # Check if file exists.
164 file_url = '%s/%s' % (base_url, input_sha1_sum)
165 if gsutil.check_call('ls', file_url)[0] != 0:
166 out_q.put('File %s for %s does not exist, skipping.' % (
167 file_url, output_filename))
168 continue
169 # Fetch the file.
170 out_q.put('Downloading %s to %s...' % (file_url, output_filename))
171 code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
172 if code != 0:
173 out_q.put(err)
174 return code
175
176
def download_from_google_storage(
    input_filename, base_url, gsutil, num_threads, directory, recursive,
    force, output, ignore_errors, sha1_file):
  """Download the files described by |input_filename| from |base_url|.

  Spins up |num_threads| worker threads, enumerates the work into a
  queue, prints worker progress until all downloads finish, and returns
  0.  See enumerate_work_queue() for how the target is interpreted.
  """
  # Start up all the worker threads.
  all_threads = []
  download_timer = time.time()
  stdout_queue = Queue.Queue()
  work_queue = Queue.Queue()
  for thread_num in range(num_threads):
    t = threading.Thread(
        target=_downloader_worker_thread,
        args=[thread_num, work_queue, force, base_url,
              gsutil.clone(), stdout_queue])
    t.daemon = True
    t.start()
    all_threads.append(t)

  # Enumerate our work queue.
  work_queue_size = enumerate_work_queue(
      input_filename, work_queue, directory, recursive,
      ignore_errors, output, sha1_file)
  for _ in all_threads:
    work_queue.put((None, None))  # Used to tell worker threads to stop.

  # Print messages as they arrive until every worker has exited.
  # Bug fix (race flagged in review): the original called
  # stdout_queue.get() with no timeout, which could block forever if all
  # workers exited between the empty()/is_alive() checks and the get().
  while any(t.is_alive() for t in all_threads):
    try:
      print(stdout_queue.get(timeout=0.1))
    except Queue.Empty:
      pass
  # Drain any messages queued after the last worker exited.
  while not stdout_queue.empty():
    print(stdout_queue.get())

  print('Success.')
  # Bug fix: '%1f' is a width-1 float (prints full precision); '%.1f'
  # was clearly intended.
  print('Downloading %d files took %.1f second(s)' % (
      work_queue_size, time.time() - download_timer))
  return 0
211
212
def main(args):
  """Command-line entry point.

  Args:
    args: Full argv, including the program name at index 0.

  Returns:
    0 on success, non-zero on error.
  """
  usage = ('usage: %prog [options] target\nTarget must be:\n'
           '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '
           '.sha1 file, containing a sha1 sum on the first line. (-d or '
           '--directory) A directory to scan for .sha1 files. ')
  parser = optparse.OptionParser(usage)
  parser.add_option('-o', '--output',
                    help='Specify the output file name. Defaults to:\n'
                         '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'
                         '(b) Given a .sha1 file or directory, the name will '
                         'match (.*).sha1.')
  parser.add_option('-b', '--bucket',
                    help='Google Storage bucket to fetch from.')
  parser.add_option('-e', '--boto',
                    help='Specify a custom boto file.')
  # NOTE(review): this help text looks inverted -- the flag actually
  # *disables* the "output file already exists" check below; confirm the
  # intended semantics of -c/--no_resume.
  parser.add_option('-c', '--no_resume', action='store_true',
                    help='Resume download if file is partially downloaded.')
  parser.add_option('-f', '--force', action='store_true',
                    help='Force download even if local file exists.')
  parser.add_option('-i', '--ignore_errors', action='store_true',
                    help='Don\'t throw error if we find an invalid .sha1 file.')
  parser.add_option('-r', '--recursive', action='store_true',
                    help='Scan folders recursively for .sha1 files. '
                         'Must be used with -d/--directory')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of downloader threads to run.')
  parser.add_option('-d', '--directory', action='store_true',
                    help='The target is a directory. '
                         'Cannot be used with -s/--sha1_file.')
  parser.add_option('-s', '--sha1_file', action='store_true',
                    help='The target is a file containing a sha1 sum. '
                         'Cannot be used with -d/--directory.')

  # Bug fix: the original called parser.parse_args() with no arguments,
  # silently ignoring the |args| parameter and always reading sys.argv.
  # Behavior is unchanged for the existing main(sys.argv) call site.
  (options, args) = parser.parse_args(args[1:])
  if not args:
    parser.error('Missing target.')
  if len(args) > 1:
    parser.error('Too many targets.')
  if not options.bucket:
    parser.error('Missing bucket. Specify bucket with --bucket.')
  if options.sha1_file and options.directory:
    parser.error('Both --directory and --sha1_file are specified, '
                 'can only specify one.')
  elif options.recursive and not options.directory:
    parser.error('--recursive specified but --directory not specified.')
  elif options.output and options.directory:
    parser.error('--directory is specified, so --output has no effect.')
  else:
    input_filename = args[0]

  # Set output filename if not specified.
  if not options.output and not options.directory:
    if not options.sha1_file:
      # Target is a sha1 sum, so output filename would also be the sha1 sum.
      options.output = input_filename
    elif options.sha1_file:
      # Target is a .sha1 file; strip the extension to get the output name.
      if not input_filename.endswith('.sha1'):
        parser.error('--sha1_file is specified, but the input filename '
                     'does not end with .sha1, and no --output is specified. '
                     'Either make sure the input filename has a .sha1 '
                     'extension, or specify --output.')
      options.output = input_filename[:-5]
    else:
      raise NotImplementedError('Unreachable state.')

  # Check if output file already exists.
  # NOTE(review): the error message claims --no_resume was specified, but
  # this branch only runs when it was NOT specified -- confirm which is
  # intended before changing either the condition or the message.
  if not options.directory and not options.force and not options.no_resume:
    if os.path.exists(options.output):
      parser.error('Output file %s exists and --no_resume is specified.'
                   % options.output)

  # Make sure we can find a working instance of gsutil.
  if os.path.exists(GSUTIL_DEFAULT_PATH):
    gsutil = Gsutil(GSUTIL_DEFAULT_PATH)
  else:
    print >> sys.stderr, ('gsutil not found in %s, bad depot_tools checkout?' %
                          GSUTIL_DEFAULT_PATH)
    return 1

  # Check we have a valid bucket with valid permissions.
  base_url, code = CheckBucketPermissions(options.bucket, gsutil)
  if code:
    return code

  return download_from_google_storage(
      input_filename, base_url, gsutil, options.num_threads, options.directory,
      options.recursive, options.force, options.output, options.ignore_errors,
      options.sha1_file)
302
303
# Script entry point: exit with main()'s return code.  main() receives the
# full argv, program name included.
if __name__ == '__main__':
  sys.exit(main(sys.argv))
OLDNEW
« no previous file with comments | « no previous file | tests/gstools/download_test_data/rootfolder_text.txt » ('j') | upload_to_google_storage.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698