Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(448)

Side by Side Diff: build/download_from_google_storage.py

Issue 11664024: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Changed upload script, fixed to run Created 7 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « no previous file | build/upload_to_google_storage.py » ('j') | build/upload_to_google_storage.py » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Script to download files from Google Storage."""
7
8
9 import hashlib
10 import optparse
11 import os
12 import Queue
13 import re
14 import subprocess
15 import sys
16 import tempfile
17 import threading
18 import time
19 import zipfile
20
# TODO(hinoka): This default is only right when the script sits exactly two
# directories below the checkout root (the two '..' hops below). Find a default
# that does not depend on where the script is installed.
#
# abspath() anchors the path to the script's own location, so the default does
# not silently change meaning if the process changes its working directory
# after import (normpath() would leave a relative __file__ relative).
GSUTIL_DEFAULT_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    '..', '..', 'third_party', 'gsutil', 'gsutil')
25
26
class Gsutil(object):
  """Runs the gsutil script as a subprocess, with an optional timeout.

  Both call() and check_call() return a (code, out, err) tuple so callers can
  always unpack three values.
  """

  def __init__(self, path, boto_path=None, timeout=None):
    """Stores where gsutil lives and how it should be invoked.

    Args:
      path: Path to the gsutil script. It must exist.
      boto_path: Optional credentials file. When set it is exported to gsutil
          through the AWS_CREDENTIAL_FILE environment variable.
      timeout: Optional limit, in seconds, for a single gsutil invocation.
          None waits for as long as gsutil runs.

    Raises:
      OSError: If path does not exist.
    """
    if not os.path.exists(path):
      raise OSError('GSUtil not found in %s' % path)
    self.path = path

    self.timeout = timeout
    self.boto_path = boto_path

  def _run(self, args, capture):
    """Runs gsutil with |args| and returns (returncode, stdout, stderr).

    Args:
      args: Tuple of command line arguments passed to gsutil.
      capture: If True, stdout and stderr are captured and returned as text.
          If False, they are inherited from this process and returned as None.

    Raises:
      RuntimeError: If gsutil is still running after self.timeout seconds (the
          child process is killed first), or if no status could be collected.
    """
    # The environment is built here, in the calling thread; only the blocking
    # wait happens in the helper thread.
    env = os.environ.copy()
    if self.boto_path is not None:
      env['AWS_CREDENTIAL_FILE'] = self.boto_path
    pipe = subprocess.PIPE if capture else None
    proc = subprocess.Popen((sys.executable, self.path) + tuple(args),
                            stdout=pipe,
                            stderr=pipe,
                            env=env,
                            # Text output, so callers can search it with str
                            # patterns on any Python version.
                            universal_newlines=True)
    outcome = []

    def _wait_for_exit():
      # communicate() drains both pipes while waiting. Calling wait() first
      # can deadlock once the child fills a pipe buffer.
      out, err = proc.communicate()
      outcome.append((proc.returncode, out, err))

    waiter = threading.Thread(target=_wait_for_exit)
    waiter.daemon = True
    waiter.start()
    waiter.join(self.timeout)
    if waiter.is_alive():
      # Do not leave a runaway gsutil process behind after giving up on it.
      proc.kill()
      waiter.join()
      raise RuntimeError('%s %s timed out after %s seconds.' % (
          self.path, ' '.join(args), self.timeout))
    if not outcome:
      raise RuntimeError('%s %s did not report an exit status.' % (
          self.path, ' '.join(args)))
    return outcome[0]

  def call(self, *args):
    """Runs gsutil with this process's own stdout and stderr.

    Returns:
      A (code, None, None) tuple, where code is gsutil's exit status. The last
      two items are None because the output is not captured.

    Raises:
      RuntimeError: If gsutil does not finish within the timeout.
    """
    return self._run(args, capture=False)

  def check_call(self, *args):
    """Runs gsutil with captured output and normalizes its status.

    Returns:
      A (code, out, err) tuple. code is, in order of preference:
        - the number following 'status=' in stderr, if present;
        - 403 if stderr reports missing credentials;
        - 404 if stderr reports 'No such object';
        - gsutil's own exit status otherwise.

    Raises:
      RuntimeError: If gsutil does not finish within the timeout.
    """
    code, out, err = self._run(args, capture=True)
    status_code_match = re.search('status=([0-9]+)', err)
    if status_code_match:
      return (int(status_code_match.group(1)), out, err)
    if ('You are attempting to access protected data with '
        'no configured credentials.' in err):
      return (403, out, err)
    if 'No such object' in err:
      return (404, out, err)
    return (code, out, err)

  def clone(self):
    """Returns a new Gsutil with the same path, credentials and timeout."""
    return Gsutil(self.path, self.boto_path, self.timeout)
85
86
def CheckSHA1(sha1_sum, filename):
  """Returns True if the SHA-1 digest of |filename| equals |sha1_sum|.

  Args:
    sha1_sum: Expected digest as a 40-character hex string. Letter case and
        surrounding whitespace are ignored, because hexdigest() is always
        lower case while sums supplied by users or .sha1 files may not be.
    filename: Path of the file to hash.
  """
  sha1 = hashlib.sha1()
  with open(filename, 'rb') as f:
    # Read in 1mb chunks, so it doesn't all have to be loaded into memory.
    for chunk in iter(lambda: f.read(1024 * 1024), b''):
      sha1.update(chunk)
  return sha1_sum.strip().lower() == sha1.hexdigest()
97
98
def _downloader_worker_thread(thread_num, q, options, base_url, gsutil):
  """Downloads queued files until the queue is empty.

  Args:
    thread_num: Index of this worker. Currently unused.
    q: Queue of (sha1_sum, output_filename) tuples.
    options: Parsed command line options; only options.force is read.
    base_url: 'gs://<bucket>' prefix under which files are stored by SHA1 sum.
    gsutil: Object whose check_call(*args) returns a (code, out, err) tuple.

  Returns:
    None once the queue is drained, or the non-zero status of the first
    download that failed (the worker stops at that point).
  """
  while True:
    # Only the queue read can raise Queue.Empty, so only it is guarded.
    try:
      input_sha1_sum, output_filename = q.get_nowait()
    except Queue.Empty:
      return

    if os.path.exists(output_filename) and not options.force:
      if CheckSHA1(input_sha1_sum, output_filename):
        # Each message is emitted with a single write() so that lines coming
        # from different worker threads are less likely to interleave.
        sys.stdout.write(
            'File %s exists and SHA1 sum (%s) matches. Skipping.\n' % (
                output_filename, input_sha1_sum))
        continue

    # Check if file exists.
    file_url = '%s/%s' % (base_url, input_sha1_sum)
    # check_call() returns a tuple; compare its status, not the tuple itself.
    code, _, _ = gsutil.check_call('ls', file_url)
    if code != 0:
      sys.stderr.write('File %s for %s does not exist, skipping.\n' % (
          file_url, output_filename))
      continue

    # Fetch the file. check_call() is used so that gsutil's stderr is
    # available for the error report below.
    sys.stdout.write('Downloading %s to %s...\n' % (file_url, output_filename))
    code, _, err = gsutil.check_call('cp', '-q', file_url, output_filename)
    if code != 0:
      sys.stderr.write('%s\n' % err)
      return code
122
123
def _read_sha1_sum(sha1_path):
  """Returns the first 40-character sum in |sha1_path|, or None if absent."""
  with open(sha1_path) as f:
    sha1_match = re.search('([A-Za-z0-9]{40})', f.read(1024))
  if sha1_match:
    return sha1_match.group(1)
  return None


def _enumerate_sha1_files(directory, recursive):
  """Yields (sha1_sum, output_filename) for each .sha1 file in |directory|.

  Subdirectories are visited only when |recursive| is True. Files without a
  recognizable sum are reported on stderr and skipped.
  """
  for root, dirs, files in os.walk(directory):
    if recursive:
      # Assigning to the slice prunes in place, which is what makes os.walk()
      # skip version control metadata.
      dirs[:] = [d for d in dirs if d not in ('.svn', '.git')]
    else:
      # Emptying the list in place stops os.walk() after the top level.
      del dirs[:]
    for filename in files:
      if not filename.endswith('.sha1'):
        continue
      full_path = os.path.join(root, filename)
      sha1_sum = _read_sha1_sum(full_path)
      if sha1_sum is None:
        sys.stderr.write('No SHA1 sum found in %s, skipping.\n' % full_path)
        continue
      # Strip only the trailing extension; '.sha1' may also occur elsewhere
      # in the path.
      yield sha1_sum, full_path[:-len('.sha1')]


def _find_gsutil(options):
  """Returns a Gsutil for --gsutil_path, else the first on PATH, else None."""
  if os.path.exists(options.gsutil_path):
    return Gsutil(options.gsutil_path, boto_path=options.boto)
  for path in os.environ.get('PATH', '').split(os.pathsep):
    candidate = os.path.join(path, 'gsutil')
    if path and os.path.exists(candidate):
      return Gsutil(candidate, boto_path=options.boto)
  return None


def main(args):
  """Downloads one file, or every file named by .sha1 files in a directory.

  Args:
    args: Full argument vector, program name first (i.e. sys.argv).

  Returns:
    0 on success, otherwise a non-zero status: 403/404 for bucket access
    problems, gsutil's status for other failures, 1 for local errors. Invalid
    command lines exit through parser.error().
  """
  usage = ('usage: %prog [options] target\nTarget must be:\n'
           '(default) a sha1 sum ([A-Za-z0-9]{40}).\n(-s or --sha1_file) a '
           '.sha1 file, containing a sha1 sum on the first line. (-d or '
           '--directory) A directory to scan for .sha1 files. ')
  parser = optparse.OptionParser(usage)
  parser.add_option('-o', '--output', default=None,
                    help='Specify the output file name. Defaults to:\n'
                         '(a) Given a SHA1 hash, the name is the SHA1 hash.\n'
                         '(b) Given a .sha1 file or directory, the name will '
                         'match (.*).sha1.')
  parser.add_option('-b', '--bucket', default='chrome-artifacts',
                    help='Google Storage bucket to fetch from.')
  parser.add_option('-e', '--boto', default=None,
                    help='Specify a custom boto file.')
  parser.add_option('-c', '--no_resume', action='store_true', default=False,
                    help='Fail if the output file already exists instead of '
                         'reusing it.')
  parser.add_option('-f', '--force', action='store_true', default=False,
                    help='Force download even if local file exists.')
  parser.add_option('-r', '--recursive', action='store_true', default=False,
                    help='Scan folders recursively for .sha1 files. '
                         'Must be used with -d/--directory')
  parser.add_option('-t', '--num_threads', default=1, type='int',
                    help='Number of downloader threads to run.')
  parser.add_option('-d', '--directory', action='store_true', default=False,
                    help='The target is a directory. '
                         'Cannot be used with -s/--sha1_file.')
  parser.add_option('-s', '--sha1_file', action='store_true', default=False,
                    help='The target is a file containing a sha1 sum. '
                         'Cannot be used with -d/--directory.')
  # This file should be stored in tools/deps_scripts/ and we want the path to
  # third_party/gsutil/gsutil
  parser.add_option('-g', '--gsutil_path', default=GSUTIL_DEFAULT_PATH,
                    help='Path to the gsutil script.')

  # Parse the vector we were handed (minus the program name) rather than
  # silently falling back to sys.argv.
  (options, args) = parser.parse_args(args[1:])

  # parser.error() exits, so each check can stand on its own.
  if len(args) < 1:
    parser.error('Missing target.')
  if len(args) > 1:
    parser.error('Too many targets.')
  if options.sha1_file and options.directory:
    parser.error('Both --directory and --sha1_file are specified, '
                 'can only specify one.')
  if options.recursive and not options.directory:
    parser.error('--recursive specified but --directory not specified.')
  if options.output and options.directory:
    parser.error('--directory is specified, so --output has no effect.')
  if options.num_threads < 1:
    parser.error('--num_threads must be at least 1.')
  input_filename = args[0]

  # Set output filename if not specified.
  if not options.output and not options.directory:
    if options.sha1_file:
      # Target is a .sha1 file.
      if not input_filename.endswith('.sha1'):
        parser.error('--sha1_file is specified, but the input filename '
                     'does not end with .sha1, and no --output is specified. '
                     'Either make sure the input filename has a .sha1 '
                     'extension, or specify --output.')
      options.output = input_filename[:-5]
    else:
      # Target is a sha1 sum, so output filename would also be the sha1 sum.
      options.output = input_filename

  # Refuse to touch an existing output file only when the user asked for that
  # with --no_resume. Otherwise the worker decides, based on the SHA1 sum,
  # whether the file can be kept.
  if (options.no_resume and not options.directory and not options.force
      and os.path.exists(options.output)):
    parser.error('Output file %s exists and --no_resume is specified.'
                 % options.output)

  base_url = 'gs://%s' % options.bucket

  # Make sure we can find a working instance of gsutil.
  gsutil = _find_gsutil(options)
  if gsutil is None:
    sys.stderr.write('gsutil not found at %s or on PATH.\n' %
                     options.gsutil_path)
    return 1

  # Check if we have permissions to the Google Storage bucket.
  code, _, ls_err = gsutil.check_call('ls', base_url)
  if code == 403:
    code, _, _ = gsutil.call('config')
    if code != 0:
      sys.stderr.write('Error while authenticating to %s, exiting\n' %
                       base_url)
      return 403
  elif code == 404:
    sys.stderr.write('%s not found.\n' % base_url)
    return 404
  elif code != 0:
    sys.stderr.write('%s\n' % ls_err)
    return code

  # Enumerate our work queue.
  if options.directory:
    work_items = _enumerate_sha1_files(input_filename, options.recursive)
  elif options.sha1_file:
    # The target names a file holding the sum; the sum itself is what gets
    # looked up in the bucket.
    sha1_sum = _read_sha1_sum(input_filename)
    if sha1_sum is None:
      sys.stderr.write('No SHA1 sum found in %s.\n' % input_filename)
      return 1
    work_items = [(sha1_sum, options.output)]
  else:
    work_items = [(input_filename, options.output)]

  work_queue = Queue.Queue()
  work_queue_size = 0
  for item in work_items:
    work_queue.put(item)
    work_queue_size += 1

  # Start up all the worker threads. This happens after enumeration because a
  # worker exits as soon as it finds the queue empty.
  exit_codes = [None] * options.num_threads

  def _run_worker(thread_num, worker_gsutil):
    # A thread target's return value is discarded, so record it explicitly.
    exit_codes[thread_num] = _downloader_worker_thread(
        thread_num, work_queue, options, base_url, worker_gsutil)

  all_threads = []
  download_timer = time.time()
  for thread_num in range(options.num_threads):
    t = threading.Thread(target=_run_worker,
                         args=[thread_num, gsutil.clone()])
    t.daemon = True
    t.start()
    all_threads.append(t)

  # Wait for all downloads to finish.
  for t in all_threads:
    t.join()

  failures = [code for code in exit_codes if code]
  if failures:
    sys.stderr.write('%d downloader thread(s) failed.\n' % len(failures))
    return failures[0]

  print('Success.')
  print('Downloading %d files took %1f second(s)' % (
      work_queue_size, time.time() - download_timer))
  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv))
OLDNEW
« no previous file with comments | « no previous file | build/upload_to_google_storage.py » ('j') | build/upload_to_google_storage.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698