OLD | NEW |
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 | 2 |
3 # pylint: disable=C0301 | 3 # pylint: disable=C0301 |
4 """ | 4 """ |
5 Copyright 2014 Google Inc. | 5 Copyright 2014 Google Inc. |
6 | 6 |
7 Use of this source code is governed by a BSD-style license that can be | 7 Use of this source code is governed by a BSD-style license that can be |
8 found in the LICENSE file. | 8 found in the LICENSE file. |
9 | 9 |
10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper | 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper |
11 for the XML API). | 11 for the XML API). |
12 | 12 |
13 API/library references: | 13 API/library references: |
14 - https://developers.google.com/storage/docs/reference-guide | 14 - https://developers.google.com/storage/docs/reference-guide |
15 - http://googlecloudstorage.blogspot.com/2012/09/google-cloud-storage-tutorial-using-boto.html | 15 - http://googlecloudstorage.blogspot.com/2012/09/google-cloud-storage-tutorial-using-boto.html |
16 """ | 16 """ |
17 # pylint: enable=C0301 | 17 # pylint: enable=C0301 |
18 | 18 |
19 # System-level imports | 19 # System-level imports |
20 import errno | 20 import errno |
| 21 import hashlib |
21 import os | 22 import os |
22 import posixpath | 23 import posixpath |
23 import re | 24 import re |
24 import sys | 25 import sys |
25 | 26 |
26 # Imports from third-party code | 27 # Imports from third-party code |
27 TRUNK_DIRECTORY = os.path.abspath(os.path.join( | 28 TRUNK_DIRECTORY = os.path.abspath(os.path.join( |
28 os.path.dirname(__file__), os.pardir, os.pardir)) | 29 os.path.dirname(__file__), os.pardir, os.pardir)) |
29 for import_subdir in ['boto']: | 30 for import_subdir in ['boto']: |
30 import_dirpath = os.path.join( | 31 import_dirpath = os.path.join( |
(...skipping 101 matching lines...)
132 """Delete a single file within a GS bucket. | 133 """Delete a single file within a GS bucket. |
133 | 134 |
134 TODO(epoger): what if bucket or path does not exist? Should probably raise | 135 TODO(epoger): what if bucket or path does not exist? Should probably raise |
135 an exception. Implement, and add a test to exercise this. | 136 an exception. Implement, and add a test to exercise this. |
136 | 137 |
137 Params: | 138 Params: |
138 bucket: GS bucket to delete a file from | 139 bucket: GS bucket to delete a file from |
139 path: full path (Posix-style) of the file within the bucket to delete | 140 path: full path (Posix-style) of the file within the bucket to delete |
140 """ | 141 """ |
141 b = self._connect_to_bucket(bucket_name=bucket) | 142 b = self._connect_to_bucket(bucket_name=bucket) |
142 item = Key(b) | 143 key = Key(b) |
143 item.key = path | 144 key.name = path |
144 try: | 145 try: |
145 item.delete() | 146 key.delete() |
146 except BotoServerError, e: | 147 except BotoServerError, e: |
147 e.body = (repr(e.body) + | 148 e.body = (repr(e.body) + |
148 ' while deleting bucket=%s, path=%s' % (bucket, path)) | 149 ' while deleting bucket=%s, path=%s' % (bucket, path)) |
149 raise | 150 raise |
150 | 151 |
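Review note: the TODO above leaves the missing-path behavior of delete_file() undefined. One possible shape for it, sketched under the assumption that raising KeyError is acceptable (the exception type is not specified anywhere in this CL):

    def delete_file_strict(self, bucket, path):
      """Like delete_file(), but raises KeyError if path does not exist (sketch)."""
      b = self._connect_to_bucket(bucket_name=bucket)
      # get_key() issues a HEAD request and returns None for a missing object.
      if not b.get_key(key_name=path):
        raise KeyError('no such object: bucket=%s, path=%s' % (bucket, path))
      self.delete_file(bucket=bucket, path=path)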
| 152 def get_last_modified_time(self, bucket, path): |
| 153 """Gets the timestamp of when this file was last modified. |
| 154 |
| 155 Params: |
| 156 bucket: GS bucket in which to look for the file |
| 157 path: full path (Posix-style) of the file within the bucket to check |
| 158 |
| 159 Returns the last modified time, as a freeform string. If the file was not |
| 160 found, returns None. |
| 161 """ |
| 162 b = self._connect_to_bucket(bucket_name=bucket) |
| 163 try: |
| 164 key = b.get_key(key_name=path) |
| 165 if not key: |
| 166 return None |
| 167 return key.last_modified |
| 168 except BotoServerError, e: |
| 169 e.body = (repr(e.body) + |
| 170 ' while getting attributes of bucket=%s, path=%s' % ( |
| 171 bucket, path)) |
| 172 raise |
| 173 |
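Review note: since key.last_modified is documented above as a freeform string, callers that need a real timestamp must parse it themselves. A best-effort sketch, assuming the two formats boto is commonly seen to return (an RFC 822 date from a HEAD-style get_key(), an ISO 8601 date from a bucket listing); both format strings are assumptions, not verified by this CL:

    import calendar
    import time

    def _parse_last_modified(raw):
      """Best-effort parse of Key.last_modified into UTC epoch seconds."""
      for fmt in ('%a, %d %b %Y %H:%M:%S %Z',   # HEAD-style, e.g. from get_key()
                  '%Y-%m-%dT%H:%M:%S.%fZ'):     # listing-style ISO 8601
        try:
          return calendar.timegm(time.strptime(raw, fmt))
        except ValueError:
          pass
      return None  # unrecognized format; caller keeps the raw string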
151 def upload_file(self, source_path, dest_bucket, dest_path, | 174 def upload_file(self, source_path, dest_bucket, dest_path, |
152 predefined_acl=None, fine_grained_acl_list=None): | 175 only_if_modified=False, predefined_acl=None, |
| 176 fine_grained_acl_list=None): |
153 """Upload contents of a local file to Google Storage. | 177 """Upload contents of a local file to Google Storage. |
154 | 178 |
155 TODO(epoger): Add the only_if_modified param provided by upload_file() in | |
156 https://github.com/google/skia-buildbot/blob/master/slave/skia_slave_scripts/utils/old_gs_utils.py , | |
157 so we can replace that function with this one. | |
158 | |
159 params: | 179 params: |
160 source_path: full path (local-OS-style) on local disk to read from | 180 source_path: full path (local-OS-style) on local disk to read from |
161 dest_bucket: GCS bucket to copy the file to | 181 dest_bucket: GCS bucket to copy the file to |
162 dest_path: full path (Posix-style) within that bucket | 182 dest_path: full path (Posix-style) within that bucket |
| 183 only_if_modified: if True, only upload the file if it would actually |
| 184 change the content on Google Storage (uploads the file if dest_path |
| 185 does not exist, or if it exists but has different contents than |
| 186 source_path). Note that this may take longer than just uploading the |
| 187 file without checking first, due to extra round-trips! |
163 predefined_acl: which predefined ACL to apply to the file on Google | 188 predefined_acl: which predefined ACL to apply to the file on Google |
164 Storage; must be one of the PredefinedACL values defined above. | 189 Storage; must be one of the PredefinedACL values defined above. |
165 If None, inherits dest_bucket's default object ACL. | 190 If None, inherits dest_bucket's default object ACL. |
166 TODO(epoger): add unittests for this param, although it seems to work | 191 TODO(epoger): add unittests for this param, although it seems to work |
167 in my manual testing | 192 in my manual testing |
168 fine_grained_acl_list: list of (id_type, id_value, permission) tuples | 193 fine_grained_acl_list: list of (id_type, id_value, permission) tuples |
169 to apply to the uploaded file (on top of the predefined_acl), | 194 to apply to the uploaded file (on top of the predefined_acl), |
170 or None if predefined_acl is sufficient | 195 or None if predefined_acl is sufficient |
171 """ | 196 """ |
172 b = self._connect_to_bucket(bucket_name=dest_bucket) | 197 b = self._connect_to_bucket(bucket_name=dest_bucket) |
173 item = Key(b) | 198 |
174 item.key = dest_path | 199 if only_if_modified: |
| 200 old_key = b.get_key(key_name=dest_path) |
| 201 if old_key: |
| 202 local_md5 = '"%s"' % _get_local_md5(path=source_path) |
| 203 if local_md5 == old_key.etag: |
| 204 print 'Skipping upload of unmodified file %s : %s' % ( |
| 205 source_path, local_md5) |
| 206 return |
| 207 |
| 208 key = Key(b) |
| 209 key.name = dest_path |
175 try: | 210 try: |
176 item.set_contents_from_filename(filename=source_path, | 211 key.set_contents_from_filename(filename=source_path, |
177 policy=predefined_acl) | 212 policy=predefined_acl) |
178 except BotoServerError, e: | 213 except BotoServerError, e: |
179 e.body = (repr(e.body) + | 214 e.body = (repr(e.body) + |
180 ' while uploading source_path=%s to bucket=%s, path=%s' % ( | 215 ' while uploading source_path=%s to bucket=%s, path=%s' % ( |
181 source_path, dest_bucket, item.key)) | 216 source_path, dest_bucket, key.name)) |
182 raise | 217 raise |
183 # TODO(epoger): This may be inefficient, because it calls | 218 # TODO(epoger): This may be inefficient, because it calls |
184 # _connect_to_bucket() again. Depending on how expensive that | 219 # _connect_to_bucket() again. Depending on how expensive that |
185 # call is, we may want to optimize this. | 220 # call is, we may want to optimize this. |
186 for (id_type, id_value, permission) in fine_grained_acl_list or []: | 221 for (id_type, id_value, permission) in fine_grained_acl_list or []: |
187 self.set_acl( | 222 self.set_acl( |
188 bucket=dest_bucket, path=item.key, | 223 bucket=dest_bucket, path=key.name, |
189 id_type=id_type, id_value=id_value, permission=permission) | 224 id_type=id_type, id_value=id_value, permission=permission) |
190 | 225 |
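Review note: the only_if_modified check above relies on GCS setting an object's ETag to the MD5 hex digest wrapped in double quotes, which (as far as I know) holds only for objects uploaded in a single non-composite request; for composite objects the comparison simply fails and the file is re-uploaded, which is safe but slower. A self-contained sketch of the same predicate, reusing this file's _get_local_md5() (the helper name and its reach into _connect_to_bucket() are for illustration only):

    def _would_skip_upload(gs_utils_obj, bucket, path, local_path):
      """True iff upload_file(..., only_if_modified=True) would skip (sketch)."""
      b = gs_utils_obj._connect_to_bucket(bucket_name=bucket)
      key = b.get_key(key_name=path)
      # ETag of a simple (non-composite) object is the quoted MD5 hex digest.
      return bool(key) and key.etag == '"%s"' % _get_local_md5(path=local_path)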
191 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, | 226 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, |
192 predefined_acl=None, fine_grained_acl_list=None): | 227 predefined_acl=None, fine_grained_acl_list=None): |
193 """Recursively upload contents of a local directory to Google Storage. | 228 """Recursively upload contents of a local directory to Google Storage. |
194 | 229 |
195 params: | 230 params: |
196 source_dir: full path (local-OS-style) on local disk of directory to copy | 231 source_dir: full path (local-OS-style) on local disk of directory to copy |
197 contents of | 232 contents of |
198 dest_bucket: GCS bucket to copy the files into | 233 dest_bucket: GCS bucket to copy the files into |
(...skipping 31 matching lines...)
230 else: | 265 else: |
231 remote_path = filename | 266 remote_path = filename |
232 | 267 |
233 if os.path.isdir(local_path): | 268 if os.path.isdir(local_path): |
234 self.upload_dir_contents( # recurse | 269 self.upload_dir_contents( # recurse |
235 source_dir=local_path, dest_bucket=dest_bucket, | 270 source_dir=local_path, dest_bucket=dest_bucket, |
236 dest_dir=remote_path, | 271 dest_dir=remote_path, |
237 predefined_acl=predefined_acl, | 272 predefined_acl=predefined_acl, |
238 fine_grained_acl_list=fine_grained_acl_list) | 273 fine_grained_acl_list=fine_grained_acl_list) |
239 else: | 274 else: |
240 item = Key(b) | 275 key = Key(b) |
241 item.key = remote_path | 276 key.name = remote_path |
242 try: | 277 try: |
243 item.set_contents_from_filename( | 278 key.set_contents_from_filename( |
244 filename=local_path, policy=predefined_acl) | 279 filename=local_path, policy=predefined_acl) |
245 except BotoServerError, e: | 280 except BotoServerError, e: |
246 e.body = (repr(e.body) + | 281 e.body = (repr(e.body) + |
247 ' while uploading local_path=%s to bucket=%s, path=%s' % ( | 282 ' while uploading local_path=%s to bucket=%s, path=%s' % ( |
248 local_path, dest_bucket, remote_path)) | 283 local_path, dest_bucket, remote_path)) |
249 raise | 284 raise |
250 # TODO(epoger): This may be inefficient, because it calls | 285 # TODO(epoger): This may be inefficient, because it calls |
251 # _connect_to_bucket() for every file. Depending on how expensive that | 286 # _connect_to_bucket() for every file. Depending on how expensive that |
252 # call is, we may want to optimize this. | 287 # call is, we may want to optimize this. |
253 for (id_type, id_value, permission) in fine_grained_acl_list or []: | 288 for (id_type, id_value, permission) in fine_grained_acl_list or []: |
254 self.set_acl( | 289 self.set_acl( |
255 bucket=dest_bucket, path=remote_path, | 290 bucket=dest_bucket, path=remote_path, |
256 id_type=id_type, id_value=id_value, permission=permission) | 291 id_type=id_type, id_value=id_value, permission=permission) |
257 | 292 |
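Review note: for reference, a call to the method above might look like the following. The raw strings in fine_grained_acl_list are assumptions; the real ID_TYPE_*/PERMISSION_* constants live in a skipped hunk of this file.

    gs.upload_dir_contents(
        source_dir=os.path.join('out', 'results'),   # local-OS-style
        dest_bucket='chromium-skia-gm',              # example bucket
        dest_dir='uploads/run-1',                    # Posix-style
        predefined_acl=None,                         # inherit bucket default ACL
        fine_grained_acl_list=[
            # (id_type, id_value, permission) tuples, per the docstring.
            ('group-by-domain', 'example.com', 'READ'),
        ])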
258 def download_file(self, source_bucket, source_path, dest_path, | 293 def download_file(self, source_bucket, source_path, dest_path, |
259 create_subdirs_if_needed=False): | 294 create_subdirs_if_needed=False): |
260 """Downloads a single file from Google Cloud Storage to local disk. | 295 """Downloads a single file from Google Cloud Storage to local disk. |
261 | 296 |
262 Args: | 297 Args: |
263 source_bucket: GCS bucket to download the file from | 298 source_bucket: GCS bucket to download the file from |
264 source_path: full path (Posix-style) within that bucket | 299 source_path: full path (Posix-style) within that bucket |
265 dest_path: full path (local-OS-style) on local disk to copy the file to | 300 dest_path: full path (local-OS-style) on local disk to copy the file to |
266 create_subdirs_if_needed: boolean; whether to create subdirectories as | 301 create_subdirs_if_needed: boolean; whether to create subdirectories as |
267 needed to create dest_path | 302 needed to create dest_path |
268 """ | 303 """ |
269 b = self._connect_to_bucket(bucket_name=source_bucket) | 304 b = self._connect_to_bucket(bucket_name=source_bucket) |
270 item = Key(b) | 305 key = Key(b) |
271 item.key = source_path | 306 key.name = source_path |
272 if create_subdirs_if_needed: | 307 if create_subdirs_if_needed: |
273 _makedirs_if_needed(os.path.dirname(dest_path)) | 308 _makedirs_if_needed(os.path.dirname(dest_path)) |
274 with open(dest_path, 'w') as f: | 309 with open(dest_path, 'w') as f: |
275 try: | 310 try: |
276 item.get_contents_to_file(fp=f) | 311 key.get_contents_to_file(fp=f) |
277 except BotoServerError, e: | 312 except BotoServerError, e: |
278 e.body = (repr(e.body) + | 313 e.body = (repr(e.body) + |
279 ' while downloading bucket=%s, path=%s to local_path=%s' % ( | 314 ' while downloading bucket=%s, path=%s to local_path=%s' % ( |
280 source_bucket, source_path, dest_path)) | 315 source_bucket, source_path, dest_path)) |
281 raise | 316 raise |
282 | 317 |
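Review note: a usage sketch for the method above, showing the Posix-style remote path next to the local-OS-style destination; all values are placeholders:

    gs.download_file(
        source_bucket='chromium-skia-gm',                # example bucket
        source_path='uploads/run-1/results.json',        # Posix-style
        dest_path=os.path.join('out', 'results.json'),   # local-OS-style
        create_subdirs_if_needed=True)                   # creates out/ if absent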
283 def download_dir_contents(self, source_bucket, source_dir, dest_dir): | 318 def download_dir_contents(self, source_bucket, source_dir, dest_dir): |
284 """Recursively download contents of a Google Storage directory to local disk | 319 """Recursively download contents of a Google Storage directory to local disk |
285 | 320 |
286 params: | 321 params: |
287 source_bucket: GCS bucket to copy the files from | 322 source_bucket: GCS bucket to copy the files from |
288 source_dir: full path (Posix-style) within that bucket; read the files | 323 source_dir: full path (Posix-style) within that bucket; read the files |
289 from this directory | 324 from this directory |
290 dest_dir: full path (local-OS-style) on local disk of directory to copy | 325 dest_dir: full path (local-OS-style) on local disk of directory to copy |
291 the files into | 326 the files into |
292 | 327 |
293 The copy operates as a "merge with overwrite": any files in source_dir will | 328 The copy operates as a "merge with overwrite": any files in source_dir will |
294 be "overlaid" on top of the existing content in dest_dir. Existing files | 329 be "overlaid" on top of the existing content in dest_dir. Existing files |
295 with the same names will be overwritten. | 330 with the same names will be overwritten. |
296 | 331 |
297 TODO(epoger): Download multiple files simultaneously to reduce latency. | 332 TODO(epoger): Download multiple files simultaneously to reduce latency. |
298 """ | 333 """ |
299 _makedirs_if_needed(dest_dir) | 334 _makedirs_if_needed(dest_dir) |
300 b = self._connect_to_bucket(bucket_name=source_bucket) | 335 b = self._connect_to_bucket(bucket_name=source_bucket) |
301 (dirs, files) = self.list_bucket_contents( | 336 (dirs, files) = self.list_bucket_contents( |
302 bucket=source_bucket, subdir=source_dir) | 337 bucket=source_bucket, subdir=source_dir) |
303 | 338 |
304 for filename in files: | 339 for filename in files: |
305 item = Key(b) | 340 key = Key(b) |
306 item.key = posixpath.join(source_dir, filename) | 341 key.name = posixpath.join(source_dir, filename) |
307 dest_path = os.path.join(dest_dir, filename) | 342 dest_path = os.path.join(dest_dir, filename) |
308 with open(dest_path, 'w') as f: | 343 with open(dest_path, 'w') as f: |
309 try: | 344 try: |
310 item.get_contents_to_file(fp=f) | 345 key.get_contents_to_file(fp=f) |
311 except BotoServerError, e: | 346 except BotoServerError, e: |
312 e.body = (repr(e.body) + | 347 e.body = (repr(e.body) + |
313 ' while downloading bucket=%s, path=%s to local_path=%s' % ( | 348 ' while downloading bucket=%s, path=%s to local_path=%s' % ( |
314 source_bucket, item.key, dest_path)) | 349 source_bucket, key.name, dest_path)) |
315 raise | 350 raise |
316 | 351 |
317 for dirname in dirs: | 352 for dirname in dirs: |
318 self.download_dir_contents( # recurse | 353 self.download_dir_contents( # recurse |
319 source_bucket=source_bucket, | 354 source_bucket=source_bucket, |
320 source_dir=posixpath.join(source_dir, dirname), | 355 source_dir=posixpath.join(source_dir, dirname), |
321 dest_dir=os.path.join(dest_dir, dirname)) | 356 dest_dir=os.path.join(dest_dir, dirname)) |
322 | 357 |
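Review note: one way the parallel-download TODO might be approached, sketched with the stdlib thread pool. Untested here, and sharing one bucket connection across threads is an assumption about boto's thread-safety that would need checking:

    from multiprocessing.dummy import Pool   # stdlib thread pool

    def _fetch(pair):
      """Download one (Key, local_path) pair."""
      key, dest_path = pair
      with open(dest_path, 'w') as f:
        key.get_contents_to_file(fp=f)

    # The sequential per-file loop in download_dir_contents could then become:
    pairs = []
    for filename in files:
      key = Key(b)
      key.name = posixpath.join(source_dir, filename)
      pairs.append((key, os.path.join(dest_dir, filename)))
    Pool(processes=8).map(_fetch, pairs)   # 8 is an arbitrary concurrency level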
323 def get_acl(self, bucket, path, id_type, id_value): | 358 def get_acl(self, bucket, path, id_type, id_value): |
324 """Retrieve partial access permissions on a single file in Google Storage. | 359 """Retrieve partial access permissions on a single file in Google Storage. |
(...skipping 99 matching lines...)
424 bucket: name of the Google Storage bucket | 459 bucket: name of the Google Storage bucket |
425 subdir: directory within the bucket to list, or None for root directory | 460 subdir: directory within the bucket to list, or None for root directory |
426 """ | 461 """ |
427 # The GS command relies on the prefix (if any) ending with a slash. | 462 # The GS command relies on the prefix (if any) ending with a slash. |
428 prefix = subdir or '' | 463 prefix = subdir or '' |
429 if prefix and not prefix.endswith('/'): | 464 if prefix and not prefix.endswith('/'): |
430 prefix += '/' | 465 prefix += '/' |
431 prefix_length = len(prefix) if prefix else 0 | 466 prefix_length = len(prefix) if prefix else 0 |
432 | 467 |
433 b = self._connect_to_bucket(bucket_name=bucket) | 468 b = self._connect_to_bucket(bucket_name=bucket) |
434 lister = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/') | 469 items = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/') |
435 dirs = [] | 470 dirs = [] |
436 files = [] | 471 files = [] |
437 for item in lister: | 472 for item in items: |
438 t = type(item) | 473 t = type(item) |
439 if t is Key: | 474 if t is Key: |
440 files.append(item.key[prefix_length:]) | 475 files.append(item.name[prefix_length:]) |
441 elif t is Prefix: | 476 elif t is Prefix: |
442 dirs.append(item.name[prefix_length:-1]) | 477 dirs.append(item.name[prefix_length:-1]) |
443 return (dirs, files) | 478 return (dirs, files) |
444 | 479 |
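Review note: to make the (dirs, files) contract above concrete, with objects gm/a.png and gm/sub/b.png in the bucket (illustrative values):

    dirs, files = gs.list_bucket_contents(bucket='chromium-skia-gm', subdir='gm')
    # dirs  == ['sub']     (trailing '/' and the 'gm/' prefix stripped)
    # files == ['a.png']   (the 'gm/' prefix stripped)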
445 def _connect_to_bucket(self, bucket_name): | 480 def _connect_to_bucket(self, bucket_name): |
446 """Returns a Bucket object we can use to access a particular bucket in GS. | 481 """Returns a Bucket object we can use to access a particular bucket in GS. |
447 | 482 |
448 Params: | 483 Params: |
449 bucket_name: name of the bucket (e.g., 'chromium-skia-gm') | 484 bucket_name: name of the bucket (e.g., 'chromium-skia-gm') |
450 """ | 485 """ |
(...skipping 42 matching lines...)
493 exist yet. | 528 exist yet. |
494 | 529 |
495 Args: | 530 Args: |
496 path: full path of directory to create | 531 path: full path of directory to create |
497 """ | 532 """ |
498 try: | 533 try: |
499 os.makedirs(path) | 534 os.makedirs(path) |
500 except OSError as e: | 535 except OSError as e: |
501 if e.errno != errno.EEXIST: | 536 if e.errno != errno.EEXIST: |
502 raise | 537 raise |
| 538 |
| 539 |
| 540 def _get_local_md5(path): |
| 541 """Returns the MD5 hash of a file on local disk.""" |
| 542 hasher = hashlib.md5() |
| 543 with open(path, 'rb') as f: |
| 544 while True: |
| 545 data = f.read(64*1024) |
| 546 if not data: |
| 547 return hasher.hexdigest() |
| 548 hasher.update(data) |
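Review note: end to end, the pieces in this CL compose roughly as below. GSUtils is an assumed class name (the class statement and its constructor sit in a skipped hunk), and the bucket/paths are placeholders:

    gs = GSUtils()   # assumed name; auth/ctor details are in a skipped hunk
    gs.upload_file(source_path=os.path.join('out', 'results.json'),
                   dest_bucket='chromium-skia-gm',
                   dest_path='runs/1/results.json',
                   only_if_modified=True)   # re-running prints the skip message
    print gs.get_last_modified_time(bucket='chromium-skia-gm',
                                    path='runs/1/results.json')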