| OLD | NEW |
| 1 #!/usr/bin/python | 1 #!/usr/bin/python |
| 2 | 2 |
| 3 # pylint: disable=C0301 | 3 # pylint: disable=C0301 |
| 4 """ | 4 """ |
| 5 Copyright 2014 Google Inc. | 5 Copyright 2014 Google Inc. |
| 6 | 6 |
| 7 Use of this source code is governed by a BSD-style license that can be | 7 Use of this source code is governed by a BSD-style license that can be |
| 8 found in the LICENSE file. | 8 found in the LICENSE file. |
| 9 | 9 |
| 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper | 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper |
| (...skipping 86 matching lines...) |
| 97 """Types of identifiers we can use to set "fine-grained" ACLs.""" | 97 """Types of identifiers we can use to set "fine-grained" ACLs.""" |
| 98 GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN | 98 GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN |
| 99 GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL | 99 GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL |
| 100 GROUP_BY_ID = acl.GROUP_BY_ID | 100 GROUP_BY_ID = acl.GROUP_BY_ID |
| 101 USER_BY_EMAIL = acl.USER_BY_EMAIL | 101 USER_BY_EMAIL = acl.USER_BY_EMAIL |
| 102 USER_BY_ID = acl.USER_BY_ID | 102 USER_BY_ID = acl.USER_BY_ID |
| 103 | 103 |
| 104 class UploadIf: | 104 class UploadIf: |
| 105 """Cases in which we will upload a file. | 105 """Cases in which we will upload a file. |
| 106 | 106 |
| 107 Beware of performance tradeoffs. E.g., if the file is small, the extra | 107 Beware of performance tradeoffs. E.g., if you are uploading just one small |
| 108 round trip to check for file existence and/or checksum may take longer than | 108 file, the extra round trip to check for file existence and/or checksum may |
| 109 just uploading the file. | 109 take longer than just uploading the file. |
| 110 See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up | 110 See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up |
| 111 checks for existing files within a single remote directory') | 111 checks for existing files within a single remote directory') |
| 112 """ | 112 """ |
| 113 ALWAYS = 1 # always upload the file | 113 ALWAYS = 1 # always upload the file |
| 114 IF_NEW = 2 # if there is an existing file with the same name, | 114 IF_NEW = 2 # if there is an existing file with the same name, |
| 115 # leave it alone | 115 # leave it alone |
| 116 IF_MODIFIED = 3 # if there is an existing file with the same name and | 116 IF_MODIFIED = 3 # if there is an existing file with the same name and |
| 117 # contents, leave it alone | 117 # contents, leave it alone |
| 118 | 118 |
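As a reference point for reviewers, here is a minimal sketch of how a caller might pick one of the UploadIf values above. This is an assumption-laden illustration, not code from this change: the module is assumed importable as gs_utils, the boto file path and bucket/object names are hypothetical, and upload_file's upload_if parameter is inferred from the call site later in this diff.

    import gs_utils

    # Hypothetical credentials file and destination; sketch only.
    gs = gs_utils.GSUtils(boto_file_path='/home/user/.boto')
    # IF_MODIFIED skips the upload when an identical file already exists,
    # at the cost of an extra round trip (see the tradeoff note above).
    gs.upload_file(
        source_path='/tmp/output.json',
        dest_bucket='example-bucket',
        dest_path='results/output.json',
        upload_if=gs_utils.GSUtils.UploadIf.IF_MODIFIED)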
| 119 def __init__(self, boto_file_path=None): | 119 def __init__(self, boto_file_path=None): |
| (...skipping 117 matching lines...) |
| 237 except BotoServerError, e: | 237 except BotoServerError, e: |
| 238 e.body = (repr(e.body) + | 238 e.body = (repr(e.body) + |
| 239 ' while uploading source_path=%s to bucket=%s, path=%s' % ( | 239 ' while uploading source_path=%s to bucket=%s, path=%s' % ( |
| 240 source_path, b.name, key.name)) | 240 source_path, b.name, key.name)) |
| 241 raise | 241 raise |
| 242 for (id_type, id_value, permission) in fine_grained_acl_list or []: | 242 for (id_type, id_value, permission) in fine_grained_acl_list or []: |
| 243 self.set_acl( | 243 self.set_acl( |
| 244 bucket=b, path=key.name, | 244 bucket=b, path=key.name, |
| 245 id_type=id_type, id_value=id_value, permission=permission) | 245 id_type=id_type, id_value=id_value, permission=permission) |
| 246 | 246 |
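For context, the fine_grained_acl_list consumed by the loop above holds (id_type, id_value, permission) tuples. A hedged sketch of building one, assuming the IdType values defined earlier in this file and a Permission collection this module is assumed to define alongside IdType (it does not appear in this diff):

    # Hypothetical ACLs: read access for one user and one whole domain.
    acl_list = [
        (gs_utils.GSUtils.IdType.USER_BY_EMAIL, 'reviewer@example.com',
         gs_utils.GSUtils.Permission.READ),
        (gs_utils.GSUtils.IdType.GROUP_BY_DOMAIN, 'example.com',
         gs_utils.GSUtils.Permission.READ),
    ]
    gs.upload_file(
        source_path='/tmp/output.json', dest_bucket='example-bucket',
        dest_path='results/output.json', fine_grained_acl_list=acl_list)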
| 247 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, **kwargs): | 247 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, |
| 248 upload_if=UploadIf.ALWAYS, **kwargs): |
| 248 """Recursively upload contents of a local directory to Google Storage. | 249 """Recursively upload contents of a local directory to Google Storage. |
| 249 | 250 |
| 250 params: | 251 params: |
| 251 source_dir: full path (local-OS-style) on local disk of directory to copy | 252 source_dir: full path (local-OS-style) on local disk of directory to copy |
| 252 contents of | 253 contents of |
| 253 dest_bucket: GS bucket to copy the files into | 254 dest_bucket: GS bucket to copy the files into |
| 254 dest_dir: full path (Posix-style) within that bucket; write the files into | 255 dest_dir: full path (Posix-style) within that bucket; write the files into |
| 255 this directory. If None, write into the root directory of the bucket. | 256 this directory. If None, write into the root directory of the bucket. |
| 257 upload_if: one of the UploadIf values, describing in which cases we should |
| 258 upload the file |
| 256 kwargs: any additional keyword arguments "inherited" from upload_file() | 259 kwargs: any additional keyword arguments "inherited" from upload_file() |
| 257 | 260 |
| 258 The copy operates as a merge: any files in source_dir will be "overlaid" on | 261 The copy operates as a merge: any files in source_dir will be "overlaid" on |
| 259 top of the existing content in dest_dir. Existing files with the same names | 262 top of the existing content in dest_dir. Existing files with the same names |
| 260 may or may not be overwritten, depending on the value of the upload_if kwarg | 263 may or may not be overwritten, depending on the value of upload_if. |
| 261 inherited from upload_file(). | |
| 262 | 264 |
| 263 TODO(epoger): Upload multiple files simultaneously to reduce latency. | 265 TODO(epoger): Upload multiple files simultaneously to reduce latency. |
| 264 | |
| 265 TODO(epoger): When upload_if==IF_NEW, batch up checks for existing files | |
| 266 within a single remote directory. See http://skbug.com/2778 | |
| 267 """ | 266 """ |
| 268 b = self._connect_to_bucket(bucket=dest_bucket) | 267 b = self._connect_to_bucket(bucket=dest_bucket) |
| 269 for filename in sorted(os.listdir(source_dir)): | 268 if not dest_dir: |
| 270 local_path = os.path.join(source_dir, filename) | 269 dest_dir = '' |
| 271 if dest_dir: | 270 |
| 272 remote_path = posixpath.join(dest_dir, filename) | 271 # Create a set of all files within source_dir. |
| 272 source_fileset = set() |
| 273 prefix_length = len(source_dir)+1 |
| 274 for dirpath, _, filenames in os.walk(source_dir): |
| 275 relative_dirpath = dirpath[prefix_length:] |
| 276 for filename in filenames: |
| 277 source_fileset.add(os.path.join(relative_dirpath, filename)) |
| 278 |
| 279 # If we are only uploading files conditionally, remove any unnecessary |
| 280 # files from source_fileset. |
| 281 if upload_if == self.UploadIf.ALWAYS: |
| 282 pass # there are no shortcuts... upload them all |
| 283 else: |
| 284 # Create a mapping of filename to Key for existing files within dest_dir |
| 285 existing_dest_filemap = {} |
| 286 prefix = dest_dir |
| 287 if prefix and not prefix.endswith('/'): |
| 288 prefix += '/' |
| 289 prefix_length = len(prefix) |
| 290 items = BucketListResultSet(bucket=b, prefix=prefix) |
| 291 for item in items: |
| 292 if type(item) is Key: |
| 293 existing_dest_filemap[item.name[prefix_length:]] = item |
| 294 |
| 295 # Now, depending on upload_if, trim files we should skip uploading. |
| 296 files_in_common = source_fileset.intersection( |
| 297 existing_dest_filemap.keys()) |
| 298 if upload_if == self.UploadIf.IF_NEW: |
| 299 source_fileset -= files_in_common |
| 300 elif upload_if == self.UploadIf.IF_MODIFIED: |
| 301 for rel_path in files_in_common: |
| 302 local_md5 = '"%s"' % _get_local_md5(path=os.path.join( |
| 303 source_dir, rel_path)) |
| 304 key = existing_dest_filemap[rel_path] |
| 305 if local_md5 == key.etag: |
| 306 source_fileset.remove(rel_path) |
| 273 else: | 307 else: |
| 274 remote_path = filename | 308 raise Exception('unknown value of upload_if: %s' % upload_if) |
| 275 | 309 |
| 276 if os.path.isdir(local_path): | 310 # Upload any files still in source_fileset. |
| 277 self.upload_dir_contents( # recurse | 311 for rel_path in sorted(source_fileset): |
| 278 source_dir=local_path, dest_bucket=b, dest_dir=remote_path, | 312 self.upload_file( |
| 279 **kwargs) | 313 source_path=os.path.join(source_dir, rel_path), |
| 280 else: | 314 dest_bucket=b, |
| 281 self.upload_file( | 315 dest_path=posixpath.join(dest_dir, rel_path), |
| 282 source_path=local_path, dest_bucket=b, dest_path=remote_path, | 316 upload_if=self.UploadIf.ALWAYS, |
| 283 **kwargs) | 317 **kwargs) |
| 284 | 318 |
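To illustrate the new conditional behavior end to end, a sketch of a caller of the rewritten upload_dir_contents (directory, bucket, and path names are hypothetical):

    # With IF_NEW, files already present under dest_dir are left alone;
    # with IF_MODIFIED, files whose local MD5 matches the remote etag
    # are pruned from source_fileset before any upload starts.
    gs.upload_dir_contents(
        source_dir='/tmp/render-results',
        dest_bucket='example-bucket',
        dest_dir='render-results/build-123',
        upload_if=gs_utils.GSUtils.UploadIf.IF_MODIFIED)

Note that the IF_MODIFIED branch above wraps the local digest in double quotes before comparing against key.etag, since boto reports the etag as a quoted hex string.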
| 285 def download_file(self, source_bucket, source_path, dest_path, | 319 def download_file(self, source_bucket, source_path, dest_path, |
| 286 create_subdirs_if_needed=False): | 320 create_subdirs_if_needed=False): |
| 287 """Downloads a single file from Google Cloud Storage to local disk. | 321 """Downloads a single file from Google Cloud Storage to local disk. |
| 288 | 322 |
| 289 Args: | 323 Args: |
| 290 source_bucket: GS bucket to download the file from | 324 source_bucket: GS bucket to download the file from |
| 291 source_path: full path (Posix-style) within that bucket | 325 source_path: full path (Posix-style) within that bucket |
| 292 dest_path: full path (local-OS-style) on local disk to copy the file to | 326 dest_path: full path (local-OS-style) on local disk to copy the file to |
| 293 create_subdirs_if_needed: boolean; whether to create subdirectories as | 327 create_subdirs_if_needed: boolean; whether to create subdirectories as |
| (...skipping 240 matching lines...) |
| 534 | 568 |
| 535 def _get_local_md5(path): | 569 def _get_local_md5(path): |
| 536 """Returns the MD5 hash of a file on local disk.""" | 570 """Returns the MD5 hash of a file on local disk.""" |
| 537 hasher = hashlib.md5() | 571 hasher = hashlib.md5() |
| 538 with open(path, 'rb') as f: | 572 with open(path, 'rb') as f: |
| 539 while True: | 573 while True: |
| 540 data = f.read(64*1024) | 574 data = f.read(64*1024) |
| 541 if not data: | 575 if not data: |
| 542 return hasher.hexdigest() | 576 return hasher.hexdigest() |
| 543 hasher.update(data) | 577 hasher.update(data) |
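As a usage note, this helper is what makes the IF_MODIFIED comparison possible. A hedged sketch of pairing it with a boto Key, where bucket is assumed to be an already-connected boto bucket and the file and object names are hypothetical:

    # boto reports etag as a quoted hex string, so strip the quotes
    # (or quote the local digest, as upload_dir_contents does above).
    key = bucket.get_key('results/output.json')
    local_md5 = _get_local_md5(path='/tmp/output.json')
    if key is not None and key.etag.strip('"') == local_md5:
        print 'file is unchanged; no upload needed'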