| OLD | NEW |
| 1 #!/usr/bin/python | 1 #!/usr/bin/python |
| 2 | 2 |
| 3 # pylint: disable=C0301 | 3 # pylint: disable=C0301 |
| 4 """ | 4 """ |
| 5 Copyright 2014 Google Inc. | 5 Copyright 2014 Google Inc. |
| 6 | 6 |
| 7 Use of this source code is governed by a BSD-style license that can be | 7 Use of this source code is governed by a BSD-style license that can be |
| 8 found in the LICENSE file. | 8 found in the LICENSE file. |
| 9 | 9 |
| 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper | 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper |
| (...skipping 142 matching lines...) |
| 153 bucket: GS bucket to delete a file from | 153 bucket: GS bucket to delete a file from |
| 154 path: full path (Posix-style) of the file within the bucket to delete | 154 path: full path (Posix-style) of the file within the bucket to delete |
| 155 """ | 155 """ |
| 156 b = self._connect_to_bucket(bucket=bucket) | 156 b = self._connect_to_bucket(bucket=bucket) |
| 157 key = Key(b) | 157 key = Key(b) |
| 158 key.name = path | 158 key.name = path |
| 159 try: | 159 try: |
| 160 key.delete() | 160 key.delete() |
| 161 except BotoServerError, e: | 161 except BotoServerError, e: |
| 162 e.body = (repr(e.body) + | 162 e.body = (repr(e.body) + |
| 163 ' while deleting bucket=%s, path=%s' % (b.name, path)) | 163 ' while deleting gs://%s/%s' % (b.name, path)) |
| 164 raise | 164 raise |
| 165 | 165 |
| 166 def get_last_modified_time(self, bucket, path): | 166 def get_last_modified_time(self, bucket, path): |
| 167 """Gets the timestamp of when this file was last modified. | 167 """Gets the timestamp of when this file was last modified. |
| 168 | 168 |
| 169 Params: | 169 Params: |
| 170 bucket: GS bucket in which to look for the file | 170 bucket: GS bucket in which to look for the file |
| 171 path: full path (Posix-style) of the file within the bucket to check | 171 path: full path (Posix-style) of the file within the bucket to check |
| 172 | 172 |
| 173 Returns the last modified time, as a freeform string. If the file was not | 173 Returns the last modified time, as a freeform string. If the file was not |
| 174 found, returns None. | 174 found, returns None. |
| 175 """ | 175 """ |
| 176 b = self._connect_to_bucket(bucket=bucket) | 176 b = self._connect_to_bucket(bucket=bucket) |
| 177 try: | 177 try: |
| 178 key = b.get_key(key_name=path) | 178 key = b.get_key(key_name=path) |
| 179 if not key: | 179 if not key: |
| 180 return None | 180 return None |
| 181 return key.last_modified | 181 return key.last_modified |
| 182 except BotoServerError, e: | 182 except BotoServerError, e: |
| 183 e.body = (repr(e.body) + | 183 e.body = (repr(e.body) + |
| 184 ' while getting attributes of bucket=%s, path=%s' % ( | 184 ' while getting attributes of gs://%s/%s' % (b.name, path)) |
| 185 b.name, path)) | |
| 186 raise | 185 raise |
| 187 | 186 |
| 188 def upload_file(self, source_path, dest_bucket, dest_path, | 187 def upload_file(self, source_path, dest_bucket, dest_path, |
| 189 upload_if=UploadIf.ALWAYS, | 188 upload_if=UploadIf.ALWAYS, |
| 190 predefined_acl=None, | 189 predefined_acl=None, |
| 191 fine_grained_acl_list=None): | 190 fine_grained_acl_list=None): |
| 192 """Upload contents of a local file to Google Storage. | 191 """Upload contents of a local file to Google Storage. |
| 193 | 192 |
| 194 params: | 193 params: |
| 195 source_path: full path (local-OS-style) on local disk to read from | 194 source_path: full path (local-OS-style) on local disk to read from |
| 196 dest_bucket: GS bucket to copy the file to | 195 dest_bucket: GS bucket to copy the file to |
| 197 dest_path: full path (Posix-style) within that bucket | 196 dest_path: full path (Posix-style) within that bucket |
| 198 upload_if: one of the UploadIf values, describing in which cases we should | 197 upload_if: one of the UploadIf values, describing in which cases we should |
| 199 upload the file | 198 upload the file |
| 200 predefined_acl: which predefined ACL to apply to the file on Google | 199 predefined_acl: which predefined ACL to apply to the file on Google |
| 201 Storage; must be one of the PredefinedACL values defined above. | 200 Storage; must be one of the PredefinedACL values defined above. |
| 202 If None, inherits dest_bucket's default object ACL. | 201 If None, inherits dest_bucket's default object ACL. |
| 203 fine_grained_acl_list: list of (id_type, id_value, permission) tuples | 202 fine_grained_acl_list: list of (id_type, id_value, permission) tuples |
| 204 to apply to the uploaded file (on top of the predefined_acl), | 203 to apply to the uploaded file (on top of the predefined_acl), |
| 205 or None if predefined_acl is sufficient | 204 or None if predefined_acl is sufficient |
| 206 | 205 |
| 207 TODO(epoger): Consider adding a do_compress parameter that would compress | 206 TODO(epoger): Consider adding a do_compress parameter that would compress |
| 208 the file using gzip before upload, and add a "Content-Encoding:gzip" header | 207 the file using gzip before upload, and add a "Content-Encoding:gzip" header |
| 209 so that HTTP downloads of the file would be unzipped automatically. | 208 so that HTTP downloads of the file would be unzipped automatically. |
| 210 See https://developers.google.com/storage/docs/gsutil/addlhelp/ | 209 See https://developers.google.com/storage/docs/gsutil/addlhelp/ |
| 211 WorkingWithObjectMetadata#content-encoding | 210 WorkingWithObjectMetadata#content-encoding |
| 212 """ | 211 """ |
| 213 b = self._connect_to_bucket(bucket=dest_bucket) | 212 b = self._connect_to_bucket(bucket=dest_bucket) |
| 213 local_md5 = None # filled in lazily |
| 214 | 214 |
| 215 if upload_if == self.UploadIf.IF_NEW: | 215 if upload_if == self.UploadIf.IF_NEW: |
| 216 old_key = b.get_key(key_name=dest_path) | 216 old_key = b.get_key(key_name=dest_path) |
| 217 if old_key: | 217 if old_key: |
| 218 print 'Skipping upload of existing file gs://%s/%s' % ( | 218 print 'Skipping upload of existing file gs://%s/%s' % ( |
| 219 b.name, dest_path) | 219 b.name, dest_path) |
| 220 return | 220 return |
| 221 elif upload_if == self.UploadIf.IF_MODIFIED: | 221 elif upload_if == self.UploadIf.IF_MODIFIED: |
| 222 old_key = b.get_key(key_name=dest_path) | 222 old_key = b.get_key(key_name=dest_path) |
| 223 if old_key: | 223 if old_key: |
| 224 local_md5 = '"%s"' % _get_local_md5(path=source_path) | 224 if not local_md5: |
| 225 if local_md5 == old_key.etag: | 225 local_md5 = _get_local_md5(path=source_path) |
| 226 if ('"%s"' % local_md5) == old_key.etag: |
| 226 print 'Skipping upload of unmodified file gs://%s/%s : %s' % ( | 227 print 'Skipping upload of unmodified file gs://%s/%s : %s' % ( |
| 227 b.name, dest_path, local_md5) | 228 b.name, dest_path, local_md5) |
| 228 return | 229 return |
| 229 elif upload_if != self.UploadIf.ALWAYS: | 230 elif upload_if != self.UploadIf.ALWAYS: |
| 230 raise Exception('unknown value of upload_if: %s' % upload_if) | 231 raise Exception('unknown value of upload_if: %s' % upload_if) |
| 231 | 232 |
| 232 key = Key(b) | 233 # Upload the file using a temporary name at first, in case the transfer |
| 233 key.name = dest_path | 234 # is interrupted partway through. |
| 235 if not local_md5: |
| 236 local_md5 = _get_local_md5(path=source_path) |
| 237 initial_key = Key(b) |
| 238 initial_key.name = dest_path + '-uploading-' + local_md5 |
| 234 try: | 239 try: |
| 235 key.set_contents_from_filename(filename=source_path, | 240 initial_key.set_contents_from_filename(filename=source_path, |
| 236 policy=predefined_acl) | 241 policy=predefined_acl) |
| 237 except BotoServerError, e: | 242 except BotoServerError, e: |
| 238 e.body = (repr(e.body) + | 243 e.body = (repr(e.body) + |
| 239 ' while uploading source_path=%s to bucket=%s, path=%s' % ( | 244 ' while uploading source_path=%s to gs://%s/%s' % ( |
| 240 source_path, b.name, key.name)) | 245 source_path, b.name, initial_key.name)) |
| 241 raise | 246 raise |
| 247 |
| 248 # Verify that the file contents were uploaded successfully. |
| 249 # |
| 250 # TODO(epoger): Check whether the boto library or XML API already do this... |
| 251 # if so, we may be duplicating effort here, and maybe we don't need to do |
| 252 # the whole "upload using temporary filename, then rename" thing. |
| 253 # |
| 254 # TODO(epoger): Confirm that the etag is set on the server side... |
| 255 # otherwise, we may just be validating another MD5 hash that was generated |
| 256 # on the client side before the file was uploaded! |
| 257 validate_key = b.get_key(key_name=initial_key.name) |
| 258 if validate_key.etag != ('"%s"' % local_md5): |
| 259 raise Exception('found wrong MD5 after uploading gs://%s/%s' % ( |
| 260 b.name, validate_key.name)) |
| 261 |
| 262 # Rename the file to its real name. |
| 263 # |
| 264 # TODO(epoger): I don't know how long this takes. I wish we could rename |
| 265 # the key instead, but AFAICT you can't do that. |
| 266 # Perhaps we could use Key.compose() to create a composite object pointing |
| 267 # at the original key? |
| 268 # See https://developers.google.com/storage/docs/composite-objects |
| 269 final_key = b.copy_key( |
| 270 new_key_name=dest_path, src_key_name=initial_key.name, |
| 271 src_bucket_name=b.name, preserve_acl=False) |
| 272 initial_key.delete() |
| 273 |
| 274 # Set ACLs on the file. |
| 275 # We do this *after* copy_key(), because copy_key's preserve_acl |
| 276 # functionality would incur a performance hit. |
| 242 for (id_type, id_value, permission) in fine_grained_acl_list or []: | 277 for (id_type, id_value, permission) in fine_grained_acl_list or []: |
| 243 self.set_acl( | 278 self.set_acl( |
| 244 bucket=b, path=key.name, | 279 bucket=b, path=final_key.name, |
| 245 id_type=id_type, id_value=id_value, permission=permission) | 280 id_type=id_type, id_value=id_value, permission=permission) |
| 246 | 281 |
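For readers skimming the diff: the heart of this change is a stage-verify-rename sequence in upload_file. Below is a minimal standalone sketch of that sequence, assuming an authenticated boto GS bucket `b` (as returned by _connect_to_bucket) and reusing the _get_local_md5() helper from the bottom of this file; the function name `stage_verify_rename` is hypothetical.

```python
from boto.gs.key import Key  # the module already imports Key for GS objects

def stage_verify_rename(b, source_path, dest_path):
  """Sketch of upload_file's new behavior: upload under a temporary name,
  verify the server-side MD5, then copy into place and clean up."""
  local_md5 = _get_local_md5(path=source_path)

  # Stage the bytes under a name no downstream reader is watching, so an
  # interrupted transfer can never leave a truncated dest_path behind.
  staging = Key(b)
  staging.name = dest_path + '-uploading-' + local_md5
  staging.set_contents_from_filename(filename=source_path)

  # For non-composite objects, GS reports the object's MD5 as a
  # double-quoted etag; compare it against the local digest.
  if b.get_key(key_name=staging.name).etag != ('"%s"' % local_md5):
    raise Exception('found wrong MD5 after uploading gs://%s/%s' % (
        b.name, staging.name))

  # There is no server-side rename, so "rename" means copy + delete.
  b.copy_key(new_key_name=dest_path, src_key_name=staging.name,
             src_bucket_name=b.name, preserve_acl=False)
  staging.delete()
```

Callers that only want to pay this cost when the file content actually changed can pass upload_if=UploadIf.IF_MODIFIED, which short-circuits on a matching etag before any bytes move.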
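The docstring's TODO about a do_compress parameter could plausibly take the shape below: gzip the payload in memory and attach a Content-Encoding header so that HTTP downloads decompress transparently. This is a sketch of one option, not part of the change; it reuses the module's Key import, and a production version would stream rather than read the whole file into memory.

```python
import gzip
import StringIO

def upload_gzipped(b, source_path, dest_path):
  """Hypothetical do_compress helper: gzip source_path in memory, then
  upload with a Content-Encoding header so HTTP clients auto-decompress."""
  buf = StringIO.StringIO()
  gz = gzip.GzipFile(fileobj=buf, mode='wb')
  with open(source_path, 'rb') as f:
    gz.write(f.read())  # fine for small files; stream in chunks otherwise
  gz.close()
  buf.seek(0)

  key = Key(b)
  key.name = dest_path
  key.set_contents_from_file(fp=buf, headers={'Content-Encoding': 'gzip'})
```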
| 247 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, | 282 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, |
| 248 upload_if=UploadIf.ALWAYS, **kwargs): | 283 upload_if=UploadIf.ALWAYS, **kwargs): |
| 249 """Recursively upload contents of a local directory to Google Storage. | 284 """Recursively upload contents of a local directory to Google Storage. |
| 250 | 285 |
| 251 params: | 286 params: |
| 252 source_dir: full path (local-OS-style) on local disk of directory to copy | 287 source_dir: full path (local-OS-style) on local disk of directory to copy |
| 253 contents of | 288 contents of |
| 254 dest_bucket: GS bucket to copy the files into | 289 dest_bucket: GS bucket to copy the files into |
| (...skipping 75 matching lines...) |
| 330 b = self._connect_to_bucket(bucket=source_bucket) | 365 b = self._connect_to_bucket(bucket=source_bucket) |
| 331 key = Key(b) | 366 key = Key(b) |
| 332 key.name = source_path | 367 key.name = source_path |
| 333 if create_subdirs_if_needed: | 368 if create_subdirs_if_needed: |
| 334 _makedirs_if_needed(os.path.dirname(dest_path)) | 369 _makedirs_if_needed(os.path.dirname(dest_path)) |
| 335 with open(dest_path, 'w') as f: | 370 with open(dest_path, 'w') as f: |
| 336 try: | 371 try: |
| 337 key.get_contents_to_file(fp=f) | 372 key.get_contents_to_file(fp=f) |
| 338 except BotoServerError, e: | 373 except BotoServerError, e: |
| 339 e.body = (repr(e.body) + | 374 e.body = (repr(e.body) + |
| 340 ' while downloading bucket=%s, path=%s to local_path=%s' % ( | 375 ' while downloading gs://%s/%s to local_path=%s' % ( |
| 341 b.name, source_path, dest_path)) | 376 b.name, source_path, dest_path)) |
| 342 raise | 377 raise |
| 343 | 378 |
| 344 def download_dir_contents(self, source_bucket, source_dir, dest_dir): | 379 def download_dir_contents(self, source_bucket, source_dir, dest_dir): |
| 345 """Recursively download contents of a Google Storage directory to local disk | 380 """Recursively download contents of a Google Storage directory to local disk |
| 346 | 381 |
| 347 params: | 382 params: |
| 348 source_bucket: GS bucket to copy the files from | 383 source_bucket: GS bucket to copy the files from |
| 349 source_dir: full path (Posix-style) within that bucket; read the files | 384 source_dir: full path (Posix-style) within that bucket; read the files |
| 350 from this directory | 385 from this directory |
| (...skipping 13 matching lines...) |
| 364 | 399 |
| 365 for filename in files: | 400 for filename in files: |
| 366 key = Key(b) | 401 key = Key(b) |
| 367 key.name = posixpath.join(source_dir, filename) | 402 key.name = posixpath.join(source_dir, filename) |
| 368 dest_path = os.path.join(dest_dir, filename) | 403 dest_path = os.path.join(dest_dir, filename) |
| 369 with open(dest_path, 'w') as f: | 404 with open(dest_path, 'w') as f: |
| 370 try: | 405 try: |
| 371 key.get_contents_to_file(fp=f) | 406 key.get_contents_to_file(fp=f) |
| 372 except BotoServerError, e: | 407 except BotoServerError, e: |
| 373 e.body = (repr(e.body) + | 408 e.body = (repr(e.body) + |
| 374 ' while downloading bucket=%s, path=%s to local_path=%s' % ( | 409 ' while downloading gs://%s/%s to local_path=%s' % ( |
| 375 b.name, key.name, dest_path)) | 410 b.name, key.name, dest_path)) |
| 376 raise | 411 raise |
| 377 | 412 |
| 378 for dirname in dirs: | 413 for dirname in dirs: |
| 379 self.download_dir_contents( # recurse | 414 self.download_dir_contents( # recurse |
| 380 source_bucket=source_bucket, | 415 source_bucket=source_bucket, |
| 381 source_dir=posixpath.join(source_dir, dirname), | 416 source_dir=posixpath.join(source_dir, dirname), |
| 382 dest_dir=os.path.join(dest_dir, dirname)) | 417 dest_dir=os.path.join(dest_dir, dirname)) |
| 383 | 418 |
| 384 def get_acl(self, bucket, path, id_type, id_value): | 419 def get_acl(self, bucket, path, id_type, id_value): |
| (...skipping 183 matching lines...) |
| 568 | 603 |
| 569 def _get_local_md5(path): | 604 def _get_local_md5(path): |
| 570 """Returns the MD5 hash of a file on local disk.""" | 605 """Returns the MD5 hash of a file on local disk.""" |
| 571 hasher = hashlib.md5() | 606 hasher = hashlib.md5() |
| 572 with open(path, 'rb') as f: | 607 with open(path, 'rb') as f: |
| 573 while True: | 608 while True: |
| 574 data = f.read(64*1024) | 609 data = f.read(64*1024) |
| 575 if not data: | 610 if not data: |
| 576 return hasher.hexdigest() | 611 return hasher.hexdigest() |
| 577 hasher.update(data) | 612 hasher.update(data) |
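_get_local_md5 reads in 64 KiB chunks, so arbitrarily large files never need to fit in memory. One caveat worth noting next to the Key.compose() idea above: per the Google Cloud Storage documentation, a composite object's etag is not an MD5 hash, so rewriting the copy-based rename with compose() would break both the post-upload verification and the IF_MODIFIED etag comparison in upload_file. A hypothetical caller-side use of the same etag trick, assuming a connected bucket `b` and illustrative paths:

```python
# Skip a re-upload when the remote copy already matches the local file.
key = b.get_key(key_name='path/in/bucket')  # illustrative path
if key and key.etag == ('"%s"' % _get_local_md5(path='/tmp/local-file')):
  print 'gs://%s/%s is already up to date' % (b.name, key.name)
```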