Index: py/utils/gs_utils.py
diff --git a/py/utils/gs_utils.py b/py/utils/gs_utils.py
index 1d39ef7f27e17d4dc0a8b7075a40dd41f96040ef..4a8f7ac6dc4144bfff42193eb942120ccb5f467a 100644
--- a/py/utils/gs_utils.py
+++ b/py/utils/gs_utils.py
@@ -101,6 +101,17 @@ class GSUtils(object):
     USER_BY_EMAIL = acl.USER_BY_EMAIL
     USER_BY_ID = acl.USER_BY_ID
 
+  class UploadIf:
+    """Cases in which we will upload a file.
+
+    Beware of performance tradeoffs. E.g., if the file is small, the extra
+    round trip to check for file existence and/or checksum may take longer than
+    just uploading the file."""
+    ALWAYS = 1       # always upload the file
+    IF_NEW = 2       # if there is an existing file with the same name,
+                     #   leave it alone
+    IF_MODIFIED = 3  # if there is an existing file with the same name and
+                     #   contents, leave it alone
 
   def __init__(self, boto_file_path=None):
     """Constructor.
@@ -139,7 +150,7 @@ class GSUtils(object):
       bucket: GS bucket to delete a file from
       path: full path (Posix-style) of the file within the bucket to delete
     """
-    b = self._connect_to_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket=bucket)
     key = Key(b)
     key.name = path
     try:
@@ -159,7 +170,7 @@ class GSUtils(object):
     Returns the last modified time, as a freeform string. If the file was not
     found, returns None.
     """
-    b = self._connect_to_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket=bucket)
     try:
       key = b.get_key(key_name=path)
       if not key:
@@ -172,38 +183,48 @@ class GSUtils(object):
       raise
 
   def upload_file(self, source_path, dest_bucket, dest_path,
-                  only_if_modified=False, predefined_acl=None,
+                  upload_if=UploadIf.ALWAYS,
+                  predefined_acl=None,
                   fine_grained_acl_list=None):
     """Upload contents of a local file to Google Storage.
 
     params:
       source_path: full path (local-OS-style) on local disk to read from
-      dest_bucket: GCS bucket to copy the file to
+      dest_bucket: GS bucket to copy the file to
       dest_path: full path (Posix-style) within that bucket
-      only_if_modified: if True, only upload the file if it would actually
-          change the content on Google Storage (uploads the file if dest_path
-          does not exist, or if it exists but has different contents than
-          source_path). Note that this may take longer than just uploading the
-          file without checking first, due to extra round-trips!
+      upload_if: one of the UploadIf values, describing the circumstances
+          under which the file should be uploaded
       predefined_acl: which predefined ACL to apply to the file on Google
           Storage; must be one of the PredefinedACL values defined above.
           If None, inherits dest_bucket's default object ACL.
-          TODO(epoger): add unittests for this param, although it seems to work
-          in my manual testing
       fine_grained_acl_list: list of (id_type, id_value, permission) tuples
           to apply to the uploaded file (on top of the predefined_acl),
          or None if predefined_acl is sufficient
+
+    TODO(epoger): Consider adding a do_compress parameter that would compress
+    the file using gzip before upload, and add a "Content-Encoding:gzip" header
+    so that HTTP downloads of the file would be unzipped automatically.
+    See https://developers.google.com/storage/docs/gsutil/addlhelp/
+    WorkingWithObjectMetadata#content-encoding
     """
-    b = self._connect_to_bucket(bucket_name=dest_bucket)
-    if only_if_modified:
+    b = self._connect_to_bucket(bucket=dest_bucket)
+    if upload_if == self.UploadIf.IF_NEW:
+      old_key = b.get_key(key_name=dest_path)
+      if old_key:
+        print 'Skipping upload of existing file gs://%s/%s' % (
+            dest_bucket, dest_path)
+        return
+    elif upload_if == self.UploadIf.IF_MODIFIED:
       old_key = b.get_key(key_name=dest_path)
       if old_key:
         local_md5 = '"%s"' % _get_local_md5(path=source_path)
         if local_md5 == old_key.etag:
-          print 'Skipping upload of unmodified file %s : %s' % (
-              source_path, local_md5)
+          print 'Skipping upload of unmodified file gs://%s/%s : %s' % (
+              dest_bucket, dest_path, local_md5)
           return
+    elif upload_if != self.UploadIf.ALWAYS:
+      raise Exception('unknown value of upload_if: %s' % upload_if)
 
     key = Key(b)
     key.name = dest_path
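The IF_MODIFIED check compares a local MD5 digest against the object's etag
(for non-composite objects, GS etags are quoted MD5 hex digests, hence the
'"%s"' wrapping above). _get_local_md5() is an existing helper elsewhere in
this file, not part of this change; a plausible hashlib-based sketch of it:

  import hashlib

  def _get_local_md5(path):
    """Returns the MD5 hex digest of the file at the given local path."""
    hasher = hashlib.md5()
    with open(path, 'rb') as f:
      for chunk in iter(lambda: f.read(64 * 1024), b''):
        hasher.update(chunk)
    return hasher.hexdigest()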
@@ -215,49 +236,30 @@ class GSUtils(object):
             ' while uploading source_path=%s to bucket=%s, path=%s' % (
                 source_path, dest_bucket, key.name))
       raise
-    # TODO(epoger): This may be inefficient, because it calls
-    # _connect_to_bucket() again. Depending on how expensive that
-    # call is, we may want to optimize this.
     for (id_type, id_value, permission) in fine_grained_acl_list or []:
       self.set_acl(
-          bucket=dest_bucket, path=key.name,
+          bucket=b, path=key.name,
           id_type=id_type, id_value=id_value, permission=permission)
 
-  def upload_dir_contents(self, source_dir, dest_bucket, dest_dir,
-                          predefined_acl=None, fine_grained_acl_list=None):
+  def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, **kwargs):
     """Recursively upload contents of a local directory to Google Storage.
 
     params:
       source_dir: full path (local-OS-style) on local disk of directory to copy
           contents of
-      dest_bucket: GCS bucket to copy the files into
+      dest_bucket: GS bucket to copy the files into
       dest_dir: full path (Posix-style) within that bucket; write the files into
           this directory. If None, write into the root directory of the bucket.
-      predefined_acl: which predefined ACL to apply to the files on Google
-          Storage; must be one of the PredefinedACL values defined above.
-          If None, inherits dest_bucket's default object ACL.
-          TODO(epoger): add unittests for this param, although it seems to work
-          in my manual testing
-      fine_grained_acl_list: list of (id_type, id_value, permission) tuples
-          to apply to every file uploaded (on top of the predefined_acl),
-          or None if predefined_acl is sufficient
+      kwargs: any additional keyword arguments are passed through to
+          upload_file() (e.g., upload_if, predefined_acl,
+          fine_grained_acl_list)
 
-    The copy operates as a "merge with overwrite": any files in source_dir will
-    be "overlaid" on top of the existing content in dest_dir. Existing files
-    with the same names will be overwritten.
+    The copy operates as a merge: any files in source_dir will be "overlaid" on
+    top of the existing content in dest_dir. Existing files with the same names
+    may or may not be overwritten, depending on the value of the upload_if kwarg
+    inherited from upload_file().
 
     TODO(epoger): Upload multiple files simultaneously to reduce latency.
-
-    TODO(epoger): Add a "noclobber" mode that will not upload any files would
-    overwrite existing files in Google Storage.
-
-    TODO(epoger): Consider adding a do_compress parameter that would compress
-    the file using gzip before upload, and add a "Content-Encoding:gzip" header
-    so that HTTP downloads of the file would be unzipped automatically.
-    See https://developers.google.com/storage/docs/gsutil/addlhelp/
-    WorkingWithObjectMetadata#content-encoding
     """
-    b = self._connect_to_bucket(bucket_name=dest_bucket)
+    b = self._connect_to_bucket(bucket=dest_bucket)
     for filename in sorted(os.listdir(source_dir)):
       local_path = os.path.join(source_dir, filename)
       if dest_dir:
@@ -267,41 +269,25 @@ class GSUtils(object):
       if os.path.isdir(local_path):
         self.upload_dir_contents(  # recurse
-            source_dir=local_path, dest_bucket=dest_bucket,
-            dest_dir=remote_path,
-            predefined_acl=predefined_acl,
-            fine_grained_acl_list=fine_grained_acl_list)
+            source_dir=local_path, dest_bucket=b, dest_dir=remote_path,
+            **kwargs)
       else:
-        key = Key(b)
-        key.name = remote_path
-        try:
-          key.set_contents_from_filename(
-              filename=local_path, policy=predefined_acl)
-        except BotoServerError, e:
-          e.body = (repr(e.body) +
-                    ' while uploading local_path=%s to bucket=%s, path=%s' % (
-                        local_path, dest_bucket, remote_path))
-          raise
-        # TODO(epoger): This may be inefficient, because it calls
-        # _connect_to_bucket() for every file. Depending on how expensive that
-        # call is, we may want to optimize this.
-        for (id_type, id_value, permission) in fine_grained_acl_list or []:
-          self.set_acl(
-              bucket=dest_bucket, path=remote_path,
-              id_type=id_type, id_value=id_value, permission=permission)
+        self.upload_file(
+            source_path=local_path, dest_bucket=b, dest_path=remote_path,
+            **kwargs)
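Since per-file options now flow through **kwargs, a recursive "no-clobber"
upload becomes a single call (directory names hypothetical):

  gs.upload_dir_contents(
      source_dir='/tmp/results', dest_bucket='chromium-skia-gm',
      dest_dir='results', upload_if=GSUtils.UploadIf.IF_NEW)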
 
   def download_file(self, source_bucket, source_path, dest_path,
                     create_subdirs_if_needed=False):
     """Downloads a single file from Google Cloud Storage to local disk.
 
     Args:
-      source_bucket: GCS bucket to download the file from
+      source_bucket: GS bucket to download the file from
       source_path: full path (Posix-style) within that bucket
       dest_path: full path (local-OS-style) on local disk to copy the file to
       create_subdirs_if_needed: boolean; whether to create subdirectories as
           needed to create dest_path
     """
-    b = self._connect_to_bucket(bucket_name=source_bucket)
+    b = self._connect_to_bucket(bucket=source_bucket)
     key = Key(b)
     key.name = source_path
     if create_subdirs_if_needed:
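Corresponding download usage (paths hypothetical):

  gs.download_file(source_bucket='chromium-skia-gm',
                   source_path='some/dest/path', dest_path='/tmp/copy',
                   create_subdirs_if_needed=True)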
@@ -319,7 +305,7 @@ class GSUtils(object):
     """Recursively download contents of a Google Storage directory to local disk
 
     params:
-      source_bucket: GCS bucket to copy the files from
+      source_bucket: GS bucket to copy the files from
       source_dir: full path (Posix-style) within that bucket; read the files
           from this directory
       dest_dir: full path (local-OS-style) on local disk of directory to copy
@@ -332,7 +318,7 @@ class GSUtils(object):
     TODO(epoger): Download multiple files simultaneously to reduce latency.
     """
     _makedirs_if_needed(dest_dir)
-    b = self._connect_to_bucket(bucket_name=source_bucket)
+    b = self._connect_to_bucket(bucket=source_bucket)
     (dirs, files) = self.list_bucket_contents(
         bucket=source_bucket, subdir=source_dir)
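And for a whole directory, a sketch based on the params listed above (names
hypothetical):

  gs.download_dir_contents(source_bucket='chromium-skia-gm',
                           source_dir='results', dest_dir='/tmp/results')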
@@ -378,7 +364,7 @@ class GSUtils(object):
         permissions have been set.
     """
     field = self._field_by_id_type[id_type]
-    b = self._connect_to_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket=bucket)
     acls = b.get_acl(key_name=path)
     matching_entries = [entry for entry in acls.entries.entry_list
                         if (entry.scope.type == id_type) and
@@ -427,7 +413,7 @@ class GSUtils(object):
       assert Permission.WRITE == get_acl(bucket, path, id_type, id_value)
     """
     field = self._field_by_id_type[id_type]
-    b = self._connect_to_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket=bucket)
     acls = b.get_acl(key_name=path)
 
     # Remove any existing entries that refer to the same id_type/id_value,
@@ -465,7 +451,7 @@ class GSUtils(object):
       prefix += '/'
     prefix_length = len(prefix) if prefix else 0
 
-    b = self._connect_to_bucket(bucket_name=bucket)
+    b = self._connect_to_bucket(bucket=bucket)
     items = BucketListResultSet(bucket=b, prefix=prefix, delimiter='/')
     dirs = []
     files = []
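Because the listing uses delimiter='/', results split into immediate
subdirectories and files (subdir name hypothetical):

  dirs, files = gs.list_bucket_contents(bucket='chromium-skia-gm',
                                        subdir='results')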
@@ -477,16 +463,19 @@ class GSUtils(object):
         dirs.append(item.name[prefix_length:-1])
     return (dirs, files)
 
-  def _connect_to_bucket(self, bucket_name):
+  def _connect_to_bucket(self, bucket):
     """Returns a Bucket object we can use to access a particular bucket in GS.
 
     Params:
-      bucket_name: name of the bucket (e.g., 'chromium-skia-gm')
+      bucket: name of the bucket (e.g., 'chromium-skia-gm'), or a Bucket
+          object (in which case this param is just returned as-is)
     """
+    if type(bucket) is Bucket:
+      return bucket
     try:
-      return self._create_connection().get_bucket(bucket_name=bucket_name)
+      return self._create_connection().get_bucket(bucket_name=bucket)
     except BotoServerError, e:
-      e.body = repr(e.body) + ' while connecting to bucket=%s' % bucket_name
+      e.body = repr(e.body) + ' while connecting to bucket=%s' % bucket
       raise
 
   def _create_connection(self):
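The pass-through lets callers reuse an already-connected Bucket and avoid a
reconnect per call; upload_dir_contents() above does exactly this by handing
its Bucket b to upload_file() and to its own recursive calls:

  b = gs._connect_to_bucket(bucket='chromium-skia-gm')  # connects once
  assert gs._connect_to_bucket(bucket=b) is b           # returned as-is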