OLD | NEW |
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 | 2 |
3 # pylint: disable=C0301 | 3 # pylint: disable=C0301 |
4 """ | 4 """ |
5 Copyright 2014 Google Inc. | 5 Copyright 2014 Google Inc. |
6 | 6 |
7 Use of this source code is governed by a BSD-style license that can be | 7 Use of this source code is governed by a BSD-style license that can be |
8 found in the LICENSE file. | 8 found in the LICENSE file. |
9 | 9 |
10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper | 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper |
(...skipping 86 matching lines...) |
97 """Types of identifiers we can use to set "fine-grained" ACLs.""" | 97 """Types of identifiers we can use to set "fine-grained" ACLs.""" |
98 GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN | 98 GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN |
99 GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL | 99 GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL |
100 GROUP_BY_ID = acl.GROUP_BY_ID | 100 GROUP_BY_ID = acl.GROUP_BY_ID |
101 USER_BY_EMAIL = acl.USER_BY_EMAIL | 101 USER_BY_EMAIL = acl.USER_BY_EMAIL |
102 USER_BY_ID = acl.USER_BY_ID | 102 USER_BY_ID = acl.USER_BY_ID |
103 | 103 |
104 class UploadIf: | 104 class UploadIf: |
105 """Cases in which we will upload a file. | 105 """Cases in which we will upload a file. |
106 | 106 |
107 Beware of performance tradeoffs. E.g., if the file is small, the extra | 107 Beware of performance tradeoffs. E.g., if you are uploading just one small |
108 round trip to check for file existence and/or checksum may take longer than | 108 file, the extra round trip to check for file existence and/or checksum may |
109 just uploading the file. | 109 take longer than just uploading the file. |
110 See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up | 110 See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up |
111 checks for existing files within a single remote directory') | 111 checks for existing files within a single remote directory') |
112 """ | 112 """ |
113 ALWAYS = 1 # always upload the file | 113 ALWAYS = 1 # always upload the file |
114 IF_NEW = 2 # if there is an existing file with the same name, | 114 IF_NEW = 2 # if there is an existing file with the same name, |
115 # leave it alone | 115 # leave it alone |
116 IF_MODIFIED = 3 # if there is an existing file with the same name and | 116 IF_MODIFIED = 3 # if there is an existing file with the same name and |
117 # contents, leave it alone | 117 # contents, leave it alone |
118 | 118 |
119 def __init__(self, boto_file_path=None): | 119 def __init__(self, boto_file_path=None): |
(...skipping 117 matching lines...) |
237 except BotoServerError, e: | 237 except BotoServerError, e: |
238 e.body = (repr(e.body) + | 238 e.body = (repr(e.body) + |
239 ' while uploading source_path=%s to bucket=%s, path=%s' % ( | 239 ' while uploading source_path=%s to bucket=%s, path=%s' % ( |
240 source_path, b.name, key.name)) | 240 source_path, b.name, key.name)) |
241 raise | 241 raise |
242 for (id_type, id_value, permission) in fine_grained_acl_list or []: | 242 for (id_type, id_value, permission) in fine_grained_acl_list or []: |
243 self.set_acl( | 243 self.set_acl( |
244 bucket=b, path=key.name, | 244 bucket=b, path=key.name, |
245 id_type=id_type, id_value=id_value, permission=permission) | 245 id_type=id_type, id_value=id_value, permission=permission) |
246 | 246 |
247 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, **kwargs): | 247 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, |
| 248 upload_if=UploadIf.ALWAYS, **kwargs): |
248 """Recursively upload contents of a local directory to Google Storage. | 249 """Recursively upload contents of a local directory to Google Storage. |
249 | 250 |
250 params: | 251 params: |
251 source_dir: full path (local-OS-style) on local disk of directory to copy | 252 source_dir: full path (local-OS-style) on local disk of directory to copy |
252 contents of | 253 contents of |
253 dest_bucket: GS bucket to copy the files into | 254 dest_bucket: GS bucket to copy the files into |
254 dest_dir: full path (Posix-style) within that bucket; write the files into | 255 dest_dir: full path (Posix-style) within that bucket; write the files into |
255 this directory. If None, write into the root directory of the bucket. | 256 this directory. If None, write into the root directory of the bucket. |
| 257 upload_if: one of the UploadIf values, describing in which cases we should |
| 258 upload the file |
256 kwargs: any additional keyword arguments "inherited" from upload_file() | 259 kwargs: any additional keyword arguments "inherited" from upload_file() |
257 | 260 |
258 The copy operates as a merge: any files in source_dir will be "overlaid" on | 261 The copy operates as a merge: any files in source_dir will be "overlaid" on |
259 top of the existing content in dest_dir. Existing files with the same names | 262 top of the existing content in dest_dir. Existing files with the same names |
260 may or may not be overwritten, depending on the value of the upload_if kwarg | 263 may or may not be overwritten, depending on the value of upload_if. |
261 inherited from upload_file(). | |
262 | 264 |
263 TODO(epoger): Upload multiple files simultaneously to reduce latency. | 265 TODO(epoger): Upload multiple files simultaneously to reduce latency. |
264 | |
265 TODO(epoger): When upload_if==IF_NEW, batch up checks for existing files | |
266 within a single remote directory. See http://skbug.com/2778 | |
267 """ | 266 """ |
268 b = self._connect_to_bucket(bucket=dest_bucket) | 267 b = self._connect_to_bucket(bucket=dest_bucket) |
269 for filename in sorted(os.listdir(source_dir)): | 268 if not dest_dir: |
270 local_path = os.path.join(source_dir, filename) | 269 dest_dir = '' |
271 if dest_dir: | 270 |
272 remote_path = posixpath.join(dest_dir, filename) | 271 # Create a set of all files within source_dir. |
| 272 source_fileset = set() |
| 273 prefix_length = len(source_dir)+1 |
| 274 for dirpath, _, filenames in os.walk(source_dir): |
| 275 relative_dirpath = dirpath[prefix_length:] |
| 276 for filename in filenames: |
| 277 source_fileset.add(os.path.join(relative_dirpath, filename)) |
| 278 |
| 279 # If we are only uploading files conditionally, remove any unnecessary |
| 280 # files from source_fileset. |
| 281 if upload_if == self.UploadIf.ALWAYS: |
| 282 pass # there are no shortcuts... upload them all |
| 283 else: |
| 284 # Create a mapping of filename to Key for existing files within dest_dir |
| 285 existing_dest_filemap = {} |
| 286 prefix = dest_dir |
| 287 if prefix and not prefix.endswith('/'): |
| 288 prefix += '/' |
| 289 prefix_length = len(prefix) |
| 290 items = BucketListResultSet(bucket=b, prefix=prefix) |
| 291 for item in items: |
| 292 if type(item) is Key: |
| 293 existing_dest_filemap[item.name[prefix_length:]] = item |
| 294 |
| 295 # Now, depending on upload_if, trim files we should skip uploading. |
| 296 files_in_common = source_fileset.intersection( |
| 297 existing_dest_filemap.keys()) |
| 298 if upload_if == self.UploadIf.IF_NEW: |
| 299 source_fileset -= files_in_common |
| 300 elif upload_if == self.UploadIf.IF_MODIFIED: |
| 301 for rel_path in files_in_common: |
| 302 local_md5 = '"%s"' % _get_local_md5(path=os.path.join( |
| 303 source_dir, rel_path)) |
| 304 key = existing_dest_filemap[rel_path] |
| 305 if local_md5 == key.etag: |
| 306 source_fileset.remove(rel_path) |
273 else: | 307 else: |
274 remote_path = filename | 308 raise Exception('unknown value of upload_if: %s' % upload_if) |
275 | 309 |
276 if os.path.isdir(local_path): | 310 # Upload any files still in source_fileset. |
277 self.upload_dir_contents( # recurse | 311 for rel_path in sorted(source_fileset): |
278 source_dir=local_path, dest_bucket=b, dest_dir=remote_path, | 312 self.upload_file( |
279 **kwargs) | 313 source_path=os.path.join(source_dir, rel_path), |
280 else: | 314 dest_bucket=b, |
281 self.upload_file( | 315 dest_path=posixpath.join(dest_dir, rel_path), |
282 source_path=local_path, dest_bucket=b, dest_path=remote_path, | 316 upload_if=self.UploadIf.ALWAYS, |
283 **kwargs) | 317 **kwargs) |
284 | 318 |
285 def download_file(self, source_bucket, source_path, dest_path, | 319 def download_file(self, source_bucket, source_path, dest_path, |
286 create_subdirs_if_needed=False): | 320 create_subdirs_if_needed=False): |
287 """Downloads a single file from Google Cloud Storage to local disk. | 321 """Downloads a single file from Google Cloud Storage to local disk. |
288 | 322 |
289 Args: | 323 Args: |
290 source_bucket: GS bucket to download the file from | 324 source_bucket: GS bucket to download the file from |
291 source_path: full path (Posix-style) within that bucket | 325 source_path: full path (Posix-style) within that bucket |
292 dest_path: full path (local-OS-style) on local disk to copy the file to | 326 dest_path: full path (local-OS-style) on local disk to copy the file to |
293 create_subdirs_if_needed: boolean; whether to create subdirectories as | 327 create_subdirs_if_needed: boolean; whether to create subdirectories as |
(...skipping 240 matching lines...) |
534 | 568 |
535 def _get_local_md5(path): | 569 def _get_local_md5(path): |
536 """Returns the MD5 hash of a file on local disk.""" | 570 """Returns the MD5 hash of a file on local disk.""" |
537 hasher = hashlib.md5() | 571 hasher = hashlib.md5() |
538 with open(path, 'rb') as f: | 572 with open(path, 'rb') as f: |
539 while True: | 573 while True: |
540 data = f.read(64*1024) | 574 data = f.read(64*1024) |
541 if not data: | 575 if not data: |
542 return hasher.hexdigest() | 576 return hasher.hexdigest() |
543 hasher.update(data) | 577 hasher.update(data) |
OLD | NEW |