Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(125)

Side by Side Diff: py/utils/gs_utils.py

Issue 420553002: gs_utils: when uploading a whole dir using IF_NEW, check for existence of multiple files in a single request (Closed) Base URL: https://skia.googlesource.com/common.git@master
Patch Set: check for existence of multiple files at once Created 6 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | py/utils/gs_utils_manualtest.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/python 1 #!/usr/bin/python
2 2
3 # pylint: disable=C0301 3 # pylint: disable=C0301
4 """ 4 """
5 Copyright 2014 Google Inc. 5 Copyright 2014 Google Inc.
6 6
7 Use of this source code is governed by a BSD-style license that can be 7 Use of this source code is governed by a BSD-style license that can be
8 found in the LICENSE file. 8 found in the LICENSE file.
9 9
10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after
97 """Types of identifiers we can use to set "fine-grained" ACLs.""" 97 """Types of identifiers we can use to set "fine-grained" ACLs."""
98 GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN 98 GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN
99 GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL 99 GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL
100 GROUP_BY_ID = acl.GROUP_BY_ID 100 GROUP_BY_ID = acl.GROUP_BY_ID
101 USER_BY_EMAIL = acl.USER_BY_EMAIL 101 USER_BY_EMAIL = acl.USER_BY_EMAIL
102 USER_BY_ID = acl.USER_BY_ID 102 USER_BY_ID = acl.USER_BY_ID
103 103
class UploadIf:
  """Enumeration of the conditions under which upload_file() writes a file.

  Beware of performance tradeoffs. E.g., if you are uploading just one small
  file, the extra round trip to check for file existence and/or checksum may
  take longer than just uploading the file.
  See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up
  checks for existing files within a single remote directory')
  """
  # Upload unconditionally, overwriting any existing file of the same name.
  ALWAYS = 1
  # Skip the upload if a file of the same name already exists remotely.
  IF_NEW = 2
  # Skip the upload only if a remote file of the same name exists AND its
  # contents (checksum) match the local file.
  IF_MODIFIED = 3
119 def __init__(self, boto_file_path=None): 119 def __init__(self, boto_file_path=None):
(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after
237 except BotoServerError, e: 237 except BotoServerError, e:
238 e.body = (repr(e.body) + 238 e.body = (repr(e.body) +
239 ' while uploading source_path=%s to bucket=%s, path=%s' % ( 239 ' while uploading source_path=%s to bucket=%s, path=%s' % (
240 source_path, b.name, key.name)) 240 source_path, b.name, key.name))
241 raise 241 raise
242 for (id_type, id_value, permission) in fine_grained_acl_list or []: 242 for (id_type, id_value, permission) in fine_grained_acl_list or []:
243 self.set_acl( 243 self.set_acl(
244 bucket=b, path=key.name, 244 bucket=b, path=key.name,
245 id_type=id_type, id_value=id_value, permission=permission) 245 id_type=id_type, id_value=id_value, permission=permission)
246 246
def upload_dir_contents(self, source_dir, dest_bucket, dest_dir,
                        upload_if=UploadIf.ALWAYS, **kwargs):
  """Recursively upload contents of a local directory to Google Storage.

  params:
    source_dir: full path (local-OS-style) on local disk of directory to copy
        contents of
    dest_bucket: GS bucket to copy the files into
    dest_dir: full path (Posix-style) within that bucket; write the files into
        this directory. If None, write into the root directory of the bucket.
    upload_if: one of the UploadIf values, describing in which cases we should
        upload the file
    kwargs: any additional keyword arguments "inherited" from upload_file()

  Raises ValueError if upload_if is not one of the UploadIf values.

  The copy operates as a merge: any files in source_dir will be "overlaid" on
  top of the existing content in dest_dir. Existing files with the same names
  may or may not be overwritten, depending on the value of upload_if.

  TODO(epoger): Upload multiple files simultaneously to reduce latency.
  """
  b = self._connect_to_bucket(bucket=dest_bucket)
  if not dest_dir:
    dest_dir = ''

  # Create a set of all files within source_dir, keyed by their Posix-style
  # path relative to source_dir so that they compare correctly against GS
  # object names regardless of the local OS path separator.
  source_fileset = set()
  prefix_length = len(source_dir)+1
  for dirpath, _, filenames in os.walk(source_dir):
    relative_dirpath = dirpath[prefix_length:]
    for filename in filenames:
      rel_path = os.path.join(relative_dirpath, filename)
      source_fileset.add(rel_path.replace(os.sep, '/'))

  # If we are only uploading files conditionally, remove any unnecessary
  # files from source_fileset.
  if upload_if == self.UploadIf.ALWAYS:
    pass  # there are no shortcuts... upload them all
  else:
    # Create a mapping of filename to Key for existing files within dest_dir,
    # fetched with a single bucket listing rather than one round trip per
    # file (see http://skbug.com/2778).
    existing_dest_filemap = {}
    prefix = dest_dir
    if prefix and not prefix.endswith('/'):
      prefix += '/'
    prefix_length = len(prefix)
    items = BucketListResultSet(bucket=b, prefix=prefix)
    for item in items:
      # A listing can also yield non-Key entries (e.g. prefixes); only real
      # objects have contents to compare against.
      if isinstance(item, Key):
        existing_dest_filemap[item.name[prefix_length:]] = item

    # Now, depending on upload_if, trim files we should skip uploading.
    files_in_common = source_fileset.intersection(
        existing_dest_filemap.keys())
    if upload_if == self.UploadIf.IF_NEW:
      source_fileset -= files_in_common
    elif upload_if == self.UploadIf.IF_MODIFIED:
      for rel_path in files_in_common:
        # Key.etag is the remote MD5 wrapped in double quotes, so wrap the
        # local digest the same way before comparing.
        local_md5 = '"%s"' % _get_local_md5(path=os.path.join(
            source_dir, rel_path.replace('/', os.sep)))
        key = existing_dest_filemap[rel_path]
        if local_md5 == key.etag:
          source_fileset.remove(rel_path)
    else:
      raise ValueError('unknown value of upload_if: %s' % upload_if)

  # Upload any files still in source_fileset.
  for rel_path in sorted(source_fileset):
    self.upload_file(
        source_path=os.path.join(source_dir, rel_path.replace('/', os.sep)),
        dest_bucket=b,
        dest_path=posixpath.join(dest_dir, rel_path),
        # The existence/checksum filtering was already done above, so each
        # remaining file is uploaded unconditionally.
        upload_if=self.UploadIf.ALWAYS,
        **kwargs)
285 def download_file(self, source_bucket, source_path, dest_path, 319 def download_file(self, source_bucket, source_path, dest_path,
286 create_subdirs_if_needed=False): 320 create_subdirs_if_needed=False):
287 """Downloads a single file from Google Cloud Storage to local disk. 321 """Downloads a single file from Google Cloud Storage to local disk.
288 322
289 Args: 323 Args:
290 source_bucket: GS bucket to download the file from 324 source_bucket: GS bucket to download the file from
291 source_path: full path (Posix-style) within that bucket 325 source_path: full path (Posix-style) within that bucket
292 dest_path: full path (local-OS-style) on local disk to copy the file to 326 dest_path: full path (local-OS-style) on local disk to copy the file to
293 create_subdirs_if_needed: boolean; whether to create subdirectories as 327 create_subdirs_if_needed: boolean; whether to create subdirectories as
(...skipping 240 matching lines...) Expand 10 before | Expand all | Expand 10 after
534 568
535 def _get_local_md5(path): 569 def _get_local_md5(path):
536 """Returns the MD5 hash of a file on local disk.""" 570 """Returns the MD5 hash of a file on local disk."""
537 hasher = hashlib.md5() 571 hasher = hashlib.md5()
538 with open(path, 'rb') as f: 572 with open(path, 'rb') as f:
539 while True: 573 while True:
540 data = f.read(64*1024) 574 data = f.read(64*1024)
541 if not data: 575 if not data:
542 return hasher.hexdigest() 576 return hasher.hexdigest()
543 hasher.update(data) 577 hasher.update(data)
OLDNEW
« no previous file with comments | « no previous file | py/utils/gs_utils_manualtest.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698