py/utils/gs_utils.py - Issue 420553002: gs_utils: when uploading a whole dir using IF_NEW, check for existence of multiple files in a singl…

Side by Side Diff: py/utils/gs_utils.py

Issue 420553002: gs_utils: when uploading a whole dir using IF_NEW, check for existence of multiple files in a singl… (Closed) Base URL: https://skia.googlesource.com/common.git@master

Patch Set: check for existence of multiple files at once Created 6 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #!/usr/bin/python	1 #!/usr/bin/python

2	2

3 # pylint: disable=C0301	3 # pylint: disable=C0301

4 """	4 """

5 Copyright 2014 Google Inc.	5 Copyright 2014 Google Inc.

6	6

7 Use of this source code is governed by a BSD-style license that can be	7 Use of this source code is governed by a BSD-style license that can be

8 found in the LICENSE file.	8 found in the LICENSE file.

9	9

10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper	10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper

(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
97 """Types of identifiers we can use to set "fine-grained" ACLs."""	97 """Types of identifiers we can use to set "fine-grained" ACLs."""

98 GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN	98 GROUP_BY_DOMAIN = acl.GROUP_BY_DOMAIN

99 GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL	99 GROUP_BY_EMAIL = acl.GROUP_BY_EMAIL

100 GROUP_BY_ID = acl.GROUP_BY_ID	100 GROUP_BY_ID = acl.GROUP_BY_ID

101 USER_BY_EMAIL = acl.USER_BY_EMAIL	101 USER_BY_EMAIL = acl.USER_BY_EMAIL

102 USER_BY_ID = acl.USER_BY_ID	102 USER_BY_ID = acl.USER_BY_ID

103	103

104 class UploadIf:	104 class UploadIf:

105 """Cases in which we will upload a file.	105 """Cases in which we will upload a file.

106	106

107 Beware of performance tradeoffs. E.g., if the file is small, the extra	107 Beware of performance tradeoffs. E.g., if you are uploading just one small

108 round trip to check for file existence and/or checksum may take longer than	108 file, the extra round trip to check for file existence and/or checksum may

109 just uploading the file.	109 take longer than just uploading the file.

110 See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up	110 See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up

111 checks for existing files within a single remote directory')	111 checks for existing files within a single remote directory')

112 """	112 """

113 ALWAYS = 1 # always upload the file	113 ALWAYS = 1 # always upload the file

114 IF_NEW = 2 # if there is an existing file with the same name,	114 IF_NEW = 2 # if there is an existing file with the same name,

115 # leave it alone	115 # leave it alone

116 IF_MODIFIED = 3 # if there is an existing file with the same name and	116 IF_MODIFIED = 3 # if there is an existing file with the same name and

117 # contents, leave it alone	117 # contents, leave it alone

118	118

119 def __init__(self, boto_file_path=None):	119 def __init__(self, boto_file_path=None):

(...skipping 117 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
237 except BotoServerError, e:	237 except BotoServerError, e:

238 e.body = (repr(e.body) +	238 e.body = (repr(e.body) +

239 ' while uploading source_path=%s to bucket=%s, path=%s' % (	239 ' while uploading source_path=%s to bucket=%s, path=%s' % (

240 source_path, b.name, key.name))	240 source_path, b.name, key.name))

241 raise	241 raise

242 for (id_type, id_value, permission) in fine_grained_acl_list or []:	242 for (id_type, id_value, permission) in fine_grained_acl_list or []:

243 self.set_acl(	243 self.set_acl(

244 bucket=b, path=key.name,	244 bucket=b, path=key.name,

245 id_type=id_type, id_value=id_value, permission=permission)	245 id_type=id_type, id_value=id_value, permission=permission)

246	246

247 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, **kwargs):	247 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir,

	248 upload_if=UploadIf.ALWAYS, **kwargs):

248 """Recursively upload contents of a local directory to Google Storage.	249 """Recursively upload contents of a local directory to Google Storage.

249	250

250 params:	251 params:

251 source_dir: full path (local-OS-style) on local disk of directory to copy	252 source_dir: full path (local-OS-style) on local disk of directory to copy

252 contents of	253 contents of

253 dest_bucket: GS bucket to copy the files into	254 dest_bucket: GS bucket to copy the files into

254 dest_dir: full path (Posix-style) within that bucket; write the files into	255 dest_dir: full path (Posix-style) within that bucket; write the files into

255 this directory. If None, write into the root directory of the bucket.	256 this directory. If None, write into the root directory of the bucket.

	257 upload_if: one of the UploadIf values, describing in which cases we should

	258 upload the file

256 kwargs: any additional keyword arguments "inherited" from upload_file()	259 kwargs: any additional keyword arguments "inherited" from upload_file()

257	260

258 The copy operates as a merge: any files in source_dir will be "overlaid" on	261 The copy operates as a merge: any files in source_dir will be "overlaid" on

259 top of the existing content in dest_dir. Existing files with the same names	262 top of the existing content in dest_dir. Existing files with the same names

260 may or may not be overwritten, depending on the value of the upload_if kwarg	263 may or may not be overwritten, depending on the value of upload_if.

261 inherited from upload_file().

262	264

263 TODO(epoger): Upload multiple files simultaneously to reduce latency.	265 TODO(epoger): Upload multiple files simultaneously to reduce latency.

264

265 TODO(epoger): When upload_if==IF_NEW, batch up checks for existing files

266 within a single remote directory. See http://skbug.com/2778

267 """	266 """

268 b = self._connect_to_bucket(bucket=dest_bucket)	267 b = self._connect_to_bucket(bucket=dest_bucket)

269 for filename in sorted(os.listdir(source_dir)):	268 if not dest_dir:

270 local_path = os.path.join(source_dir, filename)	269 dest_dir = ''

271 if dest_dir:	270

272 remote_path = posixpath.join(dest_dir, filename)	271 # Create a set of all files within source_dir.

	272 source_fileset = set()

	273 prefix_length = len(source_dir)+1

	274 for dirpath, _, filenames in os.walk(source_dir):

	275 relative_dirpath = dirpath[prefix_length:]

	276 for filename in filenames:

	277 source_fileset.add(os.path.join(relative_dirpath, filename))

	278

	279 # If we are only uploading files conditionally, remove any unnecessary

	280 # files from source_fileset.

	281 if upload_if == self.UploadIf.ALWAYS:

	282 pass # there are no shortcuts... upload them all

	283 else:

	284 # Create a mapping of filename to Key for existing files within dest_dir

	285 existing_dest_filemap = {}

	286 prefix = dest_dir

	287 if prefix and not prefix.endswith('/'):

	288 prefix += '/'

	289 prefix_length = len(prefix)

	290 items = BucketListResultSet(bucket=b, prefix=prefix)

	291 for item in items:

	292 if type(item) is Key:

	293 existing_dest_filemap[item.name[prefix_length:]] = item

	294

	295 # Now, depending on upload_if, trim files we should skip uploading.

	296 files_in_common = source_fileset.intersection(

	297 existing_dest_filemap.keys())

	298 if upload_if == self.UploadIf.IF_NEW:

	299 source_fileset -= files_in_common

	300 elif upload_if == self.UploadIf.IF_MODIFIED:

	301 for rel_path in files_in_common:

	302 local_md5 = '"%s"' % _get_local_md5(path=os.path.join(

	303 source_dir, rel_path))

	304 key = existing_dest_filemap[rel_path]

	305 if local_md5 == key.etag:

	306 source_fileset.remove(rel_path)

273 else:	307 else:

274 remote_path = filename	308 raise Exception('unknown value of upload_if: %s' % upload_if)

275	309

276 if os.path.isdir(local_path):	310 # Upload any files still in source_fileset.

277 self.upload_dir_contents( # recurse	311 for rel_path in sorted(source_fileset):

278 source_dir=local_path, dest_bucket=b, dest_dir=remote_path,	312 self.upload_file(

279 **kwargs)	313 source_path=os.path.join(source_dir, rel_path),

280 else:	314 dest_bucket=b,

281 self.upload_file(	315 dest_path=posixpath.join(dest_dir, rel_path),

282 source_path=local_path, dest_bucket=b, dest_path=remote_path,	316 upload_if=self.UploadIf.ALWAYS,

283 **kwargs)	317 **kwargs)

284	318

285 def download_file(self, source_bucket, source_path, dest_path,	319 def download_file(self, source_bucket, source_path, dest_path,

286 create_subdirs_if_needed=False):	320 create_subdirs_if_needed=False):

287 """Downloads a single file from Google Cloud Storage to local disk.	321 """Downloads a single file from Google Cloud Storage to local disk.

288	322

289 Args:	323 Args:

290 source_bucket: GS bucket to download the file from	324 source_bucket: GS bucket to download the file from

291 source_path: full path (Posix-style) within that bucket	325 source_path: full path (Posix-style) within that bucket

292 dest_path: full path (local-OS-style) on local disk to copy the file to	326 dest_path: full path (local-OS-style) on local disk to copy the file to

293 create_subdirs_if_needed: boolean; whether to create subdirectories as	327 create_subdirs_if_needed: boolean; whether to create subdirectories as

(...skipping 240 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
534	568

535 def _get_local_md5(path):	569 def _get_local_md5(path):

536 """Returns the MD5 hash of a file on local disk."""	570 """Returns the MD5 hash of a file on local disk."""

537 hasher = hashlib.md5()	571 hasher = hashlib.md5()

538 with open(path, 'rb') as f:	572 with open(path, 'rb') as f:

539 while True:	573 while True:

540 data = f.read(64*1024)	574 data = f.read(64*1024)

541 if not data:	575 if not data:

542 return hasher.hexdigest()	576 return hasher.hexdigest()

543 hasher.update(data)	577 hasher.update(data)

OLD	NEW

« no previous file with comments | « no previous file | py/utils/gs_utils_manualtest.py » ('j') | no next file with comments »