OLD | NEW |
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 | 2 |
3 # pylint: disable=C0301 | 3 # pylint: disable=C0301 |
4 """ | 4 """ |
5 Copyright 2014 Google Inc. | 5 Copyright 2014 Google Inc. |
6 | 6 |
7 Use of this source code is governed by a BSD-style license that can be | 7 Use of this source code is governed by a BSD-style license that can be |
8 found in the LICENSE file. | 8 found in the LICENSE file. |
9 | 9 |
10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper | 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper |
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
153 bucket: GS bucket to delete a file from | 153 bucket: GS bucket to delete a file from |
154 path: full path (Posix-style) of the file within the bucket to delete | 154 path: full path (Posix-style) of the file within the bucket to delete |
155 """ | 155 """ |
156 b = self._connect_to_bucket(bucket=bucket) | 156 b = self._connect_to_bucket(bucket=bucket) |
157 key = Key(b) | 157 key = Key(b) |
158 key.name = path | 158 key.name = path |
159 try: | 159 try: |
160 key.delete() | 160 key.delete() |
161 except BotoServerError, e: | 161 except BotoServerError, e: |
162 e.body = (repr(e.body) + | 162 e.body = (repr(e.body) + |
163 ' while deleting bucket=%s, path=%s' % (b.name, path)) | 163 ' while deleting gs://%s/%s' % (b.name, path)) |
164 raise | 164 raise |
165 | 165 |
166 def get_last_modified_time(self, bucket, path): | 166 def get_last_modified_time(self, bucket, path): |
167 """Gets the timestamp of when this file was last modified. | 167 """Gets the timestamp of when this file was last modified. |
168 | 168 |
169 Params: | 169 Params: |
170 bucket: GS bucket in which to look for the file | 170 bucket: GS bucket in which to look for the file |
171 path: full path (Posix-style) of the file within the bucket to check | 171 path: full path (Posix-style) of the file within the bucket to check |
172 | 172 |
173 Returns the last modified time, as a freeform string. If the file was not | 173 Returns the last modified time, as a freeform string. If the file was not |
174 found, returns None. | 174 found, returns None. |
175 """ | 175 """ |
176 b = self._connect_to_bucket(bucket=bucket) | 176 b = self._connect_to_bucket(bucket=bucket) |
177 try: | 177 try: |
178 key = b.get_key(key_name=path) | 178 key = b.get_key(key_name=path) |
179 if not key: | 179 if not key: |
180 return None | 180 return None |
181 return key.last_modified | 181 return key.last_modified |
182 except BotoServerError, e: | 182 except BotoServerError, e: |
183 e.body = (repr(e.body) + | 183 e.body = (repr(e.body) + |
184 ' while getting attributes of bucket=%s, path=%s' % ( | 184 ' while getting attributes of gs://%s/%s' % (b.name, path)) |
185 b.name, path)) | |
186 raise | 185 raise |
187 | 186 |
  def upload_file(self, source_path, dest_bucket, dest_path,
                  upload_if=UploadIf.ALWAYS,
                  predefined_acl=None,
                  fine_grained_acl_list=None):
    """Upload contents of a local file to Google Storage.

    params:
      source_path: full path (local-OS-style) on local disk to read from
      dest_bucket: GS bucket to copy the file to
      dest_path: full path (Posix-style) within that bucket
      upload_if: one of the UploadIf values, describing in which cases we should
          upload the file
      predefined_acl: which predefined ACL to apply to the file on Google
          Storage; must be one of the PredefinedACL values defined above.
          If None, inherits dest_bucket's default object ACL.
      fine_grained_acl_list: list of (id_type, id_value, permission) tuples
          to apply to the uploaded file (on top of the predefined_acl),
          or None if predefined_acl is sufficient

    TODO(epoger): Consider adding a do_compress parameter that would compress
    the file using gzip before upload, and add a "Content-Encoding:gzip" header
    so that HTTP downloads of the file would be unzipped automatically.
    See https://developers.google.com/storage/docs/gsutil/addlhelp/
        WorkingWithObjectMetadata#content-encoding
    """
    b = self._connect_to_bucket(bucket=dest_bucket)
    local_md5 = None  # filled in lazily

    if upload_if == self.UploadIf.IF_NEW:
      old_key = b.get_key(key_name=dest_path)
      if old_key:
        print 'Skipping upload of existing file gs://%s/%s' % (
            b.name, dest_path)
        return
    elif upload_if == self.UploadIf.IF_MODIFIED:
      old_key = b.get_key(key_name=dest_path)
      if old_key:
        if not local_md5:
          local_md5 = _get_local_md5(path=source_path)
        # NOTE(review): this assumes the remote etag is the quoted MD5 of the
        # object's contents; that holds for simple (non-composite) GS objects
        # but not for composite ones -- confirm for this bucket's usage.
        if ('"%s"' % local_md5) == old_key.etag:
          print 'Skipping upload of unmodified file gs://%s/%s : %s' % (
              b.name, dest_path, local_md5)
          return
    elif upload_if != self.UploadIf.ALWAYS:
      raise Exception('unknown value of upload_if: %s' % upload_if)

    # Upload the file using a temporary name at first, in case the transfer
    # is interrupted partway through.
    if not local_md5:
      local_md5 = _get_local_md5(path=source_path)
    initial_key = Key(b)
    initial_key.name = dest_path + '-uploading-' + local_md5
    try:
      initial_key.set_contents_from_filename(filename=source_path,
                                             policy=predefined_acl)
    except BotoServerError, e:
      # Annotate the error with what we were doing, then let it propagate.
      e.body = (repr(e.body) +
                ' while uploading source_path=%s to gs://%s/%s' % (
                    source_path, b.name, initial_key.name))
      raise

    # Verify that the file contents were uploaded successfully.
    #
    # TODO(epoger): Check whether the boto library or XML API already do this...
    # if so, we may be duplicating effort here, and maybe we don't need to do
    # the whole "upload using temporary filename, then rename" thing.
    #
    # TODO(epoger): Confirm that the etag is set on the server side...
    # otherwise, we may just be validating another MD5 hash that was generated
    # on the client side before the file was uploaded!
    validate_key = b.get_key(key_name=initial_key.name)
    if validate_key.etag != ('"%s"' % local_md5):
      raise Exception('found wrong MD5 after uploading gs://%s/%s' % (
          b.name, validate_key.name))

    # Rename the file to its real name.
    #
    # TODO(epoger): I don't know how long this takes.  I wish we could rename
    # the key instead, but AFAICT you can't do that.
    # Perhaps we could use Key.compose() to create a composite object pointing
    # at the original key?
    # See https://developers.google.com/storage/docs/composite-objects
    final_key = b.copy_key(
        new_key_name=dest_path, src_key_name=initial_key.name,
        src_bucket_name=b.name, preserve_acl=False)
    # NOTE(review): if copy_key() raises, the temporary '-uploading-' object is
    # left behind in the bucket -- confirm whether cleanup is needed here.
    initial_key.delete()

    # Set ACLs on the file.
    # We do this *after* copy_key(), because copy_key's preserve_acl
    # functionality would incur a performance hit.
    for (id_type, id_value, permission) in fine_grained_acl_list or []:
      self.set_acl(
          bucket=b, path=final_key.name,
          id_type=id_type, id_value=id_value, permission=permission)
246 | 281 |
247 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, | 282 def upload_dir_contents(self, source_dir, dest_bucket, dest_dir, |
248 upload_if=UploadIf.ALWAYS, **kwargs): | 283 upload_if=UploadIf.ALWAYS, **kwargs): |
249 """Recursively upload contents of a local directory to Google Storage. | 284 """Recursively upload contents of a local directory to Google Storage. |
250 | 285 |
251 params: | 286 params: |
252 source_dir: full path (local-OS-style) on local disk of directory to copy | 287 source_dir: full path (local-OS-style) on local disk of directory to copy |
253 contents of | 288 contents of |
254 dest_bucket: GS bucket to copy the files into | 289 dest_bucket: GS bucket to copy the files into |
(...skipping 75 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
330 b = self._connect_to_bucket(bucket=source_bucket) | 365 b = self._connect_to_bucket(bucket=source_bucket) |
331 key = Key(b) | 366 key = Key(b) |
332 key.name = source_path | 367 key.name = source_path |
333 if create_subdirs_if_needed: | 368 if create_subdirs_if_needed: |
334 _makedirs_if_needed(os.path.dirname(dest_path)) | 369 _makedirs_if_needed(os.path.dirname(dest_path)) |
335 with open(dest_path, 'w') as f: | 370 with open(dest_path, 'w') as f: |
336 try: | 371 try: |
337 key.get_contents_to_file(fp=f) | 372 key.get_contents_to_file(fp=f) |
338 except BotoServerError, e: | 373 except BotoServerError, e: |
339 e.body = (repr(e.body) + | 374 e.body = (repr(e.body) + |
340 ' while downloading bucket=%s, path=%s to local_path=%s' % ( | 375 ' while downloading gs://%s/%s to local_path=%s' % ( |
341 b.name, source_path, dest_path)) | 376 b.name, source_path, dest_path)) |
342 raise | 377 raise |
343 | 378 |
344 def download_dir_contents(self, source_bucket, source_dir, dest_dir): | 379 def download_dir_contents(self, source_bucket, source_dir, dest_dir): |
345 """Recursively download contents of a Google Storage directory to local disk | 380 """Recursively download contents of a Google Storage directory to local disk |
346 | 381 |
347 params: | 382 params: |
348 source_bucket: GS bucket to copy the files from | 383 source_bucket: GS bucket to copy the files from |
349 source_dir: full path (Posix-style) within that bucket; read the files | 384 source_dir: full path (Posix-style) within that bucket; read the files |
350 from this directory | 385 from this directory |
(...skipping 13 matching lines...) Expand all Loading... |
364 | 399 |
365 for filename in files: | 400 for filename in files: |
366 key = Key(b) | 401 key = Key(b) |
367 key.name = posixpath.join(source_dir, filename) | 402 key.name = posixpath.join(source_dir, filename) |
368 dest_path = os.path.join(dest_dir, filename) | 403 dest_path = os.path.join(dest_dir, filename) |
369 with open(dest_path, 'w') as f: | 404 with open(dest_path, 'w') as f: |
370 try: | 405 try: |
371 key.get_contents_to_file(fp=f) | 406 key.get_contents_to_file(fp=f) |
372 except BotoServerError, e: | 407 except BotoServerError, e: |
373 e.body = (repr(e.body) + | 408 e.body = (repr(e.body) + |
374 ' while downloading bucket=%s, path=%s to local_path=%s' % ( | 409 ' while downloading gs://%s/%s to local_path=%s' % ( |
375 b.name, key.name, dest_path)) | 410 b.name, key.name, dest_path)) |
376 raise | 411 raise |
377 | 412 |
378 for dirname in dirs: | 413 for dirname in dirs: |
379 self.download_dir_contents( # recurse | 414 self.download_dir_contents( # recurse |
380 source_bucket=source_bucket, | 415 source_bucket=source_bucket, |
381 source_dir=posixpath.join(source_dir, dirname), | 416 source_dir=posixpath.join(source_dir, dirname), |
382 dest_dir=os.path.join(dest_dir, dirname)) | 417 dest_dir=os.path.join(dest_dir, dirname)) |
383 | 418 |
384 def get_acl(self, bucket, path, id_type, id_value): | 419 def get_acl(self, bucket, path, id_type, id_value): |
(...skipping 183 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
568 | 603 |
569 def _get_local_md5(path): | 604 def _get_local_md5(path): |
570 """Returns the MD5 hash of a file on local disk.""" | 605 """Returns the MD5 hash of a file on local disk.""" |
571 hasher = hashlib.md5() | 606 hasher = hashlib.md5() |
572 with open(path, 'rb') as f: | 607 with open(path, 'rb') as f: |
573 while True: | 608 while True: |
574 data = f.read(64*1024) | 609 data = f.read(64*1024) |
575 if not data: | 610 if not data: |
576 return hasher.hexdigest() | 611 return hasher.hexdigest() |
577 hasher.update(data) | 612 hasher.update(data) |
OLD | NEW |