OLD | NEW |
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 | 2 |
3 # pylint: disable=C0301 | 3 # pylint: disable=C0301 |
4 """ | 4 """ |
5 Copyright 2014 Google Inc. | 5 Copyright 2014 Google Inc. |
6 | 6 |
7 Use of this source code is governed by a BSD-style license that can be | 7 Use of this source code is governed by a BSD-style license that can be |
8 found in the LICENSE file. | 8 found in the LICENSE file. |
9 | 9 |
10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper | 10 Utilities for accessing Google Cloud Storage, using the boto library (wrapper |
11 for the XML API). | 11 for the XML API). |
12 | 12 |
13 API/library references: | 13 API/library references: |
14 - https://developers.google.com/storage/docs/reference-guide | 14 - https://developers.google.com/storage/docs/reference-guide |
15 - http://googlecloudstorage.blogspot.com/2012/09/google-cloud-storage-tutorial-u
sing-boto.html | 15 - http://googlecloudstorage.blogspot.com/2012/09/google-cloud-storage-tutorial-u
sing-boto.html |
16 """ | 16 """ |
17 # pylint: enable=C0301 | 17 # pylint: enable=C0301 |
18 | 18 |
19 # System-level imports | 19 # System-level imports |
20 import errno | 20 import errno |
21 import hashlib | 21 import hashlib |
22 import logging | |
23 import os | 22 import os |
24 import posixpath | 23 import posixpath |
25 import Queue | 24 import Queue |
26 import re | 25 import re |
27 import sys | 26 import sys |
28 import threading | 27 import threading |
29 | 28 |
30 # Imports from third-party code | 29 # Imports from third-party code |
31 TRUNK_DIRECTORY = os.path.abspath(os.path.join( | 30 TRUNK_DIRECTORY = os.path.abspath(os.path.join( |
32 os.path.dirname(__file__), os.pardir, os.pardir)) | 31 os.path.dirname(__file__), os.pardir, os.pardir)) |
(...skipping 86 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
119 take longer than just uploading the file. | 118 take longer than just uploading the file. |
120 See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up | 119 See http://skbug.com/2778 ('gs_utils: when uploading IF_NEW, batch up |
121 checks for existing files within a single remote directory') | 120 checks for existing files within a single remote directory') |
122 """ | 121 """ |
123 ALWAYS = 1 # always upload the file | 122 ALWAYS = 1 # always upload the file |
124 IF_NEW = 2 # if there is an existing file with the same name, | 123 IF_NEW = 2 # if there is an existing file with the same name, |
125 # leave it alone | 124 # leave it alone |
126 IF_MODIFIED = 3 # if there is an existing file with the same name and | 125 IF_MODIFIED = 3 # if there is an existing file with the same name and |
127 # contents, leave it alone | 126 # contents, leave it alone |
128 | 127 |
129 def __init__(self, boto_file_path=None, logger=None): | 128 def __init__(self, boto_file_path=None): |
130 """Constructor. | 129 """Constructor. |
131 | 130 |
132 Params: | 131 Params: |
133 boto_file_path: full path (local-OS-style) on local disk where .boto | 132 boto_file_path: full path (local-OS-style) on local disk where .boto |
134 credentials file can be found. If None, then the GSUtils object | 133 credentials file can be found. If None, then the GSUtils object |
135 created will be able to access only public files in Google Storage. | 134 created will be able to access only public files in Google Storage. |
136 logger: a logging.Logger instance to use for logging output; if None, | |
137 one will be created with default characteristics | |
138 | 135 |
139 Raises an exception if no file is found at boto_file_path, or if the file | 136 Raises an exception if no file is found at boto_file_path, or if the file |
140 found there is malformed. | 137 found there is malformed. |
141 """ | 138 """ |
142 self.logger = logger or logging.getLogger(__name__) | |
143 self._gs_access_key_id = None | 139 self._gs_access_key_id = None |
144 self._gs_secret_access_key = None | 140 self._gs_secret_access_key = None |
145 if boto_file_path: | 141 if boto_file_path: |
146 self.logger.info('Reading boto file from %s' % boto_file_path) | 142 print ('Reading boto file from %s' % boto_file_path) |
147 boto_dict = _config_file_as_dict(filepath=boto_file_path) | 143 boto_dict = _config_file_as_dict(filepath=boto_file_path) |
148 self._gs_access_key_id = boto_dict['gs_access_key_id'] | 144 self._gs_access_key_id = boto_dict['gs_access_key_id'] |
149 self._gs_secret_access_key = boto_dict['gs_secret_access_key'] | 145 self._gs_secret_access_key = boto_dict['gs_secret_access_key'] |
150 # Which field we get/set in ACL entries, depending on IdType. | 146 # Which field we get/set in ACL entries, depending on IdType. |
151 self._field_by_id_type = { | 147 self._field_by_id_type = { |
152 self.IdType.GROUP_BY_DOMAIN: 'domain', | 148 self.IdType.GROUP_BY_DOMAIN: 'domain', |
153 self.IdType.GROUP_BY_EMAIL: 'email_address', | 149 self.IdType.GROUP_BY_EMAIL: 'email_address', |
154 self.IdType.GROUP_BY_ID: 'id', | 150 self.IdType.GROUP_BY_ID: 'id', |
155 self.IdType.USER_BY_EMAIL: 'email_address', | 151 self.IdType.USER_BY_EMAIL: 'email_address', |
156 self.IdType.USER_BY_ID: 'id', | 152 self.IdType.USER_BY_ID: 'id', |
(...skipping 64 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
221 so that HTTP downloads of the file would be unzipped automatically. | 217 so that HTTP downloads of the file would be unzipped automatically. |
222 See https://developers.google.com/storage/docs/gsutil/addlhelp/ | 218 See https://developers.google.com/storage/docs/gsutil/addlhelp/ |
223 WorkingWithObjectMetadata#content-encoding | 219 WorkingWithObjectMetadata#content-encoding |
224 """ | 220 """ |
225 b = self._connect_to_bucket(bucket=dest_bucket) | 221 b = self._connect_to_bucket(bucket=dest_bucket) |
226 local_md5 = None # filled in lazily | 222 local_md5 = None # filled in lazily |
227 | 223 |
228 if upload_if == self.UploadIf.IF_NEW: | 224 if upload_if == self.UploadIf.IF_NEW: |
229 old_key = b.get_key(key_name=dest_path) | 225 old_key = b.get_key(key_name=dest_path) |
230 if old_key: | 226 if old_key: |
231 self.logger.info('Skipping upload of existing file gs://%s/%s' % ( | 227 print ('Skipping upload of existing file gs://%s/%s' % ( |
232 b.name, dest_path)) | 228 b.name, dest_path)) |
233 return | 229 return |
234 elif upload_if == self.UploadIf.IF_MODIFIED: | 230 elif upload_if == self.UploadIf.IF_MODIFIED: |
235 old_key = b.get_key(key_name=dest_path) | 231 old_key = b.get_key(key_name=dest_path) |
236 if old_key: | 232 if old_key: |
237 if not local_md5: | 233 if not local_md5: |
238 local_md5 = _get_local_md5(path=source_path) | 234 local_md5 = _get_local_md5(path=source_path) |
239 if ('"%s"' % local_md5) == old_key.etag: | 235 if ('"%s"' % local_md5) == old_key.etag: |
240 self.logger.info( | 236 print ( |
241 'Skipping upload of unmodified file gs://%s/%s : %s' % ( | 237 'Skipping upload of unmodified file gs://%s/%s : %s' % ( |
242 b.name, dest_path, local_md5)) | 238 b.name, dest_path, local_md5)) |
243 return | 239 return |
244 elif upload_if != self.UploadIf.ALWAYS: | 240 elif upload_if != self.UploadIf.ALWAYS: |
245 raise Exception('unknown value of upload_if: %s' % upload_if) | 241 raise Exception('unknown value of upload_if: %s' % upload_if) |
246 | 242 |
247 # Upload the file using a temporary name at first, in case the transfer | 243 # Upload the file using a temporary name at first, in case the transfer |
248 # is interrupted partway through. | 244 # is interrupted partway through. |
249 if not local_md5: | 245 if not local_md5: |
250 local_md5 = _get_local_md5(path=source_path) | 246 local_md5 = _get_local_md5(path=source_path) |
(...skipping 103 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
354 local_md5 = '"%s"' % _get_local_md5(path=os.path.join( | 350 local_md5 = '"%s"' % _get_local_md5(path=os.path.join( |
355 source_dir, rel_path)) | 351 source_dir, rel_path)) |
356 key = existing_dest_filemap[rel_path] | 352 key = existing_dest_filemap[rel_path] |
357 if local_md5 == key.etag: | 353 if local_md5 == key.etag: |
358 source_fileset.remove(rel_path) | 354 source_fileset.remove(rel_path) |
359 else: | 355 else: |
360 raise Exception('unknown value of upload_if: %s' % upload_if) | 356 raise Exception('unknown value of upload_if: %s' % upload_if) |
361 | 357 |
362 # Upload any files still in source_fileset. | 358 # Upload any files still in source_fileset. |
363 num_files_to_upload = len(source_fileset) | 359 num_files_to_upload = len(source_fileset) |
364 self.logger.info('Uploading %d files, skipping %d ...' % ( | 360 print ('Uploading %d files, skipping %d ...' % ( |
365 num_files_to_upload, num_files_total - num_files_to_upload)) | 361 num_files_to_upload, num_files_total - num_files_to_upload)) |
366 if num_files_to_upload == 0: | 362 if num_files_to_upload == 0: |
367 return | 363 return |
368 if num_threads > num_files_to_upload: | 364 if num_threads > num_files_to_upload: |
369 num_threads = num_files_to_upload | 365 num_threads = num_files_to_upload |
370 | 366 |
371 # Create a work queue with all files that need to be uploaded. | 367 # Create a work queue with all files that need to be uploaded. |
372 q = Queue.Queue(maxsize=num_files_to_upload) | 368 q = Queue.Queue(maxsize=num_files_to_upload) |
373 for rel_path in source_fileset: | 369 for rel_path in source_fileset: |
374 q.put(rel_path) | 370 q.put(rel_path) |
375 | 371 |
376 # Spin up worker threads to read from the task queue. | 372 # Spin up worker threads to read from the task queue. |
377 def worker(): | 373 def worker(): |
378 while True: | 374 while True: |
379 try: | 375 try: |
380 rel_path = q.get(block=False) | 376 rel_path = q.get(block=False) |
381 except Queue.Empty: | 377 except Queue.Empty: |
382 return # no more tasks in the queue, so exit | 378 return # no more tasks in the queue, so exit |
383 self.logger.info(' Uploading file %d/%d: %s' % ( | 379 print (' Uploading file %d/%d: %s' % ( |
384 num_files_to_upload - q.qsize(), num_files_to_upload, rel_path)) | 380 num_files_to_upload - q.qsize(), num_files_to_upload, rel_path)) |
385 self.upload_file( | 381 self.upload_file( |
386 source_path=os.path.join(source_dir, rel_path), | 382 source_path=os.path.join(source_dir, rel_path), |
387 dest_bucket=b, | 383 dest_bucket=b, |
388 dest_path=posixpath.join(dest_dir, rel_path), | 384 dest_path=posixpath.join(dest_dir, rel_path), |
389 upload_if=self.UploadIf.ALWAYS, | 385 upload_if=self.UploadIf.ALWAYS, |
390 **kwargs) | 386 **kwargs) |
391 q.task_done() | 387 q.task_done() |
392 for _ in range(num_threads): | 388 for _ in range(num_threads): |
393 t = threading.Thread(target=worker) | 389 t = threading.Thread(target=worker) |
(...skipping 255 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
649 | 645 |
650 def _get_local_md5(path): | 646 def _get_local_md5(path): |
651 """Returns the MD5 hash of a file on local disk.""" | 647 """Returns the MD5 hash of a file on local disk.""" |
652 hasher = hashlib.md5() | 648 hasher = hashlib.md5() |
653 with open(path, 'rb') as f: | 649 with open(path, 'rb') as f: |
654 while True: | 650 while True: |
655 data = f.read(64*1024) | 651 data = f.read(64*1024) |
656 if not data: | 652 if not data: |
657 return hasher.hexdigest() | 653 return hasher.hexdigest() |
658 hasher.update(data) | 654 hasher.update(data) |
OLD | NEW |