OLD | NEW |
(Empty) | |
| 1 # -*- coding: utf-8 -*- |
| 2 # Copyright 2015 Google Inc. All Rights Reserved. |
| 3 # |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 # you may not use this file except in compliance with the License. |
| 6 # You may obtain a copy of the License at |
| 7 # |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 # |
| 10 # Unless required by applicable law or agreed to in writing, software |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 # See the License for the specific language governing permissions and |
| 14 # limitations under the License. |
| 15 """Helper functions for tracker file functionality.""" |
| 16 |
| 17 import errno |
| 18 import hashlib |
| 19 import os |
| 20 import re |
| 21 |
| 22 from boto import config |
| 23 from gslib.exception import CommandException |
| 24 from gslib.util import CreateDirIfNeeded |
| 25 from gslib.util import GetGsutilStateDir |
| 26 from gslib.util import ResumableThreshold |
| 27 from gslib.util import UTF8 |
| 28 |
| 29 # The maximum length of a file name can vary wildly between different |
| 30 # operating systems, so we always ensure that tracker files are less |
| 31 # than 100 characters in order to avoid any such issues. |
| 32 MAX_TRACKER_FILE_NAME_LENGTH = 100 |
| 33 |
| 34 |
| 35 TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT = ( |
| 36 'Couldn\'t write tracker file (%s): %s. This can happen if gsutil is ' |
| 37 'configured to save tracker files to an unwritable directory)') |
| 38 |
| 39 |
| 40 class TrackerFileType(object): |
| 41 UPLOAD = 'upload' |
| 42 DOWNLOAD = 'download' |
| 43 PARALLEL_UPLOAD = 'parallel_upload' |
| 44 REWRITE = 'rewrite' |
| 45 |
| 46 |
| 47 def _HashFilename(filename): |
| 48 """Apply a hash function (SHA1) to shorten the passed file name. |
| 49 |
| 50 The spec for the hashed file name is as follows: |
| 51 |
| 52 TRACKER_<hash>_<trailing> |
| 53 |
| 54 where hash is a SHA1 hash on the original file name and trailing is |
| 55 the last 16 chars from the original file name. Max file name lengths |
| 56 vary by operating system so the goal of this function is to ensure |
| 57 the hashed version takes fewer than 100 characters. |
| 58 |
| 59 Args: |
| 60 filename: file name to be hashed. |
| 61 |
| 62 Returns: |
| 63 shorter, hashed version of passed file name |
| 64 """ |
| 65 if isinstance(filename, unicode): |
| 66 filename = filename.encode(UTF8) |
| 67 else: |
| 68 filename = unicode(filename, UTF8).encode(UTF8) |
| 69 m = hashlib.sha1(filename) |
| 70 return 'TRACKER_' + m.hexdigest() + '.' + filename[-16:] |
| 71 |
| 72 |
| 73 def CreateTrackerDirIfNeeded(): |
| 74 """Looks up or creates the gsutil tracker file directory. |
| 75 |
| 76 This is the configured directory where gsutil keeps its resumable transfer |
| 77 tracker files. This function creates it if it doesn't already exist. |
| 78 |
| 79 Returns: |
| 80 The pathname to the tracker directory. |
| 81 """ |
| 82 tracker_dir = config.get( |
| 83 'GSUtil', 'resumable_tracker_dir', |
| 84 os.path.join(GetGsutilStateDir(), 'tracker-files')) |
| 85 CreateDirIfNeeded(tracker_dir) |
| 86 return tracker_dir |
| 87 |
| 88 |
| 89 def GetRewriteTrackerFilePath(src_bucket_name, src_obj_name, dst_bucket_name, |
| 90 dst_obj_name, api_selector): |
| 91 """Gets the tracker file name described by the arguments. |
| 92 |
| 93 Args: |
| 94 src_bucket_name: Source bucket (string). |
| 95 src_obj_name: Source object (string). |
| 96 dst_bucket_name: Destination bucket (string). |
| 97 dst_obj_name: Destination object (string) |
| 98 api_selector: API to use for this operation. |
| 99 |
| 100 Returns: |
| 101 File path to tracker file. |
| 102 """ |
| 103 # Encode the src and dest bucket and object names into the tracker file |
| 104 # name. |
| 105 res_tracker_file_name = ( |
| 106 re.sub('[/\\\\]', '_', 'rewrite__%s__%s__%s__%s__%s.token' % |
| 107 (src_bucket_name, src_obj_name, dst_bucket_name, |
| 108 dst_obj_name, api_selector))) |
| 109 |
| 110 return _HashAndReturnPath(res_tracker_file_name, TrackerFileType.REWRITE) |
| 111 |
| 112 |
| 113 def GetTrackerFilePath(dst_url, tracker_file_type, api_selector, src_url=None): |
| 114 """Gets the tracker file name described by the arguments. |
| 115 |
| 116 Args: |
| 117 dst_url: Destination URL for tracker file. |
| 118 tracker_file_type: TrackerFileType for this operation. |
| 119 api_selector: API to use for this operation. |
| 120 src_url: Source URL for the source file name for parallel uploads. |
| 121 |
| 122 Returns: |
| 123 File path to tracker file. |
| 124 """ |
| 125 if tracker_file_type == TrackerFileType.UPLOAD: |
| 126 # Encode the dest bucket and object name into the tracker file name. |
| 127 res_tracker_file_name = ( |
| 128 re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s__%s.url' % |
| 129 (dst_url.bucket_name, dst_url.object_name, api_selector))) |
| 130 elif tracker_file_type == TrackerFileType.DOWNLOAD: |
| 131 # Encode the fully-qualified dest file name into the tracker file name. |
| 132 res_tracker_file_name = ( |
| 133 re.sub('[/\\\\]', '_', 'resumable_download__%s__%s.etag' % |
| 134 (os.path.realpath(dst_url.object_name), api_selector))) |
| 135 elif tracker_file_type == TrackerFileType.PARALLEL_UPLOAD: |
| 136 # Encode the dest bucket and object names as well as the source file name |
| 137 # into the tracker file name. |
| 138 res_tracker_file_name = ( |
| 139 re.sub('[/\\\\]', '_', 'parallel_upload__%s__%s__%s__%s.url' % |
| 140 (dst_url.bucket_name, dst_url.object_name, |
| 141 src_url, api_selector))) |
| 142 elif tracker_file_type == TrackerFileType.REWRITE: |
| 143 # Should use GetRewriteTrackerFilePath instead. |
| 144 raise NotImplementedError() |
| 145 |
| 146 return _HashAndReturnPath(res_tracker_file_name, tracker_file_type) |
| 147 |
| 148 |
| 149 def _HashAndReturnPath(res_tracker_file_name, tracker_file_type): |
| 150 resumable_tracker_dir = CreateTrackerDirIfNeeded() |
| 151 hashed_tracker_file_name = _HashFilename(res_tracker_file_name) |
| 152 tracker_file_name = '%s_%s' % (str(tracker_file_type).lower(), |
| 153 hashed_tracker_file_name) |
| 154 tracker_file_path = '%s%s%s' % (resumable_tracker_dir, os.sep, |
| 155 tracker_file_name) |
| 156 assert len(tracker_file_name) < MAX_TRACKER_FILE_NAME_LENGTH |
| 157 return tracker_file_path |
| 158 |
| 159 |
| 160 def DeleteTrackerFile(tracker_file_name): |
| 161 if tracker_file_name and os.path.exists(tracker_file_name): |
| 162 os.unlink(tracker_file_name) |
| 163 |
| 164 |
| 165 def HashRewriteParameters( |
| 166 src_obj_metadata, dst_obj_metadata, projection, src_generation=None, |
| 167 gen_match=None, meta_gen_match=None, canned_acl=None, fields=None, |
| 168 max_bytes_per_call=None): |
| 169 """Creates an MD5 hex digest of the parameters for a rewrite call. |
| 170 |
| 171 Resuming rewrites requires that the input parameters are identical. Thus, |
| 172 the rewrite tracker file needs to represent the input parameters. For |
| 173 easy comparison, hash the input values. If a user does a performs a |
| 174 same-source/same-destination rewrite via a different command (for example, |
| 175 with a changed ACL), the hashes will not match and we will restart the |
| 176 rewrite from the beginning. |
| 177 |
| 178 Args: |
| 179 src_obj_metadata: apitools Object describing source object. Must include |
| 180 bucket, name, and etag. |
| 181 dst_obj_metadata: apitools Object describing destination object. Must |
| 182 include bucket and object name |
| 183 projection: Projection used for the API call. |
| 184 src_generation: Optional source generation. |
| 185 gen_match: Optional generation precondition. |
| 186 meta_gen_match: Optional metageneration precondition. |
| 187 canned_acl: Optional canned ACL string. |
| 188 fields: Optional fields to include in response. |
| 189 max_bytes_per_call: Optional maximum bytes rewritten per call. |
| 190 |
| 191 Returns: |
| 192 MD5 hex digest Hash of the input parameters, or None if required parameters |
| 193 are missing. |
| 194 """ |
| 195 if (not src_obj_metadata or |
| 196 not src_obj_metadata.bucket or |
| 197 not src_obj_metadata.name or |
| 198 not src_obj_metadata.etag or |
| 199 not dst_obj_metadata or |
| 200 not dst_obj_metadata.bucket or |
| 201 not dst_obj_metadata.name or |
| 202 not projection): |
| 203 return |
| 204 md5_hash = hashlib.md5() |
| 205 for input_param in ( |
| 206 src_obj_metadata, dst_obj_metadata, projection, src_generation, |
| 207 gen_match, meta_gen_match, canned_acl, fields, max_bytes_per_call): |
| 208 md5_hash.update(str(input_param)) |
| 209 return md5_hash.hexdigest() |
| 210 |
| 211 |
| 212 def ReadRewriteTrackerFile(tracker_file_name, rewrite_params_hash): |
| 213 """Attempts to read a rewrite tracker file. |
| 214 |
| 215 Args: |
| 216 tracker_file_name: Tracker file path string. |
| 217 rewrite_params_hash: MD5 hex digest of rewrite call parameters constructed |
| 218 by HashRewriteParameters. |
| 219 |
| 220 Returns: |
| 221 String rewrite_token for resuming rewrite requests if a matching tracker |
| 222 file exists, None otherwise (which will result in starting a new rewrite). |
| 223 """ |
| 224 # Check to see if we already have a matching tracker file. |
| 225 tracker_file = None |
| 226 if not rewrite_params_hash: |
| 227 return |
| 228 try: |
| 229 tracker_file = open(tracker_file_name, 'r') |
| 230 existing_hash = tracker_file.readline().rstrip('\n') |
| 231 if existing_hash == rewrite_params_hash: |
| 232 # Next line is the rewrite token. |
| 233 return tracker_file.readline().rstrip('\n') |
| 234 except IOError as e: |
| 235 # Ignore non-existent file (happens first time a rewrite is attempted. |
| 236 if e.errno != errno.ENOENT: |
| 237 print('Couldn\'t read Copy tracker file (%s): %s. Restarting copy ' |
| 238 'from scratch.' % |
| 239 (tracker_file_name, e.strerror)) |
| 240 finally: |
| 241 if tracker_file: |
| 242 tracker_file.close() |
| 243 |
| 244 |
| 245 def WriteRewriteTrackerFile(tracker_file_name, rewrite_params_hash, |
| 246 rewrite_token): |
| 247 """Writes a rewrite tracker file. |
| 248 |
| 249 Args: |
| 250 tracker_file_name: Tracker file path string. |
| 251 rewrite_params_hash: MD5 hex digest of rewrite call parameters constructed |
| 252 by HashRewriteParameters. |
| 253 rewrite_token: Rewrite token string returned by the service. |
| 254 """ |
| 255 _WriteTrackerFile(tracker_file_name, '%s\n%s\n' % (rewrite_params_hash, |
| 256 rewrite_token)) |
| 257 |
| 258 |
| 259 def ReadOrCreateDownloadTrackerFile(src_obj_metadata, dst_url, |
| 260 api_selector): |
| 261 """Checks for a download tracker file and creates one if it does not exist. |
| 262 |
| 263 Args: |
| 264 src_obj_metadata: Metadata for the source object. Must include |
| 265 etag and size. |
| 266 dst_url: Destination file StorageUrl. |
| 267 api_selector: API mode to use (for tracker file naming). |
| 268 |
| 269 Returns: |
| 270 True if the tracker file already exists (resume existing download), |
| 271 False if we created a new tracker file (new download). |
| 272 """ |
| 273 if src_obj_metadata.size < ResumableThreshold(): |
| 274 # Don't create a tracker file for a small downloads; cross-process resumes |
| 275 # won't work, but restarting a small download is inexpensive. |
| 276 return False |
| 277 |
| 278 assert src_obj_metadata.etag |
| 279 tracker_file_name = GetTrackerFilePath( |
| 280 dst_url, TrackerFileType.DOWNLOAD, api_selector) |
| 281 tracker_file = None |
| 282 |
| 283 # Check to see if we already have a matching tracker file. |
| 284 try: |
| 285 tracker_file = open(tracker_file_name, 'r') |
| 286 etag_value = tracker_file.readline().rstrip('\n') |
| 287 if etag_value == src_obj_metadata.etag: |
| 288 return True |
| 289 except IOError as e: |
| 290 # Ignore non-existent file (happens first time a download |
| 291 # is attempted on an object), but warn user for other errors. |
| 292 if e.errno != errno.ENOENT: |
| 293 print('Couldn\'t read URL tracker file (%s): %s. Restarting ' |
| 294 'download from scratch.' % |
| 295 (tracker_file_name, e.strerror)) |
| 296 finally: |
| 297 if tracker_file: |
| 298 tracker_file.close() |
| 299 |
| 300 # Otherwise, create a new tracker file and start from scratch. |
| 301 _WriteTrackerFile(tracker_file_name, '%s\n' % src_obj_metadata.etag) |
| 302 |
| 303 |
| 304 def _WriteTrackerFile(tracker_file_name, data): |
| 305 """Creates a tracker file, storing the input data.""" |
| 306 try: |
| 307 with os.fdopen(os.open(tracker_file_name, |
| 308 os.O_WRONLY | os.O_CREAT, 0600), 'w') as tf: |
| 309 tf.write(data) |
| 310 return False |
| 311 except (IOError, OSError) as e: |
| 312 raise RaiseUnwritableTrackerFileException(tracker_file_name, e.strerror) |
| 313 |
| 314 |
| 315 def RaiseUnwritableTrackerFileException(tracker_file_name, error_str): |
| 316 """Raises an exception when unable to write the tracker file.""" |
| 317 raise CommandException(TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT % |
| 318 (tracker_file_name, error_str)) |
OLD | NEW |