OLD | NEW |
1 # -*- coding: utf-8 -*- | 1 # -*- coding: utf-8 -*- |
2 # Copyright 2015 Google Inc. All Rights Reserved. | 2 # Copyright 2015 Google Inc. All Rights Reserved. |
3 # | 3 # |
4 # Licensed under the Apache License, Version 2.0 (the "License"); | 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
5 # you may not use this file except in compliance with the License. | 5 # you may not use this file except in compliance with the License. |
6 # You may obtain a copy of the License at | 6 # You may obtain a copy of the License at |
7 # | 7 # |
8 # http://www.apache.org/licenses/LICENSE-2.0 | 8 # http://www.apache.org/licenses/LICENSE-2.0 |
9 # | 9 # |
10 # Unless required by applicable law or agreed to in writing, software | 10 # Unless required by applicable law or agreed to in writing, software |
11 # distributed under the License is distributed on an "AS IS" BASIS, | 11 # distributed under the License is distributed on an "AS IS" BASIS, |
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 # See the License for the specific language governing permissions and | 13 # See the License for the specific language governing permissions and |
14 # limitations under the License. | 14 # limitations under the License. |
15 """Helper functions for tracker file functionality.""" | 15 """Helper functions for tracker file functionality.""" |
16 | 16 |
17 import errno | 17 import errno |
18 import hashlib | 18 import hashlib |
| 19 import json |
19 import os | 20 import os |
20 import re | 21 import re |
21 | 22 |
22 from boto import config | 23 from boto import config |
23 from gslib.exception import CommandException | 24 from gslib.exception import CommandException |
24 from gslib.util import CreateDirIfNeeded | 25 from gslib.util import CreateDirIfNeeded |
25 from gslib.util import GetGsutilStateDir | 26 from gslib.util import GetGsutilStateDir |
26 from gslib.util import ResumableThreshold | 27 from gslib.util import ResumableThreshold |
27 from gslib.util import UTF8 | 28 from gslib.util import UTF8 |
28 | 29 |
29 # The maximum length of a file name can vary wildly between different | 30 # The maximum length of a file name can vary wildly between different |
30 # operating systems, so we always ensure that tracker files are less | 31 # operating systems, so we always ensure that tracker files are less |
31 # than 100 characters in order to avoid any such issues. | 32 # than 100 characters in order to avoid any such issues. |
32 MAX_TRACKER_FILE_NAME_LENGTH = 100 | 33 MAX_TRACKER_FILE_NAME_LENGTH = 100 |
33 | 34 |
34 | 35 |
35 TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT = ( | 36 TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT = ( |
36 'Couldn\'t write tracker file (%s): %s. This can happen if gsutil is ' | 37 'Couldn\'t write tracker file (%s): %s. This can happen if gsutil is ' |
37 'configured to save tracker files to an unwritable directory)') | 38 'configured to save tracker files to an unwritable directory)') |
38 | 39 |
39 | 40 |
40 class TrackerFileType(object): | 41 class TrackerFileType(object): |
41 UPLOAD = 'upload' | 42 UPLOAD = 'upload' |
42 DOWNLOAD = 'download' | 43 DOWNLOAD = 'download' |
| 44 DOWNLOAD_COMPONENT = 'download_component' |
43 PARALLEL_UPLOAD = 'parallel_upload' | 45 PARALLEL_UPLOAD = 'parallel_upload' |
| 46 SLICED_DOWNLOAD = 'sliced_download' |
44 REWRITE = 'rewrite' | 47 REWRITE = 'rewrite' |
45 | 48 |
46 | 49 |
47 def _HashFilename(filename): | 50 def _HashFilename(filename): |
48 """Apply a hash function (SHA1) to shorten the passed file name. | 51 """Apply a hash function (SHA1) to shorten the passed file name. |
49 | 52 |
50 The spec for the hashed file name is as follows: | 53 The spec for the hashed file name is as follows: |
51 | 54 |
52 TRACKER_<hash>_<trailing> | 55 TRACKER_<hash>_<trailing> |
53 | 56 |
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
103 # Encode the src and dest bucket and object names into the tracker file | 106 # Encode the src and dest bucket and object names into the tracker file |
104 # name. | 107 # name. |
105 res_tracker_file_name = ( | 108 res_tracker_file_name = ( |
106 re.sub('[/\\\\]', '_', 'rewrite__%s__%s__%s__%s__%s.token' % | 109 re.sub('[/\\\\]', '_', 'rewrite__%s__%s__%s__%s__%s.token' % |
107 (src_bucket_name, src_obj_name, dst_bucket_name, | 110 (src_bucket_name, src_obj_name, dst_bucket_name, |
108 dst_obj_name, api_selector))) | 111 dst_obj_name, api_selector))) |
109 | 112 |
110 return _HashAndReturnPath(res_tracker_file_name, TrackerFileType.REWRITE) | 113 return _HashAndReturnPath(res_tracker_file_name, TrackerFileType.REWRITE) |
111 | 114 |
112 | 115 |
113 def GetTrackerFilePath(dst_url, tracker_file_type, api_selector, src_url=None): | 116 def GetTrackerFilePath(dst_url, tracker_file_type, api_selector, src_url=None, |
| 117 component_num=None): |
114 """Gets the tracker file name described by the arguments. | 118 """Gets the tracker file name described by the arguments. |
115 | 119 |
116 Args: | 120 Args: |
117 dst_url: Destination URL for tracker file. | 121 dst_url: Destination URL for tracker file. |
118 tracker_file_type: TrackerFileType for this operation. | 122 tracker_file_type: TrackerFileType for this operation. |
119 api_selector: API to use for this operation. | 123 api_selector: API to use for this operation. |
120 src_url: Source URL for the source file name for parallel uploads. | 124 src_url: Source URL for the source file name for parallel uploads. |
| 125 component_num: Component number if this is a download component, else None. |
121 | 126 |
122 Returns: | 127 Returns: |
123 File path to tracker file. | 128 File path to tracker file. |
124 """ | 129 """ |
125 if tracker_file_type == TrackerFileType.UPLOAD: | 130 if tracker_file_type == TrackerFileType.UPLOAD: |
126 # Encode the dest bucket and object name into the tracker file name. | 131 # Encode the dest bucket and object name into the tracker file name. |
127 res_tracker_file_name = ( | 132 res_tracker_file_name = ( |
128 re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s__%s.url' % | 133 re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s__%s.url' % |
129 (dst_url.bucket_name, dst_url.object_name, api_selector))) | 134 (dst_url.bucket_name, dst_url.object_name, api_selector))) |
130 elif tracker_file_type == TrackerFileType.DOWNLOAD: | 135 elif tracker_file_type == TrackerFileType.DOWNLOAD: |
131 # Encode the fully-qualified dest file name into the tracker file name. | 136 # Encode the fully-qualified dest file name into the tracker file name. |
132 res_tracker_file_name = ( | 137 res_tracker_file_name = ( |
133 re.sub('[/\\\\]', '_', 'resumable_download__%s__%s.etag' % | 138 re.sub('[/\\\\]', '_', 'resumable_download__%s__%s.etag' % |
134 (os.path.realpath(dst_url.object_name), api_selector))) | 139 (os.path.realpath(dst_url.object_name), api_selector))) |
| 140 elif tracker_file_type == TrackerFileType.DOWNLOAD_COMPONENT: |
| 141 # Encode the fully-qualified dest file name and the component number |
| 142 # into the tracker file name. |
| 143 res_tracker_file_name = ( |
| 144 re.sub('[/\\\\]', '_', 'resumable_download__%s__%s__%d.etag' % |
| 145 (os.path.realpath(dst_url.object_name), api_selector, |
| 146 component_num))) |
135 elif tracker_file_type == TrackerFileType.PARALLEL_UPLOAD: | 147 elif tracker_file_type == TrackerFileType.PARALLEL_UPLOAD: |
136 # Encode the dest bucket and object names as well as the source file name | 148 # Encode the dest bucket and object names as well as the source file name |
137 # into the tracker file name. | 149 # into the tracker file name. |
138 res_tracker_file_name = ( | 150 res_tracker_file_name = ( |
139 re.sub('[/\\\\]', '_', 'parallel_upload__%s__%s__%s__%s.url' % | 151 re.sub('[/\\\\]', '_', 'parallel_upload__%s__%s__%s__%s.url' % |
140 (dst_url.bucket_name, dst_url.object_name, | 152 (dst_url.bucket_name, dst_url.object_name, |
141 src_url, api_selector))) | 153 src_url, api_selector))) |
| 154 elif tracker_file_type == TrackerFileType.SLICED_DOWNLOAD: |
| 155 # Encode the fully-qualified dest file name into the tracker file name. |
| 156 res_tracker_file_name = ( |
| 157 re.sub('[/\\\\]', '_', 'sliced_download__%s__%s.etag' % |
| 158 (os.path.realpath(dst_url.object_name), api_selector))) |
142 elif tracker_file_type == TrackerFileType.REWRITE: | 159 elif tracker_file_type == TrackerFileType.REWRITE: |
143 # Should use GetRewriteTrackerFilePath instead. | 160 # Should use GetRewriteTrackerFilePath instead. |
144 raise NotImplementedError() | 161 raise NotImplementedError() |
145 | 162 |
146 return _HashAndReturnPath(res_tracker_file_name, tracker_file_type) | 163 return _HashAndReturnPath(res_tracker_file_name, tracker_file_type) |
147 | 164 |
148 | 165 |
| 166 def DeleteDownloadTrackerFiles(dst_url, api_selector): |
| 167 """Deletes all tracker files corresponding to an object download. |
| 168 |
| 169 Args: |
| 170 dst_url: StorageUrl describing the destination file. |
| 171 api_selector: The Cloud API implementation used. |
| 172 """ |
| 173 # Delete non-sliced download tracker file. |
| 174 DeleteTrackerFile(GetTrackerFilePath(dst_url, TrackerFileType.DOWNLOAD, |
| 175 api_selector)) |
| 176 |
| 177 # Delete all sliced download tracker files. |
| 178 tracker_files = GetSlicedDownloadTrackerFilePaths(dst_url, api_selector) |
| 179 for tracker_file in tracker_files: |
| 180 DeleteTrackerFile(tracker_file) |
| 181 |
| 182 |
| 183 def GetSlicedDownloadTrackerFilePaths(dst_url, api_selector, |
| 184 num_components=None): |
| 185 """Gets a list of sliced download tracker file paths. |
| 186 |
| 187 The list consists of the parent tracker file path in index 0, and then |
| 188 any existing component tracker files in [1:]. |
| 189 |
| 190 Args: |
| 191 dst_url: Destination URL for tracker file. |
| 192 api_selector: API to use for this operation. |
| 193 num_components: The number of component tracker files, if already known. |
| 194 If not known, the number will be retrieved from the parent |
| 195 tracker file on disk. |
| 196 Returns: |
| 197 File path to tracker file. |
| 198 """ |
| 199 parallel_tracker_file_path = GetTrackerFilePath( |
| 200 dst_url, TrackerFileType.SLICED_DOWNLOAD, api_selector) |
| 201 tracker_file_paths = [parallel_tracker_file_path] |
| 202 |
| 203 # If we don't know the number of components, check the tracker file. |
| 204 if num_components is None: |
| 205 tracker_file = None |
| 206 try: |
| 207 tracker_file = open(parallel_tracker_file_path, 'r') |
| 208 num_components = json.load(tracker_file)['num_components'] |
| 209 except (IOError, ValueError): |
| 210 return tracker_file_paths |
| 211 finally: |
| 212 if tracker_file: |
| 213 tracker_file.close() |
| 214 |
| 215 for i in range(num_components): |
| 216 tracker_file_paths.append(GetTrackerFilePath( |
| 217 dst_url, TrackerFileType.DOWNLOAD_COMPONENT, api_selector, |
| 218 component_num=i)) |
| 219 |
| 220 return tracker_file_paths |
| 221 |
| 222 |
149 def _HashAndReturnPath(res_tracker_file_name, tracker_file_type): | 223 def _HashAndReturnPath(res_tracker_file_name, tracker_file_type): |
| 224 """Hashes and returns a tracker file path. |
| 225 |
| 226 Args: |
| 227 res_tracker_file_name: The tracker file name prior to it being hashed. |
| 228 tracker_file_type: The TrackerFileType of res_tracker_file_name. |
| 229 |
| 230 Returns: |
| 231 Final (hashed) tracker file path. |
| 232 """ |
150 resumable_tracker_dir = CreateTrackerDirIfNeeded() | 233 resumable_tracker_dir = CreateTrackerDirIfNeeded() |
151 hashed_tracker_file_name = _HashFilename(res_tracker_file_name) | 234 hashed_tracker_file_name = _HashFilename(res_tracker_file_name) |
152 tracker_file_name = '%s_%s' % (str(tracker_file_type).lower(), | 235 tracker_file_name = '%s_%s' % (str(tracker_file_type).lower(), |
153 hashed_tracker_file_name) | 236 hashed_tracker_file_name) |
154 tracker_file_path = '%s%s%s' % (resumable_tracker_dir, os.sep, | 237 tracker_file_path = '%s%s%s' % (resumable_tracker_dir, os.sep, |
155 tracker_file_name) | 238 tracker_file_name) |
156 assert len(tracker_file_name) < MAX_TRACKER_FILE_NAME_LENGTH | 239 assert len(tracker_file_name) < MAX_TRACKER_FILE_NAME_LENGTH |
157 return tracker_file_path | 240 return tracker_file_path |
158 | 241 |
159 | 242 |
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
249 Args: | 332 Args: |
250 tracker_file_name: Tracker file path string. | 333 tracker_file_name: Tracker file path string. |
251 rewrite_params_hash: MD5 hex digest of rewrite call parameters constructed | 334 rewrite_params_hash: MD5 hex digest of rewrite call parameters constructed |
252 by HashRewriteParameters. | 335 by HashRewriteParameters. |
253 rewrite_token: Rewrite token string returned by the service. | 336 rewrite_token: Rewrite token string returned by the service. |
254 """ | 337 """ |
255 _WriteTrackerFile(tracker_file_name, '%s\n%s\n' % (rewrite_params_hash, | 338 _WriteTrackerFile(tracker_file_name, '%s\n%s\n' % (rewrite_params_hash, |
256 rewrite_token)) | 339 rewrite_token)) |
257 | 340 |
258 | 341 |
259 def ReadOrCreateDownloadTrackerFile(src_obj_metadata, dst_url, | 342 def ReadOrCreateDownloadTrackerFile(src_obj_metadata, dst_url, logger, |
260 api_selector): | 343 api_selector, start_byte, |
| 344 existing_file_size, component_num=None): |
261 """Checks for a download tracker file and creates one if it does not exist. | 345 """Checks for a download tracker file and creates one if it does not exist. |
262 | 346 |
| 347 The methodology for determining the download start point differs between |
| 348 normal and sliced downloads. For normal downloads, the existing bytes in |
| 349 the file are presumed to be correct and have been previously downloaded from |
| 350 the server (if a tracker file exists). In this case, the existing file size |
| 351 is used to determine the download start point. For sliced downloads, the |
| 352 number of bytes previously retrieved from the server cannot be determined |
| 353 from the existing file size, and so the number of bytes known to have been |
| 354 previously downloaded is retrieved from the tracker file. |
| 355 |
263 Args: | 356 Args: |
264 src_obj_metadata: Metadata for the source object. Must include | 357 src_obj_metadata: Metadata for the source object. Must include etag and |
265 etag and size. | 358 generation. |
266 dst_url: Destination file StorageUrl. | 359 dst_url: Destination URL for tracker file. |
267 api_selector: API mode to use (for tracker file naming). | 360 logger: For outputting log messages. |
| 361 api_selector: API to use for this operation. |
| 362 start_byte: The start byte of the byte range for this download. |
| 363 existing_file_size: Size of existing file for this download on disk. |
| 364 component_num: The component number, if this is a component of a parallel |
| 365 download, else None. |
268 | 366 |
269 Returns: | 367 Returns: |
270 True if the tracker file already exists (resume existing download), | 368 tracker_file_name: The name of the tracker file, if one was used. |
271 False if we created a new tracker file (new download). | 369 download_start_byte: The first byte that still needs to be downloaded. |
272 """ | 370 """ |
| 371 assert src_obj_metadata.etag |
| 372 |
| 373 tracker_file_name = None |
273 if src_obj_metadata.size < ResumableThreshold(): | 374 if src_obj_metadata.size < ResumableThreshold(): |
274 # Don't create a tracker file for a small downloads; cross-process resumes | 375 # Don't create a tracker file for a small downloads; cross-process resumes |
275 # won't work, but restarting a small download is inexpensive. | 376 # won't work, but restarting a small download is inexpensive. |
276 return False | 377 return tracker_file_name, start_byte |
277 | 378 |
278 assert src_obj_metadata.etag | 379 download_name = dst_url.object_name |
279 tracker_file_name = GetTrackerFilePath( | 380 if component_num is None: |
280 dst_url, TrackerFileType.DOWNLOAD, api_selector) | 381 tracker_file_type = TrackerFileType.DOWNLOAD |
| 382 else: |
| 383 tracker_file_type = TrackerFileType.DOWNLOAD_COMPONENT |
| 384 download_name += ' component %d' % component_num |
| 385 |
| 386 tracker_file_name = GetTrackerFilePath(dst_url, tracker_file_type, |
| 387 api_selector, |
| 388 component_num=component_num) |
281 tracker_file = None | 389 tracker_file = None |
282 | |
283 # Check to see if we already have a matching tracker file. | 390 # Check to see if we already have a matching tracker file. |
284 try: | 391 try: |
285 tracker_file = open(tracker_file_name, 'r') | 392 tracker_file = open(tracker_file_name, 'r') |
286 etag_value = tracker_file.readline().rstrip('\n') | 393 if tracker_file_type is TrackerFileType.DOWNLOAD: |
287 if etag_value == src_obj_metadata.etag: | 394 etag_value = tracker_file.readline().rstrip('\n') |
288 return True | 395 if etag_value == src_obj_metadata.etag: |
289 except IOError as e: | 396 return tracker_file_name, existing_file_size |
| 397 elif tracker_file_type is TrackerFileType.DOWNLOAD_COMPONENT: |
| 398 component_data = json.loads(tracker_file.read()) |
| 399 if (component_data['etag'] == src_obj_metadata.etag and |
| 400 component_data['generation'] == src_obj_metadata.generation): |
| 401 return tracker_file_name, component_data['download_start_byte'] |
| 402 |
| 403 logger.warn('Tracker file doesn\'t match for download of %s. Restarting ' |
| 404 'download from scratch.' % download_name) |
| 405 |
| 406 except (IOError, ValueError) as e: |
290 # Ignore non-existent file (happens first time a download | 407 # Ignore non-existent file (happens first time a download |
291 # is attempted on an object), but warn user for other errors. | 408 # is attempted on an object), but warn user for other errors. |
292 if e.errno != errno.ENOENT: | 409 if isinstance(e, ValueError) or e.errno != errno.ENOENT: |
293 print('Couldn\'t read URL tracker file (%s): %s. Restarting ' | 410 logger.warn('Couldn\'t read download tracker file (%s): %s. Restarting ' |
294 'download from scratch.' % | 411 'download from scratch.' % (tracker_file_name, str(e))) |
295 (tracker_file_name, e.strerror)) | |
296 finally: | 412 finally: |
297 if tracker_file: | 413 if tracker_file: |
298 tracker_file.close() | 414 tracker_file.close() |
299 | 415 |
300 # Otherwise, create a new tracker file and start from scratch. | 416 # There wasn't a matching tracker file, so create one and then start the |
301 _WriteTrackerFile(tracker_file_name, '%s\n' % src_obj_metadata.etag) | 417 # download from scratch. |
| 418 if tracker_file_type is TrackerFileType.DOWNLOAD: |
| 419 _WriteTrackerFile(tracker_file_name, '%s\n' % src_obj_metadata.etag) |
| 420 elif tracker_file_type is TrackerFileType.DOWNLOAD_COMPONENT: |
| 421 WriteDownloadComponentTrackerFile(tracker_file_name, src_obj_metadata, |
| 422 start_byte) |
| 423 return tracker_file_name, start_byte |
| 424 |
| 425 |
| 426 def WriteDownloadComponentTrackerFile(tracker_file_name, src_obj_metadata, |
| 427 current_file_pos): |
| 428 """Updates or creates a download component tracker file on disk. |
| 429 |
| 430 Args: |
| 431 tracker_file_name: The name of the tracker file. |
| 432 src_obj_metadata: Metadata for the source object. Must include etag. |
| 433 current_file_pos: The current position in the file. |
| 434 """ |
| 435 component_data = {'etag': src_obj_metadata.etag, |
| 436 'generation': src_obj_metadata.generation, |
| 437 'download_start_byte': current_file_pos} |
| 438 |
| 439 _WriteTrackerFile(tracker_file_name, json.dumps(component_data)) |
302 | 440 |
303 | 441 |
304 def _WriteTrackerFile(tracker_file_name, data): | 442 def _WriteTrackerFile(tracker_file_name, data): |
305 """Creates a tracker file, storing the input data.""" | 443 """Creates a tracker file, storing the input data.""" |
306 try: | 444 try: |
307 with os.fdopen(os.open(tracker_file_name, | 445 with os.fdopen(os.open(tracker_file_name, |
308 os.O_WRONLY | os.O_CREAT, 0600), 'w') as tf: | 446 os.O_WRONLY | os.O_CREAT, 0600), 'w') as tf: |
309 tf.write(data) | 447 tf.write(data) |
310 return False | 448 return False |
311 except (IOError, OSError) as e: | 449 except (IOError, OSError) as e: |
312 raise RaiseUnwritableTrackerFileException(tracker_file_name, e.strerror) | 450 raise RaiseUnwritableTrackerFileException(tracker_file_name, e.strerror) |
313 | 451 |
314 | 452 |
315 def RaiseUnwritableTrackerFileException(tracker_file_name, error_str): | 453 def RaiseUnwritableTrackerFileException(tracker_file_name, error_str): |
316 """Raises an exception when unable to write the tracker file.""" | 454 """Raises an exception when unable to write the tracker file.""" |
317 raise CommandException(TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT % | 455 raise CommandException(TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT % |
318 (tracker_file_name, error_str)) | 456 (tracker_file_name, error_str)) |
OLD | NEW |