Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Side by Side Diff: third_party/gsutil/gslib/tracker_file.py

Issue 1380943003: Roll version of gsutil to 4.15. (Closed) Base URL: https://github.com/catapult-project/catapult.git@master
Patch Set: rebase Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 # Copyright 2015 Google Inc. All Rights Reserved. 2 # Copyright 2015 Google Inc. All Rights Reserved.
3 # 3 #
4 # Licensed under the Apache License, Version 2.0 (the "License"); 4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License. 5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at 6 # You may obtain a copy of the License at
7 # 7 #
8 # http://www.apache.org/licenses/LICENSE-2.0 8 # http://www.apache.org/licenses/LICENSE-2.0
9 # 9 #
10 # Unless required by applicable law or agreed to in writing, software 10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS, 11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and 13 # See the License for the specific language governing permissions and
14 # limitations under the License. 14 # limitations under the License.
15 """Helper functions for tracker file functionality.""" 15 """Helper functions for tracker file functionality."""
16 16
17 import errno 17 import errno
18 import hashlib 18 import hashlib
19 import json
19 import os 20 import os
20 import re 21 import re
21 22
22 from boto import config 23 from boto import config
23 from gslib.exception import CommandException 24 from gslib.exception import CommandException
24 from gslib.util import CreateDirIfNeeded 25 from gslib.util import CreateDirIfNeeded
25 from gslib.util import GetGsutilStateDir 26 from gslib.util import GetGsutilStateDir
26 from gslib.util import ResumableThreshold 27 from gslib.util import ResumableThreshold
27 from gslib.util import UTF8 28 from gslib.util import UTF8
28 29
29 # The maximum length of a file name can vary wildly between different 30 # The maximum length of a file name can vary wildly between different
30 # operating systems, so we always ensure that tracker files are less 31 # operating systems, so we always ensure that tracker files are less
31 # than 100 characters in order to avoid any such issues. 32 # than 100 characters in order to avoid any such issues.
32 MAX_TRACKER_FILE_NAME_LENGTH = 100 33 MAX_TRACKER_FILE_NAME_LENGTH = 100
33 34
34 35
35 TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT = ( 36 TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT = (
36 'Couldn\'t write tracker file (%s): %s. This can happen if gsutil is ' 37 'Couldn\'t write tracker file (%s): %s. This can happen if gsutil is '
37 'configured to save tracker files to an unwritable directory)') 38 'configured to save tracker files to an unwritable directory)')
38 39
39 40
40 class TrackerFileType(object): 41 class TrackerFileType(object):
41 UPLOAD = 'upload' 42 UPLOAD = 'upload'
42 DOWNLOAD = 'download' 43 DOWNLOAD = 'download'
44 DOWNLOAD_COMPONENT = 'download_component'
43 PARALLEL_UPLOAD = 'parallel_upload' 45 PARALLEL_UPLOAD = 'parallel_upload'
46 SLICED_DOWNLOAD = 'sliced_download'
44 REWRITE = 'rewrite' 47 REWRITE = 'rewrite'
45 48
46 49
47 def _HashFilename(filename): 50 def _HashFilename(filename):
48 """Apply a hash function (SHA1) to shorten the passed file name. 51 """Apply a hash function (SHA1) to shorten the passed file name.
49 52
50 The spec for the hashed file name is as follows: 53 The spec for the hashed file name is as follows:
51 54
52 TRACKER_<hash>_<trailing> 55 TRACKER_<hash>_<trailing>
53 56
(...skipping 49 matching lines...) Expand 10 before | Expand all | Expand 10 after
103 # Encode the src and dest bucket and object names into the tracker file 106 # Encode the src and dest bucket and object names into the tracker file
104 # name. 107 # name.
105 res_tracker_file_name = ( 108 res_tracker_file_name = (
106 re.sub('[/\\\\]', '_', 'rewrite__%s__%s__%s__%s__%s.token' % 109 re.sub('[/\\\\]', '_', 'rewrite__%s__%s__%s__%s__%s.token' %
107 (src_bucket_name, src_obj_name, dst_bucket_name, 110 (src_bucket_name, src_obj_name, dst_bucket_name,
108 dst_obj_name, api_selector))) 111 dst_obj_name, api_selector)))
109 112
110 return _HashAndReturnPath(res_tracker_file_name, TrackerFileType.REWRITE) 113 return _HashAndReturnPath(res_tracker_file_name, TrackerFileType.REWRITE)
111 114
112 115
113 def GetTrackerFilePath(dst_url, tracker_file_type, api_selector, src_url=None): 116 def GetTrackerFilePath(dst_url, tracker_file_type, api_selector, src_url=None,
117 component_num=None):
114 """Gets the tracker file name described by the arguments. 118 """Gets the tracker file name described by the arguments.
115 119
116 Args: 120 Args:
117 dst_url: Destination URL for tracker file. 121 dst_url: Destination URL for tracker file.
118 tracker_file_type: TrackerFileType for this operation. 122 tracker_file_type: TrackerFileType for this operation.
119 api_selector: API to use for this operation. 123 api_selector: API to use for this operation.
120 src_url: Source URL for the source file name for parallel uploads. 124 src_url: Source URL for the source file name for parallel uploads.
125 component_num: Component number if this is a download component, else None.
121 126
122 Returns: 127 Returns:
123 File path to tracker file. 128 File path to tracker file.
124 """ 129 """
125 if tracker_file_type == TrackerFileType.UPLOAD: 130 if tracker_file_type == TrackerFileType.UPLOAD:
126 # Encode the dest bucket and object name into the tracker file name. 131 # Encode the dest bucket and object name into the tracker file name.
127 res_tracker_file_name = ( 132 res_tracker_file_name = (
128 re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s__%s.url' % 133 re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s__%s.url' %
129 (dst_url.bucket_name, dst_url.object_name, api_selector))) 134 (dst_url.bucket_name, dst_url.object_name, api_selector)))
130 elif tracker_file_type == TrackerFileType.DOWNLOAD: 135 elif tracker_file_type == TrackerFileType.DOWNLOAD:
131 # Encode the fully-qualified dest file name into the tracker file name. 136 # Encode the fully-qualified dest file name into the tracker file name.
132 res_tracker_file_name = ( 137 res_tracker_file_name = (
133 re.sub('[/\\\\]', '_', 'resumable_download__%s__%s.etag' % 138 re.sub('[/\\\\]', '_', 'resumable_download__%s__%s.etag' %
134 (os.path.realpath(dst_url.object_name), api_selector))) 139 (os.path.realpath(dst_url.object_name), api_selector)))
140 elif tracker_file_type == TrackerFileType.DOWNLOAD_COMPONENT:
141 # Encode the fully-qualified dest file name and the component number
142 # into the tracker file name.
143 res_tracker_file_name = (
144 re.sub('[/\\\\]', '_', 'resumable_download__%s__%s__%d.etag' %
145 (os.path.realpath(dst_url.object_name), api_selector,
146 component_num)))
135 elif tracker_file_type == TrackerFileType.PARALLEL_UPLOAD: 147 elif tracker_file_type == TrackerFileType.PARALLEL_UPLOAD:
136 # Encode the dest bucket and object names as well as the source file name 148 # Encode the dest bucket and object names as well as the source file name
137 # into the tracker file name. 149 # into the tracker file name.
138 res_tracker_file_name = ( 150 res_tracker_file_name = (
139 re.sub('[/\\\\]', '_', 'parallel_upload__%s__%s__%s__%s.url' % 151 re.sub('[/\\\\]', '_', 'parallel_upload__%s__%s__%s__%s.url' %
140 (dst_url.bucket_name, dst_url.object_name, 152 (dst_url.bucket_name, dst_url.object_name,
141 src_url, api_selector))) 153 src_url, api_selector)))
154 elif tracker_file_type == TrackerFileType.SLICED_DOWNLOAD:
155 # Encode the fully-qualified dest file name into the tracker file name.
156 res_tracker_file_name = (
157 re.sub('[/\\\\]', '_', 'sliced_download__%s__%s.etag' %
158 (os.path.realpath(dst_url.object_name), api_selector)))
142 elif tracker_file_type == TrackerFileType.REWRITE: 159 elif tracker_file_type == TrackerFileType.REWRITE:
143 # Should use GetRewriteTrackerFilePath instead. 160 # Should use GetRewriteTrackerFilePath instead.
144 raise NotImplementedError() 161 raise NotImplementedError()
145 162
146 return _HashAndReturnPath(res_tracker_file_name, tracker_file_type) 163 return _HashAndReturnPath(res_tracker_file_name, tracker_file_type)
147 164
148 165
166 def DeleteDownloadTrackerFiles(dst_url, api_selector):
167 """Deletes all tracker files corresponding to an object download.
168
169 Args:
170 dst_url: StorageUrl describing the destination file.
171 api_selector: The Cloud API implementation used.
172 """
173 # Delete non-sliced download tracker file.
174 DeleteTrackerFile(GetTrackerFilePath(dst_url, TrackerFileType.DOWNLOAD,
175 api_selector))
176
177 # Delete all sliced download tracker files.
178 tracker_files = GetSlicedDownloadTrackerFilePaths(dst_url, api_selector)
179 for tracker_file in tracker_files:
180 DeleteTrackerFile(tracker_file)
181
182
183 def GetSlicedDownloadTrackerFilePaths(dst_url, api_selector,
184 num_components=None):
185 """Gets a list of sliced download tracker file paths.
186
187 The list consists of the parent tracker file path in index 0, and then
188 any existing component tracker files in [1:].
189
190 Args:
191 dst_url: Destination URL for tracker file.
192 api_selector: API to use for this operation.
193 num_components: The number of component tracker files, if already known.
194 If not known, the number will be retrieved from the parent
195 tracker file on disk.
196 Returns:
197 File path to tracker file.
198 """
199 parallel_tracker_file_path = GetTrackerFilePath(
200 dst_url, TrackerFileType.SLICED_DOWNLOAD, api_selector)
201 tracker_file_paths = [parallel_tracker_file_path]
202
203 # If we don't know the number of components, check the tracker file.
204 if num_components is None:
205 tracker_file = None
206 try:
207 tracker_file = open(parallel_tracker_file_path, 'r')
208 num_components = json.load(tracker_file)['num_components']
209 except (IOError, ValueError):
210 return tracker_file_paths
211 finally:
212 if tracker_file:
213 tracker_file.close()
214
215 for i in range(num_components):
216 tracker_file_paths.append(GetTrackerFilePath(
217 dst_url, TrackerFileType.DOWNLOAD_COMPONENT, api_selector,
218 component_num=i))
219
220 return tracker_file_paths
221
222
149 def _HashAndReturnPath(res_tracker_file_name, tracker_file_type): 223 def _HashAndReturnPath(res_tracker_file_name, tracker_file_type):
224 """Hashes and returns a tracker file path.
225
226 Args:
227 res_tracker_file_name: The tracker file name prior to it being hashed.
228 tracker_file_type: The TrackerFileType of res_tracker_file_name.
229
230 Returns:
231 Final (hashed) tracker file path.
232 """
150 resumable_tracker_dir = CreateTrackerDirIfNeeded() 233 resumable_tracker_dir = CreateTrackerDirIfNeeded()
151 hashed_tracker_file_name = _HashFilename(res_tracker_file_name) 234 hashed_tracker_file_name = _HashFilename(res_tracker_file_name)
152 tracker_file_name = '%s_%s' % (str(tracker_file_type).lower(), 235 tracker_file_name = '%s_%s' % (str(tracker_file_type).lower(),
153 hashed_tracker_file_name) 236 hashed_tracker_file_name)
154 tracker_file_path = '%s%s%s' % (resumable_tracker_dir, os.sep, 237 tracker_file_path = '%s%s%s' % (resumable_tracker_dir, os.sep,
155 tracker_file_name) 238 tracker_file_name)
156 assert len(tracker_file_name) < MAX_TRACKER_FILE_NAME_LENGTH 239 assert len(tracker_file_name) < MAX_TRACKER_FILE_NAME_LENGTH
157 return tracker_file_path 240 return tracker_file_path
158 241
159 242
(...skipping 89 matching lines...) Expand 10 before | Expand all | Expand 10 after
249 Args: 332 Args:
250 tracker_file_name: Tracker file path string. 333 tracker_file_name: Tracker file path string.
251 rewrite_params_hash: MD5 hex digest of rewrite call parameters constructed 334 rewrite_params_hash: MD5 hex digest of rewrite call parameters constructed
252 by HashRewriteParameters. 335 by HashRewriteParameters.
253 rewrite_token: Rewrite token string returned by the service. 336 rewrite_token: Rewrite token string returned by the service.
254 """ 337 """
255 _WriteTrackerFile(tracker_file_name, '%s\n%s\n' % (rewrite_params_hash, 338 _WriteTrackerFile(tracker_file_name, '%s\n%s\n' % (rewrite_params_hash,
256 rewrite_token)) 339 rewrite_token))
257 340
258 341
259 def ReadOrCreateDownloadTrackerFile(src_obj_metadata, dst_url, 342 def ReadOrCreateDownloadTrackerFile(src_obj_metadata, dst_url, logger,
260 api_selector): 343 api_selector, start_byte,
344 existing_file_size, component_num=None):
261 """Checks for a download tracker file and creates one if it does not exist. 345 """Checks for a download tracker file and creates one if it does not exist.
262 346
347 The methodology for determining the download start point differs between
348 normal and sliced downloads. For normal downloads, the existing bytes in
349 the file are presumed to be correct and have been previously downloaded from
350 the server (if a tracker file exists). In this case, the existing file size
351 is used to determine the download start point. For sliced downloads, the
352 number of bytes previously retrieved from the server cannot be determined
353 from the existing file size, and so the number of bytes known to have been
354 previously downloaded is retrieved from the tracker file.
355
263 Args: 356 Args:
264 src_obj_metadata: Metadata for the source object. Must include 357 src_obj_metadata: Metadata for the source object. Must include etag and
265 etag and size. 358 generation.
266 dst_url: Destination file StorageUrl. 359 dst_url: Destination URL for tracker file.
267 api_selector: API mode to use (for tracker file naming). 360 logger: For outputting log messages.
361 api_selector: API to use for this operation.
362 start_byte: The start byte of the byte range for this download.
363 existing_file_size: Size of existing file for this download on disk.
364 component_num: The component number, if this is a component of a parallel
365 download, else None.
268 366
269 Returns: 367 Returns:
270 True if the tracker file already exists (resume existing download), 368 tracker_file_name: The name of the tracker file, if one was used.
271 False if we created a new tracker file (new download). 369 download_start_byte: The first byte that still needs to be downloaded.
272 """ 370 """
371 assert src_obj_metadata.etag
372
373 tracker_file_name = None
273 if src_obj_metadata.size < ResumableThreshold(): 374 if src_obj_metadata.size < ResumableThreshold():
274 # Don't create a tracker file for a small downloads; cross-process resumes 375 # Don't create a tracker file for a small downloads; cross-process resumes
275 # won't work, but restarting a small download is inexpensive. 376 # won't work, but restarting a small download is inexpensive.
276 return False 377 return tracker_file_name, start_byte
277 378
278 assert src_obj_metadata.etag 379 download_name = dst_url.object_name
279 tracker_file_name = GetTrackerFilePath( 380 if component_num is None:
280 dst_url, TrackerFileType.DOWNLOAD, api_selector) 381 tracker_file_type = TrackerFileType.DOWNLOAD
382 else:
383 tracker_file_type = TrackerFileType.DOWNLOAD_COMPONENT
384 download_name += ' component %d' % component_num
385
386 tracker_file_name = GetTrackerFilePath(dst_url, tracker_file_type,
387 api_selector,
388 component_num=component_num)
281 tracker_file = None 389 tracker_file = None
282
283 # Check to see if we already have a matching tracker file. 390 # Check to see if we already have a matching tracker file.
284 try: 391 try:
285 tracker_file = open(tracker_file_name, 'r') 392 tracker_file = open(tracker_file_name, 'r')
286 etag_value = tracker_file.readline().rstrip('\n') 393 if tracker_file_type is TrackerFileType.DOWNLOAD:
287 if etag_value == src_obj_metadata.etag: 394 etag_value = tracker_file.readline().rstrip('\n')
288 return True 395 if etag_value == src_obj_metadata.etag:
289 except IOError as e: 396 return tracker_file_name, existing_file_size
397 elif tracker_file_type is TrackerFileType.DOWNLOAD_COMPONENT:
398 component_data = json.loads(tracker_file.read())
399 if (component_data['etag'] == src_obj_metadata.etag and
400 component_data['generation'] == src_obj_metadata.generation):
401 return tracker_file_name, component_data['download_start_byte']
402
403 logger.warn('Tracker file doesn\'t match for download of %s. Restarting '
404 'download from scratch.' % download_name)
405
406 except (IOError, ValueError) as e:
290 # Ignore non-existent file (happens first time a download 407 # Ignore non-existent file (happens first time a download
291 # is attempted on an object), but warn user for other errors. 408 # is attempted on an object), but warn user for other errors.
292 if e.errno != errno.ENOENT: 409 if isinstance(e, ValueError) or e.errno != errno.ENOENT:
293 print('Couldn\'t read URL tracker file (%s): %s. Restarting ' 410 logger.warn('Couldn\'t read download tracker file (%s): %s. Restarting '
294 'download from scratch.' % 411 'download from scratch.' % (tracker_file_name, str(e)))
295 (tracker_file_name, e.strerror))
296 finally: 412 finally:
297 if tracker_file: 413 if tracker_file:
298 tracker_file.close() 414 tracker_file.close()
299 415
300 # Otherwise, create a new tracker file and start from scratch. 416 # There wasn't a matching tracker file, so create one and then start the
301 _WriteTrackerFile(tracker_file_name, '%s\n' % src_obj_metadata.etag) 417 # download from scratch.
418 if tracker_file_type is TrackerFileType.DOWNLOAD:
419 _WriteTrackerFile(tracker_file_name, '%s\n' % src_obj_metadata.etag)
420 elif tracker_file_type is TrackerFileType.DOWNLOAD_COMPONENT:
421 WriteDownloadComponentTrackerFile(tracker_file_name, src_obj_metadata,
422 start_byte)
423 return tracker_file_name, start_byte
424
425
426 def WriteDownloadComponentTrackerFile(tracker_file_name, src_obj_metadata,
427 current_file_pos):
428 """Updates or creates a download component tracker file on disk.
429
430 Args:
431 tracker_file_name: The name of the tracker file.
432 src_obj_metadata: Metadata for the source object. Must include etag.
433 current_file_pos: The current position in the file.
434 """
435 component_data = {'etag': src_obj_metadata.etag,
436 'generation': src_obj_metadata.generation,
437 'download_start_byte': current_file_pos}
438
439 _WriteTrackerFile(tracker_file_name, json.dumps(component_data))
302 440
303 441
304 def _WriteTrackerFile(tracker_file_name, data): 442 def _WriteTrackerFile(tracker_file_name, data):
305 """Creates a tracker file, storing the input data.""" 443 """Creates a tracker file, storing the input data."""
306 try: 444 try:
307 with os.fdopen(os.open(tracker_file_name, 445 with os.fdopen(os.open(tracker_file_name,
308 os.O_WRONLY | os.O_CREAT, 0600), 'w') as tf: 446 os.O_WRONLY | os.O_CREAT, 0600), 'w') as tf:
309 tf.write(data) 447 tf.write(data)
310 return False 448 return False
311 except (IOError, OSError) as e: 449 except (IOError, OSError) as e:
312 raise RaiseUnwritableTrackerFileException(tracker_file_name, e.strerror) 450 raise RaiseUnwritableTrackerFileException(tracker_file_name, e.strerror)
313 451
314 452
315 def RaiseUnwritableTrackerFileException(tracker_file_name, error_str): 453 def RaiseUnwritableTrackerFileException(tracker_file_name, error_str):
316 """Raises an exception when unable to write the tracker file.""" 454 """Raises an exception when unable to write the tracker file."""
317 raise CommandException(TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT % 455 raise CommandException(TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT %
318 (tracker_file_name, error_str)) 456 (tracker_file_name, error_str))
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698