OLD | NEW |
| (Empty) |
1 # -*- coding: utf-8 -*- | |
2 # Copyright 2015 Google Inc. All Rights Reserved. | |
3 # | |
4 # Licensed under the Apache License, Version 2.0 (the "License"); | |
5 # you may not use this file except in compliance with the License. | |
6 # You may obtain a copy of the License at | |
7 # | |
8 # http://www.apache.org/licenses/LICENSE-2.0 | |
9 # | |
10 # Unless required by applicable law or agreed to in writing, software | |
11 # distributed under the License is distributed on an "AS IS" BASIS, | |
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 # See the License for the specific language governing permissions and | |
14 # limitations under the License. | |
15 """Helper functions for tracker file functionality.""" | |
16 | |
17 import errno | |
18 import hashlib | |
19 import os | |
20 import re | |
21 | |
22 from boto import config | |
23 from gslib.exception import CommandException | |
24 from gslib.util import CreateDirIfNeeded | |
25 from gslib.util import GetGsutilStateDir | |
26 from gslib.util import ResumableThreshold | |
27 from gslib.util import UTF8 | |
28 | |
29 # The maximum length of a file name can vary wildly between different | |
30 # operating systems, so we always ensure that tracker files are less | |
31 # than 100 characters in order to avoid any such issues. | |
32 MAX_TRACKER_FILE_NAME_LENGTH = 100 | |
33 | |
34 | |
35 TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT = ( | |
36 'Couldn\'t write tracker file (%s): %s. This can happen if gsutil is ' | |
37 'configured to save tracker files to an unwritable directory)') | |
38 | |
39 | |
40 class TrackerFileType(object): | |
41 UPLOAD = 'upload' | |
42 DOWNLOAD = 'download' | |
43 PARALLEL_UPLOAD = 'parallel_upload' | |
44 REWRITE = 'rewrite' | |
45 | |
46 | |
47 def _HashFilename(filename): | |
48 """Apply a hash function (SHA1) to shorten the passed file name. | |
49 | |
50 The spec for the hashed file name is as follows: | |
51 | |
52 TRACKER_<hash>_<trailing> | |
53 | |
54 where hash is a SHA1 hash on the original file name and trailing is | |
55 the last 16 chars from the original file name. Max file name lengths | |
56 vary by operating system so the goal of this function is to ensure | |
57 the hashed version takes fewer than 100 characters. | |
58 | |
59 Args: | |
60 filename: file name to be hashed. | |
61 | |
62 Returns: | |
63 shorter, hashed version of passed file name | |
64 """ | |
65 if isinstance(filename, unicode): | |
66 filename = filename.encode(UTF8) | |
67 else: | |
68 filename = unicode(filename, UTF8).encode(UTF8) | |
69 m = hashlib.sha1(filename) | |
70 return 'TRACKER_' + m.hexdigest() + '.' + filename[-16:] | |
71 | |
72 | |
73 def CreateTrackerDirIfNeeded(): | |
74 """Looks up or creates the gsutil tracker file directory. | |
75 | |
76 This is the configured directory where gsutil keeps its resumable transfer | |
77 tracker files. This function creates it if it doesn't already exist. | |
78 | |
79 Returns: | |
80 The pathname to the tracker directory. | |
81 """ | |
82 tracker_dir = config.get( | |
83 'GSUtil', 'resumable_tracker_dir', | |
84 os.path.join(GetGsutilStateDir(), 'tracker-files')) | |
85 CreateDirIfNeeded(tracker_dir) | |
86 return tracker_dir | |
87 | |
88 | |
89 def GetRewriteTrackerFilePath(src_bucket_name, src_obj_name, dst_bucket_name, | |
90 dst_obj_name, api_selector): | |
91 """Gets the tracker file name described by the arguments. | |
92 | |
93 Args: | |
94 src_bucket_name: Source bucket (string). | |
95 src_obj_name: Source object (string). | |
96 dst_bucket_name: Destination bucket (string). | |
97 dst_obj_name: Destination object (string) | |
98 api_selector: API to use for this operation. | |
99 | |
100 Returns: | |
101 File path to tracker file. | |
102 """ | |
103 # Encode the src and dest bucket and object names into the tracker file | |
104 # name. | |
105 res_tracker_file_name = ( | |
106 re.sub('[/\\\\]', '_', 'rewrite__%s__%s__%s__%s__%s.token' % | |
107 (src_bucket_name, src_obj_name, dst_bucket_name, | |
108 dst_obj_name, api_selector))) | |
109 | |
110 return _HashAndReturnPath(res_tracker_file_name, TrackerFileType.REWRITE) | |
111 | |
112 | |
113 def GetTrackerFilePath(dst_url, tracker_file_type, api_selector, src_url=None): | |
114 """Gets the tracker file name described by the arguments. | |
115 | |
116 Args: | |
117 dst_url: Destination URL for tracker file. | |
118 tracker_file_type: TrackerFileType for this operation. | |
119 api_selector: API to use for this operation. | |
120 src_url: Source URL for the source file name for parallel uploads. | |
121 | |
122 Returns: | |
123 File path to tracker file. | |
124 """ | |
125 if tracker_file_type == TrackerFileType.UPLOAD: | |
126 # Encode the dest bucket and object name into the tracker file name. | |
127 res_tracker_file_name = ( | |
128 re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s__%s.url' % | |
129 (dst_url.bucket_name, dst_url.object_name, api_selector))) | |
130 elif tracker_file_type == TrackerFileType.DOWNLOAD: | |
131 # Encode the fully-qualified dest file name into the tracker file name. | |
132 res_tracker_file_name = ( | |
133 re.sub('[/\\\\]', '_', 'resumable_download__%s__%s.etag' % | |
134 (os.path.realpath(dst_url.object_name), api_selector))) | |
135 elif tracker_file_type == TrackerFileType.PARALLEL_UPLOAD: | |
136 # Encode the dest bucket and object names as well as the source file name | |
137 # into the tracker file name. | |
138 res_tracker_file_name = ( | |
139 re.sub('[/\\\\]', '_', 'parallel_upload__%s__%s__%s__%s.url' % | |
140 (dst_url.bucket_name, dst_url.object_name, | |
141 src_url, api_selector))) | |
142 elif tracker_file_type == TrackerFileType.REWRITE: | |
143 # Should use GetRewriteTrackerFilePath instead. | |
144 raise NotImplementedError() | |
145 | |
146 return _HashAndReturnPath(res_tracker_file_name, tracker_file_type) | |
147 | |
148 | |
149 def _HashAndReturnPath(res_tracker_file_name, tracker_file_type): | |
150 resumable_tracker_dir = CreateTrackerDirIfNeeded() | |
151 hashed_tracker_file_name = _HashFilename(res_tracker_file_name) | |
152 tracker_file_name = '%s_%s' % (str(tracker_file_type).lower(), | |
153 hashed_tracker_file_name) | |
154 tracker_file_path = '%s%s%s' % (resumable_tracker_dir, os.sep, | |
155 tracker_file_name) | |
156 assert len(tracker_file_name) < MAX_TRACKER_FILE_NAME_LENGTH | |
157 return tracker_file_path | |
158 | |
159 | |
160 def DeleteTrackerFile(tracker_file_name): | |
161 if tracker_file_name and os.path.exists(tracker_file_name): | |
162 os.unlink(tracker_file_name) | |
163 | |
164 | |
165 def HashRewriteParameters( | |
166 src_obj_metadata, dst_obj_metadata, projection, src_generation=None, | |
167 gen_match=None, meta_gen_match=None, canned_acl=None, fields=None, | |
168 max_bytes_per_call=None): | |
169 """Creates an MD5 hex digest of the parameters for a rewrite call. | |
170 | |
171 Resuming rewrites requires that the input parameters are identical. Thus, | |
172 the rewrite tracker file needs to represent the input parameters. For | |
173 easy comparison, hash the input values. If a user does a performs a | |
174 same-source/same-destination rewrite via a different command (for example, | |
175 with a changed ACL), the hashes will not match and we will restart the | |
176 rewrite from the beginning. | |
177 | |
178 Args: | |
179 src_obj_metadata: apitools Object describing source object. Must include | |
180 bucket, name, and etag. | |
181 dst_obj_metadata: apitools Object describing destination object. Must | |
182 include bucket and object name | |
183 projection: Projection used for the API call. | |
184 src_generation: Optional source generation. | |
185 gen_match: Optional generation precondition. | |
186 meta_gen_match: Optional metageneration precondition. | |
187 canned_acl: Optional canned ACL string. | |
188 fields: Optional fields to include in response. | |
189 max_bytes_per_call: Optional maximum bytes rewritten per call. | |
190 | |
191 Returns: | |
192 MD5 hex digest Hash of the input parameters, or None if required parameters | |
193 are missing. | |
194 """ | |
195 if (not src_obj_metadata or | |
196 not src_obj_metadata.bucket or | |
197 not src_obj_metadata.name or | |
198 not src_obj_metadata.etag or | |
199 not dst_obj_metadata or | |
200 not dst_obj_metadata.bucket or | |
201 not dst_obj_metadata.name or | |
202 not projection): | |
203 return | |
204 md5_hash = hashlib.md5() | |
205 for input_param in ( | |
206 src_obj_metadata, dst_obj_metadata, projection, src_generation, | |
207 gen_match, meta_gen_match, canned_acl, fields, max_bytes_per_call): | |
208 md5_hash.update(str(input_param)) | |
209 return md5_hash.hexdigest() | |
210 | |
211 | |
212 def ReadRewriteTrackerFile(tracker_file_name, rewrite_params_hash): | |
213 """Attempts to read a rewrite tracker file. | |
214 | |
215 Args: | |
216 tracker_file_name: Tracker file path string. | |
217 rewrite_params_hash: MD5 hex digest of rewrite call parameters constructed | |
218 by HashRewriteParameters. | |
219 | |
220 Returns: | |
221 String rewrite_token for resuming rewrite requests if a matching tracker | |
222 file exists, None otherwise (which will result in starting a new rewrite). | |
223 """ | |
224 # Check to see if we already have a matching tracker file. | |
225 tracker_file = None | |
226 if not rewrite_params_hash: | |
227 return | |
228 try: | |
229 tracker_file = open(tracker_file_name, 'r') | |
230 existing_hash = tracker_file.readline().rstrip('\n') | |
231 if existing_hash == rewrite_params_hash: | |
232 # Next line is the rewrite token. | |
233 return tracker_file.readline().rstrip('\n') | |
234 except IOError as e: | |
235 # Ignore non-existent file (happens first time a rewrite is attempted. | |
236 if e.errno != errno.ENOENT: | |
237 print('Couldn\'t read Copy tracker file (%s): %s. Restarting copy ' | |
238 'from scratch.' % | |
239 (tracker_file_name, e.strerror)) | |
240 finally: | |
241 if tracker_file: | |
242 tracker_file.close() | |
243 | |
244 | |
245 def WriteRewriteTrackerFile(tracker_file_name, rewrite_params_hash, | |
246 rewrite_token): | |
247 """Writes a rewrite tracker file. | |
248 | |
249 Args: | |
250 tracker_file_name: Tracker file path string. | |
251 rewrite_params_hash: MD5 hex digest of rewrite call parameters constructed | |
252 by HashRewriteParameters. | |
253 rewrite_token: Rewrite token string returned by the service. | |
254 """ | |
255 _WriteTrackerFile(tracker_file_name, '%s\n%s\n' % (rewrite_params_hash, | |
256 rewrite_token)) | |
257 | |
258 | |
259 def ReadOrCreateDownloadTrackerFile(src_obj_metadata, dst_url, | |
260 api_selector): | |
261 """Checks for a download tracker file and creates one if it does not exist. | |
262 | |
263 Args: | |
264 src_obj_metadata: Metadata for the source object. Must include | |
265 etag and size. | |
266 dst_url: Destination file StorageUrl. | |
267 api_selector: API mode to use (for tracker file naming). | |
268 | |
269 Returns: | |
270 True if the tracker file already exists (resume existing download), | |
271 False if we created a new tracker file (new download). | |
272 """ | |
273 if src_obj_metadata.size < ResumableThreshold(): | |
274 # Don't create a tracker file for a small downloads; cross-process resumes | |
275 # won't work, but restarting a small download is inexpensive. | |
276 return False | |
277 | |
278 assert src_obj_metadata.etag | |
279 tracker_file_name = GetTrackerFilePath( | |
280 dst_url, TrackerFileType.DOWNLOAD, api_selector) | |
281 tracker_file = None | |
282 | |
283 # Check to see if we already have a matching tracker file. | |
284 try: | |
285 tracker_file = open(tracker_file_name, 'r') | |
286 etag_value = tracker_file.readline().rstrip('\n') | |
287 if etag_value == src_obj_metadata.etag: | |
288 return True | |
289 except IOError as e: | |
290 # Ignore non-existent file (happens first time a download | |
291 # is attempted on an object), but warn user for other errors. | |
292 if e.errno != errno.ENOENT: | |
293 print('Couldn\'t read URL tracker file (%s): %s. Restarting ' | |
294 'download from scratch.' % | |
295 (tracker_file_name, e.strerror)) | |
296 finally: | |
297 if tracker_file: | |
298 tracker_file.close() | |
299 | |
300 # Otherwise, create a new tracker file and start from scratch. | |
301 _WriteTrackerFile(tracker_file_name, '%s\n' % src_obj_metadata.etag) | |
302 | |
303 | |
304 def _WriteTrackerFile(tracker_file_name, data): | |
305 """Creates a tracker file, storing the input data.""" | |
306 try: | |
307 with os.fdopen(os.open(tracker_file_name, | |
308 os.O_WRONLY | os.O_CREAT, 0600), 'w') as tf: | |
309 tf.write(data) | |
310 return False | |
311 except (IOError, OSError) as e: | |
312 raise RaiseUnwritableTrackerFileException(tracker_file_name, e.strerror) | |
313 | |
314 | |
315 def RaiseUnwritableTrackerFileException(tracker_file_name, error_str): | |
316 """Raises an exception when unable to write the tracker file.""" | |
317 raise CommandException(TRACKER_FILE_UNWRITABLE_EXCEPTION_TEXT % | |
318 (tracker_file_name, error_str)) | |
OLD | NEW |