OLD | NEW |
| (Empty) |
1 # Copyright 2010 Google Inc. All Rights Reserved. | |
2 # | |
3 # Permission is hereby granted, free of charge, to any person obtaining a | |
4 # copy of this software and associated documentation files (the | |
5 # "Software"), to deal in the Software without restriction, including | |
6 # without limitation the rights to use, copy, modify, merge, publish, dis- | |
7 # tribute, sublicense, and/or sell copies of the Software, and to permit | |
8 # persons to whom the Software is furnished to do so, subject to the fol- | |
9 # lowing conditions: | |
10 # | |
11 # The above copyright notice and this permission notice shall be included | |
12 # in all copies or substantial portions of the Software. | |
13 # | |
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | |
15 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- | |
16 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT | |
17 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
18 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | |
20 # IN THE SOFTWARE. | |
21 | |
22 """Implementation of wildcarding over StorageUris. | |
23 | |
24 StorageUri is an abstraction that Google introduced in the boto library, | |
25 for representing storage provider-independent bucket and object names with | |
26 a shorthand URI-like syntax (see boto/boto/storage_uri.py). The current | |
27 class provides wildcarding support for StorageUri objects (including both | |
28 bucket and file system objects), allowing one to express collections of | |
29 objects with syntax like the following: | |
30 gs://mybucket/images/*.png | |
31 file:///tmp/???abc??? | |
32 | |
33 We provide wildcarding support as part of gsutil rather than as part | |
34 of boto because wildcarding is really part of shell command-like | |
35 functionality. | |
36 | |
37 A comment about wildcard semantics: We support both single path component | |
38 wildcards (e.g., using '*') and recursive wildcards (using '**'), for both | |
39 file and cloud URIs. For example, | |
40 gs://bucket/doc/*/*.html | |
41 would enumerate HTML files one directory down from gs://bucket/doc, while | |
42 gs://bucket/**/*.html | |
43 would enumerate all HTML files contained anywhere in the bucket. | |
44 | |
45 Note also that if you use file system wildcards it's likely your shell | |
46 interprets the wildcarding before passing the command to gsutil. For example: | |
47 % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse | |
48 would likely be expanded by the shell into the following before running gsutil: | |
49 % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse | |
50 | |
51 Note also that most shells don't support '**' wildcarding (I think only | |
52 zsh does). If you want to use '**' wildcarding with such a shell you can | |
53 single quote each wildcarded string, so it gets passed uninterpreted by the | |
54 shell to gsutil (at which point gsutil will perform the wildcarding expansion): | |
55 % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse | |
56 """ | |
57 | |
58 import boto | |
59 import fnmatch | |
60 import glob | |
61 import os | |
62 import re | |
63 import sys | |
64 import urllib | |
65 | |
66 from boto.s3.prefix import Prefix | |
67 from boto.storage_uri import BucketStorageUri | |
68 from bucket_listing_ref import BucketListingRef | |
69 | |
# Regex to determine if a string contains any wildcards: it matches the first
# '*', '?', '[' or ']' character. Written as a raw string so the backslashes
# reach the regex engine verbatim instead of relying on Python passing
# unrecognized string escapes through unchanged.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')

# Identifiers passed to the project ID handler to tell it which kind of
# wildcard expansion (object listing vs. bucket listing) is being performed.
WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'
WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'
75 | |
76 | |
class WildcardIterator(object):
    """Common parent of the iterators that expand wildcarded StorageUris.

    The class itself carries no iteration logic; it only supplies the shared
    string representation. Concrete iterators are expected to store the URI
    being expanded in self.wildcard_uri.

    Do not instantiate this class directly: call the wildcard_iterator()
    factory function, which picks the implementation that fits the
    StorageUri.
    """

    def __repr__(self):
        """Returns 'WildcardIterator(<uri>)' for the URI being expanded."""
        uri = self.wildcard_uri
        return 'WildcardIterator(%s)' % uri
91 | |
92 | |
class CloudWildcardIterator(WildcardIterator):
    """WildcardIterator subclass for buckets and objects.

    Iterates over BucketListingRef matching the StorageUri wildcard. It's
    much more efficient to request the Key from the BucketListingRef (via
    GetKey()) than to request the StorageUri and then call uri.get_key()
    to retrieve the key, for cases where you want to get metadata that's
    available in the Bucket (for example to get the name and size of
    each object), because that information is available in the bucket GET
    results. If you were to iterate over URIs for such cases and then get
    the name and size info from each resulting StorageUri, it would cause
    an additional object GET request for each of the result URIs.
    """

    def __init__(self, wildcard_uri, proj_id_handler,
                 bucket_storage_uri_class=BucketStorageUri,
                 all_versions=False, headers=None, debug=0):
        """Instantiates an iterator over BucketListingRef matching a wildcard.

        Args:
          wildcard_uri: StorageUri that contains the wildcard to iterate.
          proj_id_handler: ProjectIdHandler to use for current command.
          bucket_storage_uri_class: BucketStorageUri interface.
              Settable for testing/mocking.
          all_versions: Passed through to the bucket listing request. When
              true, each matched key is yielded with the URI built from that
              key; otherwise it is yielded with a version-less URI.
          headers: Dictionary containing optional HTTP headers to pass to
              boto.
          debug: Debug level to pass in to boto connection (range 0..3).
        """
        self.wildcard_uri = wildcard_uri
        # Make a copy of the headers so any updates we make during wildcard
        # expansion aren't left in the input params (specifically, so we
        # don't include the x-goog-project-id header needed by a subset of
        # cases, in the data returned to caller, which could then be used in
        # other cases where that header must not be passed).
        self.headers = {} if headers is None else headers.copy()
        self.proj_id_handler = proj_id_handler
        self.bucket_storage_uri_class = bucket_storage_uri_class
        self.all_versions = all_versions
        self.debug = debug

    def __iter__(self):
        """Python iterator that gets called when iterating over cloud wildcard.

        Yields:
          BucketListingRef, or empty iterator if no matches.
        """
        # First handle bucket wildcarding, if any.
        bucket_uris = self._ExpandBucketUris()

        # Now iterate over bucket(s), and handle object wildcarding, if any.
        self.proj_id_handler.FillInProjectHeaderIfNeeded(
            WILDCARD_OBJECT_ITERATOR, self.wildcard_uri, self.headers)
        for bucket_uri in bucket_uris:
            if self.wildcard_uri.names_bucket():
                # Bucket-only URI.
                yield BucketListingRef(bucket_uri, key=None, prefix=None,
                                       headers=self.headers)
            elif not ContainsWildcard(self.wildcard_uri.object_name):
                # URI contains an object name without a wildcard, so just
                # yield the needed URI.
                uri_to_yield = bucket_uri.clone_replace_name(
                    self.wildcard_uri.object_name)
                yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                                       headers=self.headers)
            else:
                for listing_ref in self._ExpandObjectWildcard(bucket_uri):
                    yield listing_ref

    def _ExpandBucketUris(self):
        """Returns the list of bucket URIs selected by self.wildcard_uri.

        Without a wildcard in the bucket name this is the single bucket named
        by the URI; otherwise all buckets are listed and filtered against the
        bucket-name wildcard.
        """
        if not ContainsWildcard(self.wildcard_uri.bucket_name):
            return [self.wildcard_uri.clone_replace_name('')]

        prog = re.compile(fnmatch.translate(self.wildcard_uri.bucket_name))
        self.proj_id_handler.FillInProjectHeaderIfNeeded(
            WILDCARD_BUCKET_ITERATOR, self.wildcard_uri, self.headers)
        bucket_uris = []
        for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
            if prog.match(b.name):
                # Use str(b.name) because get_all_buckets() returns Unicode
                # string, which when used to construct x-goog-copy-src
                # metadata requests for object-to-object copies causes
                # pathname '/' chars to be entity-encoded (bucket%2Fdir
                # instead of bucket/dir), which causes the request to fail.
                uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                       urllib.quote_plus(str(b.name)))
                bucket_uris.append(
                    boto.storage_uri(
                        uri_str, debug=self.debug,
                        bucket_storage_uri_class=(
                            self.bucket_storage_uri_class),
                        suppress_consec_slashes=False))
        return bucket_uris

    def _ExpandObjectWildcard(self, bucket_uri):
        """Yields a BucketListingRef per match of the object wildcard.

        Expands iteratively by building a prefix/delimiter bucket listing
        request, filtering the results per the current level's wildcard, and
        continuing with the next component of the wildcard. See
        _BuildBucketFilterStrings() documentation for details.

        Args:
          bucket_uri: StorageUri of the bucket whose contents are listed.
        """
        # Initialize the iteration with bucket name from bucket_uri but
        # object name from self.wildcard_uri. This is needed to handle cases
        # where both the bucket and object names contain wildcards.
        uris_needing_expansion = [
            bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
        while uris_needing_expansion:
            uri = uris_needing_expansion.pop(0)
            (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
                self._BuildBucketFilterStrings(uri.object_name))
            prog = re.compile(fnmatch.translate(prefix_wildcard))
            # List bucket for objects matching prefix up to delimiter.
            for key in bucket_uri.list_bucket(prefix=prefix,
                                              delimiter=delimiter,
                                              headers=self.headers,
                                              all_versions=self.all_versions):
                # Check that the prefix regex matches rstripped key.name (to
                # correspond with the rstripped prefix_wildcard from
                # _BuildBucketFilterStrings()).
                stripped_name = key.name.rstrip('/')
                if not prog.match(stripped_name):
                    continue
                if suffix_wildcard and stripped_name != suffix_wildcard:
                    # There's more wildcard left to expand, which is only
                    # possible below a Prefix; other results are dropped.
                    if isinstance(key, Prefix):
                        uris_needing_expansion.append(
                            uri.clone_replace_name(
                                stripped_name + '/' + suffix_wildcard))
                    continue

                # Done expanding.
                expanded_uri = uri.clone_replace_key(key)
                if isinstance(key, Prefix):
                    yield BucketListingRef(expanded_uri, key=None, prefix=key,
                                           headers=self.headers)
                elif self.all_versions:
                    yield BucketListingRef(expanded_uri, key=key, prefix=None,
                                           headers=self.headers)
                else:
                    # Yield BLR wrapping version-less URI.
                    yield BucketListingRef(
                        expanded_uri.clone_replace_name(
                            expanded_uri.object_name),
                        key=key, prefix=None, headers=self.headers)

    def _BuildBucketFilterStrings(self, wildcard):
        """Builds strings for querying a bucket and filtering the results.

        Together these implement wildcard object name matching.

        Args:
          wildcard: The wildcard string to match to objects.

        Returns:
          (prefix, delimiter, prefix_wildcard, suffix_wildcard)
          where:
            prefix is the prefix to be sent in bucket GET request (None when
              the wildcard starts with a wildcard character).
            delimiter is the delimiter to be sent in bucket GET request
              (None for recursive '**' wildcards).
            prefix_wildcard is the wildcard to be used to filter bucket GET
              results.
            suffix_wildcard is wildcard to be appended to filtered bucket GET
              results for next wildcard expansion iteration.
          For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
          would build prefix=abc/d, delimiter=/, prefix_wildcard=abc/d*e, and
          suffix_wildcard=f*.txt. Using this prefix and delimiter for a
          bucket listing request will then produce a listing result set that
          can be filtered using this prefix_wildcard; and we'd use this
          suffix_wildcard to feed into the next call(s) to
          _BuildBucketFilterStrings(), for the next iteration of
          listing/filtering.
        """
        # Generate a request prefix if the object name part of the wildcard
        # starts with a non-wildcard string (e.g., that's true for
        # 'gs://bucket/abc*xyz').
        match = WILDCARD_REGEX.search(wildcard)
        if not match:
            # Input "wildcard" has no wildcard chars, so just return tuple
            # that will cause a bucket listing to match the given input
            # wildcard. Example: if previous iteration yielded
            # gs://bucket/dir/ with suffix_wildcard abc, the next iteration
            # will call _BuildBucketFilterStrings() with gs://bucket/dir/abc,
            # and we will return prefix='dir/abc', delimiter='/',
            # prefix_wildcard='dir/abc', and suffix_wildcard=''.
            prefix = wildcard
            delimiter = '/'
            prefix_wildcard = wildcard
            suffix_wildcard = ''
        else:
            if match.start() > 0:
                # Wildcard does not occur at beginning of object name, so
                # construct a prefix string to send to server.
                prefix = wildcard[:match.start()]
                wildcard_part = wildcard[match.start():]
            else:
                prefix = None
                wildcard_part = wildcard
            end = wildcard_part.find('/')
            if end != -1:
                wildcard_part = wildcard_part[:end+1]
            # Remove trailing '/' so we will match gs://bucket/abc* as well
            # as gs://bucket/abc*/ with the same wildcard regex.
            prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')
            suffix_wildcard = wildcard[match.end():]
            end = suffix_wildcard.find('/')
            if end == -1:
                suffix_wildcard = ''
            else:
                suffix_wildcard = suffix_wildcard[end+1:]
            # To implement recursive (**) wildcarding, if prefix_wildcard
            # contains '**' don't send a delimiter, and combine
            # suffix_wildcard at end of prefix_wildcard.
            if '**' in prefix_wildcard:
                delimiter = None
                prefix_wildcard = prefix_wildcard + suffix_wildcard
                suffix_wildcard = ''
            else:
                delimiter = '/'
        # The following debug output is useful for tracing how the algorithm
        # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
        if self.debug > 1:
            sys.stderr.write(
                'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
                'prefix_wildcard=%s, suffix_wildcard=%s\n' %
                (wildcard, prefix, delimiter, prefix_wildcard,
                 suffix_wildcard))
        return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

    def IterKeys(self):
        """Convenience iterator that returns the Key for each iteration.

        Runs the underlying iterator and skips every BucketListingRef that
        has no Key.

        Yields:
          Subclass of boto.s3.key.Key, or empty iterator if no matches.
        """
        for bucket_listing_ref in self:
            if bucket_listing_ref.HasKey():
                yield bucket_listing_ref.GetKey()

    def IterUris(self):
        """Convenience iterator that returns StorageUri for each iteration.

        Yields:
          StorageUri, or empty iterator if no matches.
        """
        for bucket_listing_ref in self:
            yield bucket_listing_ref.GetUri()

    def IterUrisForKeys(self):
        """Convenience iterator over the StorageUris of refs that have a Key.

        Runs the underlying iterator and returns the StorageUri for each
        iterated BucketListingRef that has a Key.

        Yields:
          StorageUri, or empty iterator if no matches.
        """
        for bucket_listing_ref in self:
            if bucket_listing_ref.HasKey():
                yield bucket_listing_ref.GetUri()
351 | |
352 | |
class FileWildcardIterator(WildcardIterator):
    """WildcardIterator subclass for files and directories.

    If you use recursive wildcards ('**') only a single such wildcard is
    supported. For example you could use the wildcard '**/*.txt' to list all
    .txt files in any subdirectory of the current directory, but you couldn't
    use a wildcard like '**/abc/**/*.txt' (which would, if supported, let you
    find .txt files in any subdirectory named 'abc').
    """

    def __init__(self, wildcard_uri, headers=None, debug=0):
        """Instantiates an iterator over BucketListingRefs matching a wildcard.

        Args:
          wildcard_uri: StorageUri that contains the wildcard to iterate.
          headers: Dictionary containing optional HTTP headers to pass to
              boto.
          debug: Debug level to pass in to boto connection (range 0..3).
        """
        self.wildcard_uri = wildcard_uri
        self.headers = headers
        self.debug = debug

    def __iter__(self):
        """Python iterator that gets called when iterating over file wildcard.

        Yields:
          BucketListingRef wrapping the URI of each matching file path, or
          empty iterator if no matches.

        Raises:
          WildcardException: if the wildcard has more than 2 consecutive '*'
              characters.
        """
        wildcard = self.wildcard_uri.object_name
        match = re.search(r'\*\*', wildcard)
        if match:
            # Recursive wildcarding request ('.../**/...').
            # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
            leading_path = wildcard[:match.start()]
            remaining_wildcard = wildcard[match.end():]
            # At this point for the above example leading_path =
            # '/tmp/tmp2pQJAX/' and remaining_wildcard = '/*'
            if remaining_wildcard.startswith('*'):
                raise WildcardException(
                    'Invalid wildcard with more than 2 consecutive '
                    '*s (%s)' % wildcard)
            # If there was no remaining wildcard past the recursive wildcard,
            # treat it as if it were a '*'. For example, file://tmp/** is
            # equivalent to file://tmp/**/*
            if not remaining_wildcard:
                remaining_wildcard = '*'
            # Skip slash(es).
            remaining_wildcard = remaining_wildcard.lstrip(os.sep)

            # A wildcard that starts with '**' (e.g. '**/*.txt') has no
            # leading directory; walk the current directory rather than
            # slicing a character off the end of the wildcard itself.
            walk_from_cwd = not leading_path
            if walk_from_cwd:
                base_dir = os.curdir
            elif leading_path == os.sep:
                # '/**': keep the root instead of reducing it to ''.
                base_dir = os.sep
            else:
                # Drop the character in front of '**', which is expected to
                # be the path separator ('/tmp/tmp2pQJAX/' ->
                # '/tmp/tmp2pQJAX').
                base_dir = leading_path[:-1]

            filepaths = []
            for dirpath, unused_dirnames, filenames in os.walk(base_dir):
                for filename in fnmatch.filter(filenames,
                                               remaining_wildcard):
                    filepath = os.path.join(dirpath, filename)
                    if walk_from_cwd:
                        # Strip the './' that os.walk(os.curdir) prepends,
                        # so results are plain relative paths.
                        filepath = os.path.normpath(filepath)
                    filepaths.append(filepath)
        else:
            # Not a recursive wildcarding request.
            filepaths = glob.glob(wildcard)
        for filepath in filepaths:
            expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
            yield BucketListingRef(expanded_uri)

    def IterKeys(self):
        """Placeholder to allow polymorphic use of WildcardIterator.

        Raises:
          WildcardException: in all cases.
        """
        raise WildcardException(
            'Iterating over Keys not possible for file wildcards')

    def IterUris(self):
        """Convenience iterator that returns StorageUri for each iteration.

        Yields:
          StorageUri, or empty iterator if no matches.
        """
        for bucket_listing_ref in self:
            yield bucket_listing_ref.GetUri()
429 | |
430 | |
try:
    _WILDCARD_EXCEPTION_BASE = StandardError
except NameError:
    # StandardError only exists on Python 2; fall back to Exception so the
    # class can still be defined where that builtin is missing.
    _WILDCARD_EXCEPTION_BASE = Exception


class WildcardException(_WILDCARD_EXCEPTION_BASE):
    """Exception thrown for invalid wildcard URIs.

    Attributes are documented inline: `reason` holds the human-readable
    explanation passed to the constructor.
    """

    def __init__(self, reason):
        """Stores the failure reason.

        Args:
          reason: String describing why the wildcard was rejected.
        """
        # Forward the reason to the base class so that `args` is populated
        # (it was previously left empty).
        _WILDCARD_EXCEPTION_BASE.__init__(self, reason)
        self.reason = reason

    def __repr__(self):
        """Returns the same text as str()."""
        return 'WildcardException: %s' % self.reason

    def __str__(self):
        """Returns 'WildcardException: <reason>'."""
        return 'WildcardException: %s' % self.reason
443 | |
444 | |
def wildcard_iterator(uri_or_str, proj_id_handler,
                      bucket_storage_uri_class=BucketStorageUri,
                      all_versions=False,
                      headers=None, debug=0):
    """Builds the WildcardIterator that fits the given StorageUri.

    Args:
      uri_or_str: StorageUri or URI string naming wildcard objects to
          iterate.
      proj_id_handler: ProjectIdHandler to use for current command.
      bucket_storage_uri_class: BucketStorageUri interface.
          Settable for testing/mocking.
      all_versions: Forwarded to CloudWildcardIterator; not used for file
          URIs.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).

    Returns:
      A CloudWildcardIterator for cloud URIs, or a FileWildcardIterator for
      file URIs.

    Raises:
      WildcardException: if the URI is neither a cloud URI nor a file URI.
    """
    uri = uri_or_str
    if isinstance(uri_or_str, basestring):
        # Parse the string with validation turned off, to allow bucket names
        # containing wildcard chars.
        uri = boto.storage_uri(
            uri_or_str, debug=debug, validate=False,
            bucket_storage_uri_class=bucket_storage_uri_class,
            suppress_consec_slashes=False)

    if uri.is_cloud_uri():
        return CloudWildcardIterator(
            uri, proj_id_handler,
            bucket_storage_uri_class=bucket_storage_uri_class,
            all_versions=all_versions,
            headers=headers,
            debug=debug)
    if uri.is_file_uri():
        return FileWildcardIterator(uri, headers=headers, debug=debug)
    raise WildcardException('Unexpected type of StorageUri (%s)' % uri)
484 | |
485 | |
def ContainsWildcard(uri_or_str):
    """Tells whether a URI, or a URI string, has any wildcard character.

    Args:
      uri_or_str: StorageUri or URI string to check. For anything that is
          not a string, its `uri` attribute is the text that gets checked.

    Returns:
      True if WILDCARD_REGEX finds a match in the text, False otherwise.
    """
    if isinstance(uri_or_str, basestring):
        text = uri_or_str
    else:
        text = uri_or_str.uri
    return WILDCARD_REGEX.search(text) is not None
OLD | NEW |