OLD | NEW |
| (Empty) |
1 # -*- coding: utf-8 -*- | |
2 # Copyright 2010 Google Inc. All Rights Reserved. | |
3 # | |
4 # Licensed under the Apache License, Version 2.0 (the "License"); | |
5 # you may not use this file except in compliance with the License. | |
6 # You may obtain a copy of the License at | |
7 # | |
8 # http://www.apache.org/licenses/LICENSE-2.0 | |
9 # | |
10 # Unless required by applicable law or agreed to in writing, software | |
11 # distributed under the License is distributed on an "AS IS" BASIS, | |
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 # See the License for the specific language governing permissions and | |
14 # limitations under the License. | |
15 """Wildcard iterator class and supporting functions.""" | |
16 | |
17 from __future__ import absolute_import | |
18 | |
19 import fnmatch | |
20 import glob | |
21 import os | |
22 import re | |
23 import sys | |
24 import textwrap | |
25 | |
26 from gslib.bucket_listing_ref import BucketListingBucket | |
27 from gslib.bucket_listing_ref import BucketListingObject | |
28 from gslib.bucket_listing_ref import BucketListingPrefix | |
29 from gslib.cloud_api import AccessDeniedException | |
30 from gslib.cloud_api import CloudApi | |
31 from gslib.cloud_api import NotFoundException | |
32 from gslib.exception import CommandException | |
33 from gslib.storage_url import ContainsWildcard | |
34 from gslib.storage_url import StorageUrlFromString | |
35 from gslib.storage_url import StripOneSlash | |
36 from gslib.storage_url import WILDCARD_REGEX | |
37 from gslib.translation_helper import GenerationFromUrlAndString | |
38 from gslib.util import UTF8 | |
39 | |
40 | |
# Matches a recursive-wildcard ('**') string: group 'before' captures
# everything up to the first '**' (non-greedy), group 'after' the remainder.
FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
42 | |
43 | |
class WildcardIterator(object):
  """Abstract base for iterators over storage URL strings containing wildcards.

  Do not instantiate this class directly; use the CreateWildcardIterator
  factory function, which picks the concrete implementation appropriate
  for the given URL string (cloud vs. local file).
  """

  # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both
  # and make one return the other.
  def __repr__(self):
    """Returns a debug string naming the wildcard URL being iterated."""
    return 'WildcardIterator(%s)' % self.wildcard_url.url_string
57 | |
58 | |
class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets, bucket subdirs and objects.

  Iterates over BucketListingRefs matching the URL string wildcard. It's
  much more efficient to first get metadata that's available in the bucket
  listing (for example the name and size of each object), because that
  information is already present in the object list results.
  """
67 | |
68 def __init__(self, wildcard_url, gsutil_api, all_versions=False, | |
69 debug=0, project_id=None): | |
70 """Instantiates an iterator that matches the wildcard URL. | |
71 | |
72 Args: | |
73 wildcard_url: CloudUrl that contains the wildcard to iterate. | |
74 gsutil_api: Cloud storage interface. Passed in for thread safety, also | |
75 settable for testing/mocking. | |
76 all_versions: If true, the iterator yields all versions of objects | |
77 matching the wildcard. If false, yields just the live | |
78 object version. | |
79 debug: Debug level to control debug output for iterator. | |
80 project_id: Project ID to use for bucket listings. | |
81 """ | |
82 self.wildcard_url = wildcard_url | |
83 self.all_versions = all_versions | |
84 self.debug = debug | |
85 self.gsutil_api = gsutil_api | |
86 self.project_id = project_id | |
87 | |
  def __iter__(self, bucket_listing_fields=None,
               expand_top_level_buckets=False):
    """Iterator that gets called when iterating over the cloud wildcard.

    In the case where no wildcard is present, returns a single matching object,
    single matching prefix, or one of each if both exist.

    Args:
      bucket_listing_fields: Iterable fields to include in bucket listings.
                             Ex. ['name', 'acl'].  Iterator is
                             responsible for converting these to list-style
                             format ['items/name', 'items/acl'] as well as
                             adding any fields necessary for listing such as
                             prefixes.  API implementation is responsible for
                             adding pagination fields.  If this is None,
                             all fields are returned.
      expand_top_level_buckets: If true, yield no BUCKET references.  Instead,
                                expand buckets into top-level objects and
                                prefixes.

    Yields:
      BucketListingRef of type BUCKET, OBJECT or PREFIX.
    """
    single_version_request = self.wildcard_url.HasGeneration()

    # For wildcard expansion purposes, we need at a minimum the name of
    # each object and prefix. If we're not using the default of requesting
    # all fields, make sure at least these are requested. The Cloud API
    # tolerates specifying the same field twice.
    get_fields = None
    if bucket_listing_fields:
      get_fields = set()
      for field in bucket_listing_fields:
        get_fields.add(field)
      bucket_listing_fields = self._GetToListFields(
          get_fields=bucket_listing_fields)
      bucket_listing_fields.update(['items/name', 'prefixes'])
      get_fields.update(['name'])
      # If we're making versioned requests, ensure generation and
      # metageneration are also included.
      if single_version_request or self.all_versions:
        bucket_listing_fields.update(['items/generation',
                                      'items/metageneration'])
        get_fields.update(['generation', 'metageneration'])

    # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
    # iterate over the expanded bucket strings and handle any object
    # wildcarding.
    for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
      bucket_url_string = bucket_listing_ref.url_string
      if self.wildcard_url.IsBucket():
        # IsBucket() guarantees there are no prefix or object wildcards, and
        # thus this is a top-level listing of buckets.
        if expand_top_level_buckets:
          url = StorageUrlFromString(bucket_url_string)
          for obj_or_prefix in self.gsutil_api.ListObjects(
              url.bucket_name, delimiter='/', all_versions=self.all_versions,
              provider=self.wildcard_url.scheme,
              fields=bucket_listing_fields):
            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
              yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data,
                                       with_version=self.all_versions)
            else:  # CloudApi.CsObjectOrPrefixType.PREFIX:
              yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
        else:
          yield bucket_listing_ref
      else:
        # By default, assume a non-wildcarded URL is an object, not a prefix.
        # This prevents unnecessary listings (which are slower, more expensive,
        # and also subject to eventual consistency).
        if (not ContainsWildcard(self.wildcard_url.url_string) and
            self.wildcard_url.IsObject() and not self.all_versions):
          try:
            get_object = self.gsutil_api.GetObjectMetadata(
                self.wildcard_url.bucket_name,
                self.wildcard_url.object_name,
                generation=self.wildcard_url.generation,
                provider=self.wildcard_url.scheme,
                fields=get_fields)
            yield self._GetObjectRef(
                self.wildcard_url.bucket_url_string, get_object,
                with_version=(self.all_versions or single_version_request))
            return
          except (NotFoundException, AccessDeniedException):
            # It's possible this is a prefix - try to list instead.
            pass

        # Expand iteratively by building prefix/delimiter bucket listing
        # request, filtering the results per the current level's wildcard
        # (if present), and continuing with the next component of the
        # wildcard. See _BuildBucketFilterStrings() documentation for details.
        if single_version_request:
          url_string = '%s%s#%s' % (bucket_url_string,
                                    self.wildcard_url.object_name,
                                    self.wildcard_url.generation)
        else:
          # Rstrip any prefixes to correspond with rstripped prefix wildcard
          # from _BuildBucketFilterStrings().
          url_string = '%s%s' % (bucket_url_string,
                                 StripOneSlash(self.wildcard_url.object_name)
                                 or '/')  # Cover root object named '/' case.
        urls_needing_expansion = [url_string]
        while urls_needing_expansion:
          url = StorageUrlFromString(urls_needing_expansion.pop(0))
          (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
              self._BuildBucketFilterStrings(url.object_name))
          # prefix_wildcard is an fnmatch pattern; compile it once per
          # expansion level and filter listing results client-side.
          prog = re.compile(fnmatch.translate(prefix_wildcard))

          # List bucket for objects matching prefix up to delimiter.
          for obj_or_prefix in self.gsutil_api.ListObjects(
              url.bucket_name, prefix=prefix, delimiter=delimiter,
              all_versions=self.all_versions or single_version_request,
              provider=self.wildcard_url.scheme,
              fields=bucket_listing_fields):
            if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
              gcs_object = obj_or_prefix.data
              if prog.match(gcs_object.name):
                if not suffix_wildcard or (
                    StripOneSlash(gcs_object.name) == suffix_wildcard):
                  if not single_version_request or (
                      self._SingleVersionMatches(gcs_object.generation)):
                    yield self._GetObjectRef(
                        bucket_url_string, gcs_object, with_version=(
                            self.all_versions or single_version_request))
            else:  # CloudApi.CsObjectOrPrefixType.PREFIX
              # NOTE(review): this rebinding shadows the 'prefix' filter string
              # unpacked above.  Safe because the ListObjects call has already
              # been issued with the old value, and 'prefix' is recomputed on
              # the next while-loop iteration - but a distinct name would be
              # clearer.
              prefix = obj_or_prefix.data
              # If the prefix ends with a slash, remove it. Note that we only
              # remove one slash so that we can successfully enumerate dirs
              # containing multiple slashes.
              rstripped_prefix = StripOneSlash(prefix)
              if prog.match(rstripped_prefix):
                if suffix_wildcard and rstripped_prefix != suffix_wildcard:
                  # There's more wildcard left to expand.
                  url_append_string = '%s%s' % (
                      bucket_url_string, rstripped_prefix + '/' +
                      suffix_wildcard)
                  urls_needing_expansion.append(url_append_string)
                else:
                  # No wildcard to expand, just yield the prefix
                  yield self._GetPrefixRef(bucket_url_string, prefix)
228 | |
  def _BuildBucketFilterStrings(self, wildcard):
    """Builds strings needed for querying a bucket and filtering results.

    This implements wildcard object name matching.

    Args:
      wildcard: The wildcard string to match to objects.

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request.
        delimiter is the delimiter to be sent in bucket GET request.
        prefix_wildcard is the wildcard to be used to filter bucket GET results.
        suffix_wildcard is wildcard to be appended to filtered bucket GET
          results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix= abc/d, delimiter=/, prefix_wildcard=d*e, and
      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
      listing request will then produce a listing result set that can be
      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
      next iteration of listing/filtering.

    Raises:
      AssertionError if wildcard doesn't contain any wildcard chars.
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      # Truncate the wildcard part at the first '/' (keeping that slash) so
      # that this iteration expands only a single path component.
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end+1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
      # Whatever follows the current component's wildcard (after the next '/')
      # is left for a later expansion iteration.
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end+1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard
      # suffix_wildcard starts with '**' don't send a delimiter, and combine
      # suffix_wildcard at end of prefix_wildcard.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard += suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    if self.debug > 1:
      sys.stderr.write(
          'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
          'prefix_wildcard=%s, suffix_wildcard=%s\n' %
          (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
308 | |
309 def _SingleVersionMatches(self, listed_generation): | |
310 decoded_generation = GenerationFromUrlAndString(self.wildcard_url, | |
311 listed_generation) | |
312 return str(self.wildcard_url.generation) == str(decoded_generation) | |
313 | |
  def _ExpandBucketWildcards(self, bucket_fields=None):
    """Expands bucket and provider wildcards.

    Builds a list of bucket url strings that can be iterated on.

    Args:
      bucket_fields: If present, populate only these metadata fields for
                     buckets.  Example value: ['acl', 'defaultObjectAcl']

    Yields:
      BucketListingReferences of type BUCKET.
    """
    bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
    if (bucket_fields and set(bucket_fields) == set(['id']) and
        not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we just want the name of a non-wildcarded bucket URL,
      # don't make an RPC.
      yield BucketListingBucket(bucket_url)
    elif(self.wildcard_url.IsBucket() and
         not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we have a non-wildcarded bucket URL, get just that bucket.
      yield BucketListingBucket(
          bucket_url, root_object=self.gsutil_api.GetBucket(
              self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme,
              fields=bucket_fields))
    else:
      # Wildcarded bucket name: list the project's buckets and filter
      # client-side with a regex built from the fnmatch translation.
      regex = fnmatch.translate(self.wildcard_url.bucket_name)
      prog = re.compile(regex)

      fields = self._GetToListFields(bucket_fields)
      if fields:
        # 'items/id' is always needed so the regex filter below has a
        # bucket name to match against.
        fields.add('items/id')
      for bucket in self.gsutil_api.ListBuckets(
          fields=fields, project_id=self.project_id,
          provider=self.wildcard_url.scheme):
        if prog.match(bucket.id):
          url = StorageUrlFromString(
              '%s://%s/' % (self.wildcard_url.scheme, bucket.id))
          yield BucketListingBucket(url, root_object=bucket)
353 | |
354 def _GetToListFields(self, get_fields=None): | |
355 """Prepends 'items/' to the input fields and converts it to a set. | |
356 | |
357 This way field sets requested for GetBucket can be used in ListBucket calls. | |
358 Note that the input set must contain only bucket or object fields; listing | |
359 fields such as prefixes or nextPageToken should be added after calling | |
360 this function. | |
361 | |
362 Args: | |
363 get_fields: Iterable fields usable in GetBucket/GetObject calls. | |
364 | |
365 Returns: | |
366 Set of fields usable in ListBuckets/ListObjects calls. | |
367 """ | |
368 if get_fields: | |
369 list_fields = set() | |
370 for field in get_fields: | |
371 list_fields.add('items/' + field) | |
372 return list_fields | |
373 | |
374 def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False): | |
375 """Creates a BucketListingRef of type OBJECT from the arguments. | |
376 | |
377 Args: | |
378 bucket_url_string: Wildcardless string describing the containing bucket. | |
379 gcs_object: gsutil_api root Object for populating the BucketListingRef. | |
380 with_version: If true, return a reference with a versioned string. | |
381 | |
382 Returns: | |
383 BucketListingRef of type OBJECT. | |
384 """ | |
385 # Generation can be None in test mocks, so just return the | |
386 # live object for simplicity. | |
387 if with_version and gcs_object.generation is not None: | |
388 generation_str = GenerationFromUrlAndString(self.wildcard_url, | |
389 gcs_object.generation) | |
390 object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name, | |
391 generation_str) | |
392 else: | |
393 object_string = '%s%s' % (bucket_url_string, gcs_object.name) | |
394 object_url = StorageUrlFromString(object_string) | |
395 return BucketListingObject(object_url, root_object=gcs_object) | |
396 | |
397 def _GetPrefixRef(self, bucket_url_string, prefix): | |
398 """Creates a BucketListingRef of type PREFIX from the arguments. | |
399 | |
400 Args: | |
401 bucket_url_string: Wildcardless string describing the containing bucket. | |
402 prefix: gsutil_api Prefix for populating the BucketListingRef | |
403 | |
404 Returns: | |
405 BucketListingRef of type PREFIX. | |
406 """ | |
407 prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix)) | |
408 return BucketListingPrefix(prefix_url, root_object=prefix) | |
409 | |
410 def IterBuckets(self, bucket_fields=None): | |
411 """Iterates over the wildcard, returning refs for each expanded bucket. | |
412 | |
413 This ignores the object part of the URL entirely and expands only the | |
414 the bucket portion. It will yield BucketListingRefs of type BUCKET only. | |
415 | |
416 Args: | |
417 bucket_fields: Iterable fields to include in bucket listings. | |
418 Ex. ['defaultObjectAcl', 'logging']. This function is | |
419 responsible for converting these to listing-style | |
420 format ['items/defaultObjectAcl', 'items/logging'], as | |
421 well as adding any fields necessary for listing such as | |
422 'items/id'. API implemenation is responsible for | |
423 adding pagination fields. If this is None, all fields are | |
424 returned. | |
425 | |
426 Yields: | |
427 BucketListingRef of type BUCKET, or empty iterator if no matches. | |
428 """ | |
429 for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields): | |
430 yield blr | |
431 | |
432 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): | |
433 """Iterates over the wildcard, yielding bucket, prefix or object refs. | |
434 | |
435 Args: | |
436 bucket_listing_fields: If present, populate only these metadata | |
437 fields for listed objects. | |
438 expand_top_level_buckets: If true and the wildcard expands only to | |
439 Bucket(s), yields the expansion of each bucket | |
440 into a top-level listing of prefixes and objects | |
441 in that bucket instead of a BucketListingRef | |
442 to that bucket. | |
443 | |
444 Yields: | |
445 BucketListingRef, or empty iterator if no matches. | |
446 """ | |
447 for blr in self.__iter__( | |
448 bucket_listing_fields=bucket_listing_fields, | |
449 expand_top_level_buckets=expand_top_level_buckets): | |
450 yield blr | |
451 | |
452 def IterObjects(self, bucket_listing_fields=None): | |
453 """Iterates over the wildcard, yielding only object BucketListingRefs. | |
454 | |
455 Args: | |
456 bucket_listing_fields: If present, populate only these metadata | |
457 fields for listed objects. | |
458 | |
459 Yields: | |
460 BucketListingRefs of type OBJECT or empty iterator if no matches. | |
461 """ | |
462 for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields, | |
463 expand_top_level_buckets=True): | |
464 if blr.IsObject(): | |
465 yield blr | |
466 | |
467 | |
class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  Iteration yields BucketListingRefs of type OBJECT for files and PREFIX
  for directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """
477 | |
478 def __init__(self, wildcard_url, debug=0): | |
479 """Instantiates an iterator over BucketListingRefs matching wildcard URL. | |
480 | |
481 Args: | |
482 wildcard_url: FileUrl that contains the wildcard to iterate. | |
483 debug: Debug level (range 0..3). | |
484 """ | |
485 self.wildcard_url = wildcard_url | |
486 self.debug = debug | |
487 | |
  def __iter__(self):
    """Iterator that gets called when iterating over the file wildcard.

    In the case where no wildcard is present, returns a single matching file
    or directory.

    Raises:
      WildcardException: if invalid wildcard found.

    Yields:
      BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
    """
    wildcard = self.wildcard_url.object_name
    match = FLAT_LIST_REGEX.match(wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      # [:-1] drops the slash preceding the '**'.
      base_dir = match.group('before')[:-1]
      remaining_wildcard = match.group('after')
      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException('Invalid wildcard with more than 2 consecutive '
                                '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      # NOTE(review): this strips os.sep; on Windows os.sep is '\\' while the
      # wildcard above is split on '/' - confirm Windows paths are normalized
      # before reaching here.
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
      filepaths = self._IterDir(base_dir, remaining_wildcard)
    else:
      # Not a recursive wildcarding request; glob handles single-level
      # wildcards (and literal names) directly.
      filepaths = glob.iglob(wildcard)
    for filepath in filepaths:
      expanded_url = StorageUrlFromString(filepath)
      if os.path.isdir(filepath):
        yield BucketListingPrefix(expanded_url)
      else:
        yield BucketListingObject(expanded_url)
529 | |
  def _IterDir(self, directory, wildcard):
    """An iterator over the specified dir and wildcard.

    Args:
      directory: Base directory to walk recursively.
      wildcard: fnmatch-style pattern applied to each file name (file names
          only, not intervening directory components).

    Yields:
      Unicode path string for each file under directory whose name matches
      wildcard.

    Raises:
      CommandException: if a file path cannot be decoded as UTF-8.
    """
    # UTF8-encode directory before passing it to os.walk() so if there are
    # non-valid UTF8 chars in the file name (e.g., that can happen if the file
    # originated on Windows) os.walk() will not attempt to decode and then die
    # with a "codec can't decode byte" error, and instead we can catch the error
    # at yield time and print a more informative error message.
    for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)):
      for f in fnmatch.filter(filenames, wildcard):
        try:
          yield os.path.join(dirpath, f).decode(UTF8)
        except UnicodeDecodeError:
          # Note: We considered several ways to deal with this, but each had
          # problems:
          # 1. Raise an exception and try to catch in a higher layer (the
          #    gsutil cp command), so we can properly support the gsutil cp -c
          #    option. That doesn't work because raising an exception during
          #    iteration terminates the generator.
          # 2. Accumulate a list of bad filenames and skip processing each
          #    during iteration, then raise at the end, with exception text
          #    printing the bad paths. That doesn't work because iteration is
          #    wrapped in PluralityCheckableIterator, so it's possible there
          #    are not-yet-performed copy operations at the time we reach the
          #    end of the iteration and raise the exception - which would cause
          #    us to skip copying validly named files. Moreover, the gsutil
          #    cp command loops over argv, so if you run the command gsutil cp
          #    -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
          #    would cause dir2 never to be visited.
          # 3. Print the invalid pathname and skip it during iteration. That
          #    would work but would mean gsutil cp could exit with status 0
          #    even though some files weren't copied.
          # 4. Change the WildcardIterator to include an error status along with
          #    the result. That would solve the problem but would be a
          #    substantial change (WildcardIterator is used in many parts of
          #    gsutil), and we didn't feel that magnitude of change was
          #    warranted by this relatively uncommon corner case.
          # Instead we chose to abort when one such file is encountered, and
          # require the user to remove or rename the files and try again.
          raise CommandException('\n'.join(textwrap.wrap(
              'Invalid Unicode path encountered (%s). gsutil cannot proceed '
              'with such files present. Please remove or rename this file and '
              'try again. NOTE: the path printed above replaces the '
              'problematic characters with a hex-encoded printable '
              'representation. For more details (including how to convert to a '
              'gsutil-compatible encoding) see `gsutil help encoding`.' %
              repr(os.path.join(dirpath, f)))))
576 | |
577 # pylint: disable=unused-argument | |
578 def IterObjects(self, bucket_listing_fields=None): | |
579 """Iterates over the wildcard, yielding only object (file) refs. | |
580 | |
581 Args: | |
582 bucket_listing_fields: Ignored as filesystems don't have buckets. | |
583 | |
584 Yields: | |
585 BucketListingRefs of type OBJECT or empty iterator if no matches. | |
586 """ | |
587 for bucket_listing_ref in self.IterAll(): | |
588 if bucket_listing_ref.IsObject(): | |
589 yield bucket_listing_ref | |
590 | |
591 # pylint: disable=unused-argument | |
592 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): | |
593 """Iterates over the wildcard, yielding BucketListingRefs. | |
594 | |
595 Args: | |
596 bucket_listing_fields: Ignored; filesystems don't have buckets. | |
597 expand_top_level_buckets: Ignored; filesystems don't have buckets. | |
598 | |
599 Yields: | |
600 BucketListingRefs of type OBJECT (file) or PREFIX (directory), | |
601 or empty iterator if no matches. | |
602 """ | |
603 for bucket_listing_ref in self.__iter__(): | |
604 yield bucket_listing_ref | |
605 | |
606 def IterBuckets(self, unused_bucket_fields=None): | |
607 """Placeholder to allow polymorphic use of WildcardIterator. | |
608 | |
609 Args: | |
610 unused_bucket_fields: Ignored; filesystems don't have buckets. | |
611 | |
612 Raises: | |
613 WildcardException: in all cases. | |
614 """ | |
615 raise WildcardException( | |
616 'Iterating over Buckets not possible for file wildcards') | |
617 | |
618 | |
class WildcardException(StandardError):
  """Exception raised for invalid wildcard URLs."""

  def __init__(self, reason):
    StandardError.__init__(self)
    # Human-readable explanation of why the wildcard was rejected.
    self.reason = reason

  def __repr__(self):
    return 'WildcardException: %s' % self.reason

  def __str__(self):
    # Same text as __repr__; delegate so the message lives in one place.
    return self.__repr__()
631 | |
632 | |
def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0,
                           project_id=None):
  """Instantiate a WildcardIterator for the given URL string.

  Args:
    url_str: URL string naming wildcard object(s) to iterate.
    gsutil_api: Cloud storage interface.  Passed in for thread safety, also
        settable for testing/mocking.
    all_versions: If true, the iterator yields all versions of objects
        matching the wildcard.  If false, yields just the live
        object version.
    debug: Debug level to control debug output for iterator.
    project_id: Project id to use for bucket listings.

  Returns:
    A WildcardIterator that handles the requested iteration.
  """
  url = StorageUrlFromString(url_str)
  if url.IsFileUrl():
    return FileWildcardIterator(url, debug=debug)
  # Anything that isn't a file URL is a cloud URL.
  return CloudWildcardIterator(
      url, gsutil_api, all_versions=all_versions, debug=debug,
      project_id=project_id)
OLD | NEW |