OLD | NEW |
(Empty) | |
| 1 # -*- coding: utf-8 -*- |
| 2 # Copyright 2010 Google Inc. All Rights Reserved. |
| 3 # |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 # you may not use this file except in compliance with the License. |
| 6 # You may obtain a copy of the License at |
| 7 # |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 # |
| 10 # Unless required by applicable law or agreed to in writing, software |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 # See the License for the specific language governing permissions and |
| 14 # limitations under the License. |
| 15 """Wildcard iterator class and supporting functions.""" |
| 16 |
| 17 from __future__ import absolute_import |
| 18 |
| 19 import fnmatch |
| 20 import glob |
| 21 import os |
| 22 import re |
| 23 import sys |
| 24 import textwrap |
| 25 |
| 26 from gslib.bucket_listing_ref import BucketListingBucket |
| 27 from gslib.bucket_listing_ref import BucketListingObject |
| 28 from gslib.bucket_listing_ref import BucketListingPrefix |
| 29 from gslib.cloud_api import AccessDeniedException |
| 30 from gslib.cloud_api import CloudApi |
| 31 from gslib.cloud_api import NotFoundException |
| 32 from gslib.exception import CommandException |
| 33 from gslib.storage_url import ContainsWildcard |
| 34 from gslib.storage_url import StorageUrlFromString |
| 35 from gslib.storage_url import StripOneSlash |
| 36 from gslib.storage_url import WILDCARD_REGEX |
| 37 from gslib.translation_helper import GenerationFromUrlAndString |
| 38 from gslib.util import UTF8 |
| 39 |
| 40 |
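# Matches a path containing a recursive '**' wildcard, capturing the
# (non-greedy) text before the first '**' and everything after it.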
| 41 FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)') |
| 42 |
| 43 |
| 44 class WildcardIterator(object): |
| 45 """Class for iterating over Google Cloud Storage strings containing wildcards. |
| 46 |
| 47 The base class is abstract; you should instantiate using the
| 48 CreateWildcardIterator() factory function, which chooses the right
| 49 implementation depending on the URL string.
| 50 """ |
| 51 |
| 52 # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both |
| 53 # and make one return the other. |
| 54 def __repr__(self): |
| 55 """Returns string representation of WildcardIterator.""" |
| 56 return 'WildcardIterator(%s)' % self.wildcard_url.url_string |
| 57 |
| 58 |
| 59 class CloudWildcardIterator(WildcardIterator): |
| 60 """WildcardIterator subclass for buckets, bucket subdirs and objects. |
| 61 |
| 62 Iterates over BucketListingRefs matching the URL string wildcard. It is
| 63 much more efficient to get metadata that is available in the bucket
| 64 listing (for example, the name and size of each object) this way,
| 65 because that information is already included in the object list results.
| 66 """ |
| 67 |
| 68 def __init__(self, wildcard_url, gsutil_api, all_versions=False, |
| 69 debug=0, project_id=None): |
| 70 """Instantiates an iterator that matches the wildcard URL. |
| 71 |
| 72 Args: |
| 73 wildcard_url: CloudUrl that contains the wildcard to iterate. |
| 74 gsutil_api: Cloud storage interface. Passed in for thread safety, also |
| 75 settable for testing/mocking. |
| 76 all_versions: If true, the iterator yields all versions of objects |
| 77 matching the wildcard. If false, yields just the live |
| 78 object version. |
| 79 debug: Debug level to control debug output for iterator. |
| 80 project_id: Project ID to use for bucket listings. |
| 81 """ |
| 82 self.wildcard_url = wildcard_url |
| 83 self.all_versions = all_versions |
| 84 self.debug = debug |
| 85 self.gsutil_api = gsutil_api |
| 86 self.project_id = project_id |
| 87 |
| 88 def __iter__(self, bucket_listing_fields=None, |
| 89 expand_top_level_buckets=False): |
| 90 """Iterator that gets called when iterating over the cloud wildcard. |
| 91 |
| 92 In the case where no wildcard is present, returns a single matching object, |
| 93 single matching prefix, or one of each if both exist. |
| 94 |
| 95 Args: |
| 96 bucket_listing_fields: Iterable fields to include in bucket listings. |
| 97 Ex. ['name', 'acl']. Iterator is |
| 98 responsible for converting these to list-style |
| 99 format ['items/name', 'items/acl'] as well as |
| 100 adding any fields necessary for listing such as |
| 101 prefixes. API implementation is responsible for
| 102 adding pagination fields. If this is None, |
| 103 all fields are returned. |
| 104 expand_top_level_buckets: If true, yield no BUCKET references. Instead, |
| 105 expand buckets into top-level objects and |
| 106 prefixes. |
| 107 |
| 108 Yields: |
| 109 BucketListingRef of type BUCKET, OBJECT or PREFIX. |
| 110 """ |
| 111 single_version_request = self.wildcard_url.HasGeneration() |
| 112 |
| 113 # For wildcard expansion purposes, we need at a minimum the name of |
| 114 # each object and prefix. If we're not using the default of requesting |
| 115 # all fields, make sure at least these are requested. The Cloud API |
| 116 # tolerates specifying the same field twice. |
| 117 get_fields = None |
| 118 if bucket_listing_fields: |
| 119 get_fields = set() |
| 120 for field in bucket_listing_fields: |
| 121 get_fields.add(field) |
| 122 bucket_listing_fields = self._GetToListFields( |
| 123 get_fields=bucket_listing_fields) |
| 124 bucket_listing_fields.update(['items/name', 'prefixes']) |
| 125 get_fields.update(['name']) |
| 126 # If we're making versioned requests, ensure generation and |
| 127 # metageneration are also included. |
| 128 if single_version_request or self.all_versions: |
| 129 bucket_listing_fields.update(['items/generation', |
| 130 'items/metageneration']) |
| 131 get_fields.update(['generation', 'metageneration']) |
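# For example, bucket_listing_fields=['acl'] becomes
# set(['items/acl', 'items/name', 'prefixes']) and get_fields becomes
# set(['acl', 'name']), plus the generation fields for versioned requests.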
| 132 |
| 133 # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then |
| 134 # iterate over the expanded bucket strings and handle any object |
| 135 # wildcarding. |
| 136 for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']): |
| 137 bucket_url_string = bucket_listing_ref.url_string |
| 138 if self.wildcard_url.IsBucket(): |
| 139 # IsBucket() guarantees there are no prefix or object wildcards, and |
| 140 # thus this is a top-level listing of buckets. |
| 141 if expand_top_level_buckets: |
| 142 url = StorageUrlFromString(bucket_url_string) |
| 143 for obj_or_prefix in self.gsutil_api.ListObjects( |
| 144 url.bucket_name, delimiter='/', all_versions=self.all_versions, |
| 145 provider=self.wildcard_url.scheme, |
| 146 fields=bucket_listing_fields): |
| 147 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: |
| 148 yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data, |
| 149 with_version=self.all_versions) |
| 150 else: # CloudApi.CsObjectOrPrefixType.PREFIX: |
| 151 yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data) |
| 152 else: |
| 153 yield bucket_listing_ref |
| 154 else: |
| 155 # By default, assume a non-wildcarded URL is an object, not a prefix. |
| 156 # This prevents unnecessary listings (which are slower, more expensive, |
| 157 # and also subject to eventual consistency). |
| 158 if (not ContainsWildcard(self.wildcard_url.url_string) and |
| 159 self.wildcard_url.IsObject() and not self.all_versions): |
| 160 try: |
| 161 get_object = self.gsutil_api.GetObjectMetadata( |
| 162 self.wildcard_url.bucket_name, |
| 163 self.wildcard_url.object_name, |
| 164 generation=self.wildcard_url.generation, |
| 165 provider=self.wildcard_url.scheme, |
| 166 fields=get_fields) |
| 167 yield self._GetObjectRef( |
| 168 self.wildcard_url.bucket_url_string, get_object, |
| 169 with_version=(self.all_versions or single_version_request)) |
| 170 return |
| 171 except (NotFoundException, AccessDeniedException): |
| 172 # It's possible this is a prefix - try to list instead. |
| 173 pass |
| 174 |
| 175 # Expand iteratively by building prefix/delimiter bucket listing |
| 176 # request, filtering the results per the current level's wildcard |
| 177 # (if present), and continuing with the next component of the |
| 178 # wildcard. See _BuildBucketFilterStrings() documentation for details. |
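# For example, gs://bucket/abc/d*e/f*.txt is expanded by first listing
# with prefix='abc/d' and delimiter='/', keeping only results that match
# the current level's wildcard, and then re-enqueueing each matching
# prefix with the remaining suffix wildcard 'f*.txt' for the next pass.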
| 179 if single_version_request: |
| 180 url_string = '%s%s#%s' % (bucket_url_string, |
| 181 self.wildcard_url.object_name, |
| 182 self.wildcard_url.generation) |
| 183 else: |
| 184 # Rstrip any prefixes to correspond with rstripped prefix wildcard |
| 185 # from _BuildBucketFilterStrings(). |
| 186 url_string = '%s%s' % (bucket_url_string, |
| 187 StripOneSlash(self.wildcard_url.object_name) |
| 188 or '/') # Cover root object named '/' case. |
| 189 urls_needing_expansion = [url_string] |
| 190 while urls_needing_expansion: |
| 191 url = StorageUrlFromString(urls_needing_expansion.pop(0)) |
| 192 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = ( |
| 193 self._BuildBucketFilterStrings(url.object_name)) |
| 194 prog = re.compile(fnmatch.translate(prefix_wildcard)) |
| 195 |
| 196 # List bucket for objects matching prefix up to delimiter. |
| 197 for obj_or_prefix in self.gsutil_api.ListObjects( |
| 198 url.bucket_name, prefix=prefix, delimiter=delimiter, |
| 199 all_versions=self.all_versions or single_version_request, |
| 200 provider=self.wildcard_url.scheme, |
| 201 fields=bucket_listing_fields): |
| 202 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: |
| 203 gcs_object = obj_or_prefix.data |
| 204 if prog.match(gcs_object.name): |
| 205 if not suffix_wildcard or ( |
| 206 StripOneSlash(gcs_object.name) == suffix_wildcard): |
| 207 if not single_version_request or ( |
| 208 self._SingleVersionMatches(gcs_object.generation)): |
| 209 yield self._GetObjectRef( |
| 210 bucket_url_string, gcs_object, with_version=( |
| 211 self.all_versions or single_version_request)) |
| 212 else: # CloudApi.CsObjectOrPrefixType.PREFIX |
| 213 prefix = obj_or_prefix.data |
| 214 # If the prefix ends with a slash, remove it. Note that we only |
| 215 # remove one slash so that we can successfully enumerate dirs |
| 216 # containing multiple slashes. |
| 217 rstripped_prefix = StripOneSlash(prefix) |
| 218 if prog.match(rstripped_prefix): |
| 219 if suffix_wildcard and rstripped_prefix != suffix_wildcard: |
| 220 # There's more wildcard left to expand. |
| 221 url_append_string = '%s%s' % ( |
| 222 bucket_url_string, rstripped_prefix + '/' + |
| 223 suffix_wildcard) |
| 224 urls_needing_expansion.append(url_append_string) |
| 225 else: |
| 226 # No wildcard to expand, just yield the prefix |
| 227 yield self._GetPrefixRef(bucket_url_string, prefix) |
| 228 |
| 229 def _BuildBucketFilterStrings(self, wildcard): |
| 230 """Builds strings needed for querying a bucket and filtering results. |
| 231 |
| 232 This implements wildcard object name matching. |
| 233 |
| 234 Args: |
| 235 wildcard: The wildcard string to match to objects. |
| 236 |
| 237 Returns: |
| 238 (prefix, delimiter, prefix_wildcard, suffix_wildcard) |
| 239 where: |
| 240 prefix is the prefix to be sent in bucket GET request. |
| 241 delimiter is the delimiter to be sent in bucket GET request. |
| 242 prefix_wildcard is the wildcard to be used to filter bucket GET results. |
| 243 suffix_wildcard is the wildcard to be appended to filtered bucket GET
| 244 results for the next wildcard expansion iteration.
| 245 For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
| 246 would build prefix='abc/d', delimiter='/', prefix_wildcard='abc/d*e',
| 247 and suffix_wildcard='f*.txt'. Using this prefix and delimiter for a bucket
| 248 listing request will then produce a listing result set that can be |
| 249 filtered using this prefix_wildcard; and we'd use this suffix_wildcard |
| 250 to feed into the next call(s) to _BuildBucketFilterStrings(), for the |
| 251 next iteration of listing/filtering. |
| 252 |
| 253 Raises: |
| 254 AssertionError if wildcard doesn't contain any wildcard chars. |
| 255 """ |
| 256 # Generate a request prefix if the object name part of the wildcard starts |
| 257 # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz'). |
| 258 match = WILDCARD_REGEX.search(wildcard) |
| 259 if not match: |
| 260 # Input "wildcard" has no wildcard chars, so just return tuple that will |
| 261 # cause a bucket listing to match the given input wildcard. Example: if |
| 262 # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc, |
| 263 # the next iteration will call _BuildBucketFilterStrings() with |
| 264 gs://bucket/dir/abc, and we will return prefix='dir/abc',
| 265 # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''. |
| 266 prefix = wildcard |
| 267 delimiter = '/' |
| 268 prefix_wildcard = wildcard |
| 269 suffix_wildcard = '' |
| 270 else: |
| 271 if match.start() > 0: |
| 272 # Wildcard does not occur at beginning of object name, so construct a |
| 273 # prefix string to send to server. |
| 274 prefix = wildcard[:match.start()] |
| 275 wildcard_part = wildcard[match.start():] |
| 276 else: |
| 277 prefix = None |
| 278 wildcard_part = wildcard |
| 279 end = wildcard_part.find('/') |
| 280 if end != -1: |
| 281 wildcard_part = wildcard_part[:end+1] |
| 282 # Remove trailing '/' so we will match gs://bucket/abc* as well as |
| 283 # gs://bucket/abc*/ with the same wildcard regex. |
| 284 prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part) |
| 285 suffix_wildcard = wildcard[match.end():] |
| 286 end = suffix_wildcard.find('/') |
| 287 if end == -1: |
| 288 suffix_wildcard = '' |
| 289 else: |
| 290 suffix_wildcard = suffix_wildcard[end+1:] |
| 291 # To implement recursive (**) wildcarding: if prefix_wildcard contains
| 292 # '**', don't send a delimiter, and append suffix_wildcard to the end
| 293 # of prefix_wildcard.
| 294 if prefix_wildcard.find('**') != -1: |
| 295 delimiter = None |
| 296 prefix_wildcard += suffix_wildcard |
| 297 suffix_wildcard = '' |
| 298 else: |
| 299 delimiter = '/' |
| 300 # The following debug output is useful for tracing how the algorithm |
| 301 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt |
| 302 if self.debug > 1: |
| 303 sys.stderr.write( |
| 304 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, ' |
| 305 'prefix_wildcard=%s, suffix_wildcard=%s\n' % |
| 306 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard)) |
| 307 return (prefix, delimiter, prefix_wildcard, suffix_wildcard) |
| 308 |
| 309 def _SingleVersionMatches(self, listed_generation): |
| 310 decoded_generation = GenerationFromUrlAndString(self.wildcard_url, |
| 311 listed_generation) |
| 312 return str(self.wildcard_url.generation) == str(decoded_generation) |
| 313 |
| 314 def _ExpandBucketWildcards(self, bucket_fields=None): |
| 315 """Expands bucket and provider wildcards. |
| 316 |
| 317 Builds a list of bucket url strings that can be iterated on. |
| 318 |
| 319 Args: |
| 320 bucket_fields: If present, populate only these metadata fields for |
| 321 buckets. Example value: ['acl', 'defaultObjectAcl'] |
| 322 |
| 323 Yields: |
| 324 BucketListingRefs of type BUCKET.
| 325 """ |
| 326 bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string) |
| 327 if (bucket_fields and set(bucket_fields) == set(['id']) and |
| 328 not ContainsWildcard(self.wildcard_url.bucket_name)): |
| 329 # If we just want the name of a non-wildcarded bucket URL, |
| 330 # don't make an RPC. |
| 331 yield BucketListingBucket(bucket_url) |
| 332 elif (self.wildcard_url.IsBucket() and
| 333 not ContainsWildcard(self.wildcard_url.bucket_name)): |
| 334 # If we have a non-wildcarded bucket URL, get just that bucket. |
| 335 yield BucketListingBucket( |
| 336 bucket_url, root_object=self.gsutil_api.GetBucket( |
| 337 self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme, |
| 338 fields=bucket_fields)) |
| 339 else: |
| 340 regex = fnmatch.translate(self.wildcard_url.bucket_name) |
| 341 prog = re.compile(regex) |
| 342 |
| 343 fields = self._GetToListFields(bucket_fields) |
| 344 if fields: |
| 345 fields.add('items/id') |
| 346 for bucket in self.gsutil_api.ListBuckets( |
| 347 fields=fields, project_id=self.project_id, |
| 348 provider=self.wildcard_url.scheme): |
| 349 if prog.match(bucket.id): |
| 350 url = StorageUrlFromString( |
| 351 '%s://%s/' % (self.wildcard_url.scheme, bucket.id)) |
| 352 yield BucketListingBucket(url, root_object=bucket) |
| 353 |
| 354 def _GetToListFields(self, get_fields=None): |
| 355 """Prepends 'items/' to the input fields and converts it to a set. |
| 356 |
| 357 This way field sets requested for GetBucket can be used in ListBucket calls. |
| 358 Note that the input set must contain only bucket or object fields; listing |
| 359 fields such as prefixes or nextPageToken should be added after calling |
| 360 this function. |
| 361 |
| 362 Args: |
| 363 get_fields: Iterable fields usable in GetBucket/GetObject calls. |
| 364 |
| 365 Returns: |
| 366 Set of fields usable in ListBuckets/ListObjects calls. |
| 367 """ |
| 368 if get_fields: |
| 369 list_fields = set() |
| 370 for field in get_fields: |
| 371 list_fields.add('items/' + field) |
| 372 return list_fields |
| 373 |
| 374 def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False): |
| 375 """Creates a BucketListingRef of type OBJECT from the arguments. |
| 376 |
| 377 Args: |
| 378 bucket_url_string: Wildcardless string describing the containing bucket. |
| 379 gcs_object: gsutil_api root Object for populating the BucketListingRef. |
| 380 with_version: If true, return a reference with a versioned string. |
| 381 |
| 382 Returns: |
| 383 BucketListingRef of type OBJECT. |
| 384 """ |
| 385 # Generation can be None in test mocks, so just return the |
| 386 # live object for simplicity. |
| 387 if with_version and gcs_object.generation is not None: |
| 388 generation_str = GenerationFromUrlAndString(self.wildcard_url, |
| 389 gcs_object.generation) |
| 390 object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name, |
| 391 generation_str) |
| 392 else: |
| 393 object_string = '%s%s' % (bucket_url_string, gcs_object.name) |
| 394 object_url = StorageUrlFromString(object_string) |
| 395 return BucketListingObject(object_url, root_object=gcs_object) |
| 396 |
| 397 def _GetPrefixRef(self, bucket_url_string, prefix): |
| 398 """Creates a BucketListingRef of type PREFIX from the arguments. |
| 399 |
| 400 Args: |
| 401 bucket_url_string: Wildcardless string describing the containing bucket. |
| 402 prefix: gsutil_api Prefix for populating the BucketListingRef |
| 403 |
| 404 Returns: |
| 405 BucketListingRef of type PREFIX. |
| 406 """ |
| 407 prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix)) |
| 408 return BucketListingPrefix(prefix_url, root_object=prefix) |
| 409 |
| 410 def IterBuckets(self, bucket_fields=None): |
| 411 """Iterates over the wildcard, returning refs for each expanded bucket. |
| 412 |
| 413 This ignores the object part of the URL entirely and expands only the |
| 414 bucket portion. It will yield BucketListingRefs of type BUCKET only.
| 415 |
| 416 Args: |
| 417 bucket_fields: Iterable fields to include in bucket listings. |
| 418 Ex. ['defaultObjectAcl', 'logging']. This function is |
| 419 responsible for converting these to listing-style |
| 420 format ['items/defaultObjectAcl', 'items/logging'], as |
| 421 well as adding any fields necessary for listing such as |
| 422 'items/id'. API implementation is responsible for
| 423 adding pagination fields. If this is None, all fields are |
| 424 returned. |
| 425 |
| 426 Yields: |
| 427 BucketListingRef of type BUCKET, or empty iterator if no matches. |
| 428 """ |
| 429 for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields): |
| 430 yield blr |
| 431 |
| 432 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): |
| 433 """Iterates over the wildcard, yielding bucket, prefix or object refs. |
| 434 |
| 435 Args: |
| 436 bucket_listing_fields: If present, populate only these metadata |
| 437 fields for listed objects. |
| 438 expand_top_level_buckets: If true and the wildcard expands only to |
| 439 Bucket(s), yields the expansion of each bucket |
| 440 into a top-level listing of prefixes and objects |
| 441 in that bucket instead of a BucketListingRef |
| 442 to that bucket. |
| 443 |
| 444 Yields: |
| 445 BucketListingRef, or empty iterator if no matches. |
| 446 """ |
| 447 for blr in self.__iter__( |
| 448 bucket_listing_fields=bucket_listing_fields, |
| 449 expand_top_level_buckets=expand_top_level_buckets): |
| 450 yield blr |
| 451 |
| 452 def IterObjects(self, bucket_listing_fields=None): |
| 453 """Iterates over the wildcard, yielding only object BucketListingRefs. |
| 454 |
| 455 Args: |
| 456 bucket_listing_fields: If present, populate only these metadata |
| 457 fields for listed objects. |
| 458 |
| 459 Yields: |
| 460 BucketListingRefs of type OBJECT or empty iterator if no matches. |
| 461 """ |
| 462 for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields, |
| 463 expand_top_level_buckets=True): |
| 464 if blr.IsObject(): |
| 465 yield blr |
| 466 |
| 467 |
| 468 class FileWildcardIterator(WildcardIterator): |
| 469 """WildcardIterator subclass for files and directories. |
| 470 |
| 471 If you use recursive wildcards ('**') only a single such wildcard is |
| 472 supported. For example you could use the wildcard '**/*.txt' to list all .txt |
| 473 files in any subdirectory of the current directory, but you couldn't use a |
| 474 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt |
| 475 files in any subdirectory named 'abc'). |
| 476 """ |
| 477 |
| 478 def __init__(self, wildcard_url, debug=0): |
| 479 """Instantiates an iterator over BucketListingRefs matching wildcard URL. |
| 480 |
| 481 Args: |
| 482 wildcard_url: FileUrl that contains the wildcard to iterate. |
| 483 debug: Debug level (range 0..3). |
| 484 """ |
| 485 self.wildcard_url = wildcard_url |
| 486 self.debug = debug |
| 487 |
| 488 def __iter__(self): |
| 489 """Iterator that gets called when iterating over the file wildcard. |
| 490 |
| 491 In the case where no wildcard is present, returns a single matching file |
| 492 or directory. |
| 493 |
| 494 Raises: |
| 495 WildcardException: if invalid wildcard found. |
| 496 |
| 497 Yields: |
| 498 BucketListingRef of type OBJECT (for files) or PREFIX (for directories) |
| 499 """ |
| 500 wildcard = self.wildcard_url.object_name |
| 501 match = FLAT_LIST_REGEX.match(wildcard) |
| 502 if match: |
| 503 # Recursive wildcarding request ('.../**/...'). |
| 504 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*' |
| 505 base_dir = match.group('before')[:-1] |
| 506 remaining_wildcard = match.group('after') |
| 507 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and |
| 508 # remaining_wildcard = '/*' |
| 509 if remaining_wildcard.startswith('*'): |
| 510 raise WildcardException('Invalid wildcard with more than 2 consecutive ' |
| 511 '*s (%s)' % wildcard) |
| 512 # If there was no remaining wildcard past the recursive wildcard, |
| 513 # treat it as if it were a '*'. For example, file://tmp/** is equivalent |
| 514 # to file://tmp/**/* |
| 515 if not remaining_wildcard: |
| 516 remaining_wildcard = '*' |
| 517 # Skip slash(es). |
| 518 remaining_wildcard = remaining_wildcard.lstrip(os.sep) |
| 519 filepaths = self._IterDir(base_dir, remaining_wildcard) |
| 520 else: |
| 521 # Not a recursive wildcarding request. |
| 522 filepaths = glob.iglob(wildcard) |
| 523 for filepath in filepaths: |
| 524 expanded_url = StorageUrlFromString(filepath) |
| 525 if os.path.isdir(filepath): |
| 526 yield BucketListingPrefix(expanded_url) |
| 527 else: |
| 528 yield BucketListingObject(expanded_url) |
| 529 |
| 530 def _IterDir(self, directory, wildcard): |
| 531 """An iterator over the specified dir and wildcard.""" |
| 532 # UTF8-encode directory before passing it to os.walk() so if there are |
| 533 # non-valid UTF8 chars in the file name (e.g., that can happen if the file |
| 534 # originated on Windows) os.walk() will not attempt to decode and then die |
| 535 # with a "codec can't decode byte" error, and instead we can catch the error |
| 536 # at yield time and print a more informative error message. |
| 537 for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)): |
| 538 for f in fnmatch.filter(filenames, wildcard): |
| 539 try: |
| 540 yield os.path.join(dirpath, f).decode(UTF8) |
| 541 except UnicodeDecodeError: |
| 542 # Note: We considered several ways to deal with this, but each had |
| 543 # problems: |
| 544 # 1. Raise an exception and try to catch in a higher layer (the |
| 545 # gsutil cp command), so we can properly support the gsutil cp -c |
| 546 # option. That doesn't work because raising an exception during |
| 547 # iteration terminates the generator. |
| 548 # 2. Accumulate a list of bad filenames and skip processing each |
| 549 # during iteration, then raise at the end, with exception text |
| 550 # printing the bad paths. That doesn't work because iteration is |
| 551 # wrapped in PluralityCheckableIterator, so it's possible there |
| 552 # are not-yet-performed copy operations at the time we reach the |
| 553 # end of the iteration and raise the exception - which would cause |
| 554 # us to skip copying validly named files. Moreover, the gsutil |
| 555 # cp command loops over argv, so if you run the command gsutil cp |
| 556 # -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1 |
| 557 # would cause dir2 never to be visited. |
| 558 # 3. Print the invalid pathname and skip it during iteration. That |
| 559 # would work but would mean gsutil cp could exit with status 0 |
| 560 # even though some files weren't copied. |
| 561 # 4. Change the WildcardIterator to include an error status along with |
| 562 # the result. That would solve the problem but would be a |
| 563 # substantial change (WildcardIterator is used in many parts of |
| 564 # gsutil), and we didn't feel that magnitude of change was |
| 565 # warranted by this relatively uncommon corner case. |
| 566 # Instead we chose to abort when one such file is encountered, and |
| 567 # require the user to remove or rename the files and try again. |
| 568 raise CommandException('\n'.join(textwrap.wrap( |
| 569 'Invalid Unicode path encountered (%s). gsutil cannot proceed ' |
| 570 'with such files present. Please remove or rename this file and ' |
| 571 'try again. NOTE: the path printed above replaces the ' |
| 572 'problematic characters with a hex-encoded printable ' |
| 573 'representation. For more details (including how to convert to a ' |
| 574 'gsutil-compatible encoding) see `gsutil help encoding`.' % |
| 575 repr(os.path.join(dirpath, f))))) |
| 576 |
| 577 # pylint: disable=unused-argument |
| 578 def IterObjects(self, bucket_listing_fields=None): |
| 579 """Iterates over the wildcard, yielding only object (file) refs. |
| 580 |
| 581 Args: |
| 582 bucket_listing_fields: Ignored as filesystems don't have buckets. |
| 583 |
| 584 Yields: |
| 585 BucketListingRefs of type OBJECT or empty iterator if no matches. |
| 586 """ |
| 587 for bucket_listing_ref in self.IterAll(): |
| 588 if bucket_listing_ref.IsObject(): |
| 589 yield bucket_listing_ref |
| 590 |
| 591 # pylint: disable=unused-argument |
| 592 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): |
| 593 """Iterates over the wildcard, yielding BucketListingRefs. |
| 594 |
| 595 Args: |
| 596 bucket_listing_fields: Ignored; filesystems don't have buckets. |
| 597 expand_top_level_buckets: Ignored; filesystems don't have buckets. |
| 598 |
| 599 Yields: |
| 600 BucketListingRefs of type OBJECT (file) or PREFIX (directory), |
| 601 or empty iterator if no matches. |
| 602 """ |
| 603 for bucket_listing_ref in self.__iter__(): |
| 604 yield bucket_listing_ref |
| 605 |
| 606 def IterBuckets(self, unused_bucket_fields=None): |
| 607 """Placeholder to allow polymorphic use of WildcardIterator. |
| 608 |
| 609 Args: |
| 610 unused_bucket_fields: Ignored; filesystems don't have buckets. |
| 611 |
| 612 Raises: |
| 613 WildcardException: in all cases. |
| 614 """ |
| 615 raise WildcardException( |
| 616 'Iterating over Buckets not possible for file wildcards') |
| 617 |
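A brief illustrative sketch: expanding a local recursive wildcard with the
FileWildcardIterator above. The path shown is a placeholder, not taken from
this module.

  # Illustrative only -- the path is a placeholder.
  file_iterator = FileWildcardIterator(
      StorageUrlFromString('/tmp/photos/**/*.jpg'))
  for ref in file_iterator.IterObjects():
    print ref.url_string  # One BucketListingObject per matching file.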
| 618 |
| 619 class WildcardException(StandardError): |
| 620 """Exception raised for invalid wildcard URLs.""" |
| 621 |
| 622 def __init__(self, reason): |
| 623 StandardError.__init__(self) |
| 624 self.reason = reason |
| 625 |
| 626 def __repr__(self): |
| 627 return 'WildcardException: %s' % self.reason |
| 628 |
| 629 def __str__(self): |
| 630 return 'WildcardException: %s' % self.reason |
| 631 |
| 632 |
| 633 def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0, |
| 634 project_id=None): |
| 635 """Instantiate a WildcardIterator for the given URL string. |
| 636 |
| 637 Args: |
| 638 url_str: URL string naming wildcard object(s) to iterate. |
| 639 gsutil_api: Cloud storage interface. Passed in for thread safety, also |
| 640 settable for testing/mocking. |
| 641 all_versions: If true, the iterator yields all versions of objects |
| 642 matching the wildcard. If false, yields just the live |
| 643 object version. |
| 644 debug: Debug level to control debug output for iterator. |
| 645 project_id: Project ID to use for bucket listings.
| 646 |
| 647 Returns: |
| 648 A WildcardIterator that handles the requested iteration. |
| 649 """ |
| 650 |
| 651 url = StorageUrlFromString(url_str) |
| 652 if url.IsFileUrl(): |
| 653 return FileWildcardIterator(url, debug=debug) |
| 654 else: # Cloud URL |
| 655 return CloudWildcardIterator( |
| 656 url, gsutil_api, all_versions=all_versions, debug=debug, |
| 657 project_id=project_id) |
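A minimal usage sketch for the factory above, assuming an already-constructed
CloudApi implementation named gsutil_api (not defined here) and a placeholder
URL:

  # Illustrative only: 'gsutil_api' is an assumed CloudApi implementation.
  iterator = CreateWildcardIterator('gs://my-bucket/abc/*.txt', gsutil_api)
  for blr in iterator.IterAll(bucket_listing_fields=['name', 'size'],
                              expand_top_level_buckets=True):
    print blr.url_string  # Each matching object or prefix.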