Index: tools/telemetry/third_party/gsutil/gslib/wildcard_iterator.py |
diff --git a/tools/telemetry/third_party/gsutil/gslib/wildcard_iterator.py b/tools/telemetry/third_party/gsutil/gslib/wildcard_iterator.py |
deleted file mode 100644 |
index 57b4f638a4bfe7b788ec580b289cc011c315222f..0000000000000000000000000000000000000000 |
--- a/tools/telemetry/third_party/gsutil/gslib/wildcard_iterator.py |
+++ /dev/null |
@@ -1,657 +0,0 @@ |
-# -*- coding: utf-8 -*- |
-# Copyright 2010 Google Inc. All Rights Reserved. |
-# |
-# Licensed under the Apache License, Version 2.0 (the "License"); |
-# you may not use this file except in compliance with the License. |
-# You may obtain a copy of the License at |
-# |
-# http://www.apache.org/licenses/LICENSE-2.0 |
-# |
-# Unless required by applicable law or agreed to in writing, software |
-# distributed under the License is distributed on an "AS IS" BASIS, |
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
-# See the License for the specific language governing permissions and |
-# limitations under the License. |
-"""Wildcard iterator class and supporting functions.""" |
- |
-from __future__ import absolute_import |
- |
-import fnmatch |
-import glob |
-import os |
-import re |
-import sys |
-import textwrap |
- |
-from gslib.bucket_listing_ref import BucketListingBucket |
-from gslib.bucket_listing_ref import BucketListingObject |
-from gslib.bucket_listing_ref import BucketListingPrefix |
-from gslib.cloud_api import AccessDeniedException |
-from gslib.cloud_api import CloudApi |
-from gslib.cloud_api import NotFoundException |
-from gslib.exception import CommandException |
-from gslib.storage_url import ContainsWildcard |
-from gslib.storage_url import StorageUrlFromString |
-from gslib.storage_url import StripOneSlash |
-from gslib.storage_url import WILDCARD_REGEX |
-from gslib.translation_helper import GenerationFromUrlAndString |
-from gslib.util import UTF8 |
- |
- |
-FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)') |
- |
- |
-class WildcardIterator(object): |
- """Class for iterating over Google Cloud Storage strings containing wildcards. |
- |
- The base class is abstract; you should instantiate using the |
- wildcard_iterator() static factory method, which chooses the right |
- implementation depending on the base string. |
- """ |
- |
- # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both |
- # and make one return the other. |
- def __repr__(self): |
- """Returns string representation of WildcardIterator.""" |
- return 'WildcardIterator(%s)' % self.wildcard_url.url_string |
- |
- |
-class CloudWildcardIterator(WildcardIterator): |
- """WildcardIterator subclass for buckets, bucket subdirs and objects. |
- |
-  Iterates over BucketListingRefs matching the URL string wildcard. It's
-  much more efficient to fetch metadata that's available in the bucket
-  listing (for example, the name and size of each object), because that
-  information is already included in the object list results.
- """ |
- |
- def __init__(self, wildcard_url, gsutil_api, all_versions=False, |
- debug=0, project_id=None): |
- """Instantiates an iterator that matches the wildcard URL. |
- |
- Args: |
- wildcard_url: CloudUrl that contains the wildcard to iterate. |
- gsutil_api: Cloud storage interface. Passed in for thread safety, also |
- settable for testing/mocking. |
- all_versions: If true, the iterator yields all versions of objects |
- matching the wildcard. If false, yields just the live |
- object version. |
- debug: Debug level to control debug output for iterator. |
- project_id: Project ID to use for bucket listings. |
- """ |
- self.wildcard_url = wildcard_url |
- self.all_versions = all_versions |
- self.debug = debug |
- self.gsutil_api = gsutil_api |
- self.project_id = project_id |
- |
- def __iter__(self, bucket_listing_fields=None, |
- expand_top_level_buckets=False): |
- """Iterator that gets called when iterating over the cloud wildcard. |
- |
- In the case where no wildcard is present, returns a single matching object, |
- single matching prefix, or one of each if both exist. |
- |
- Args: |
- bucket_listing_fields: Iterable fields to include in bucket listings. |
- Ex. ['name', 'acl']. Iterator is |
- responsible for converting these to list-style |
- format ['items/name', 'items/acl'] as well as |
- adding any fields necessary for listing such as |
-                             prefixes. API implementation is responsible for
- adding pagination fields. If this is None, |
- all fields are returned. |
- expand_top_level_buckets: If true, yield no BUCKET references. Instead, |
- expand buckets into top-level objects and |
- prefixes. |
- |
- Yields: |
- BucketListingRef of type BUCKET, OBJECT or PREFIX. |
- """ |
- single_version_request = self.wildcard_url.HasGeneration() |
- |
- # For wildcard expansion purposes, we need at a minimum the name of |
- # each object and prefix. If we're not using the default of requesting |
- # all fields, make sure at least these are requested. The Cloud API |
- # tolerates specifying the same field twice. |
- get_fields = None |
- if bucket_listing_fields: |
- get_fields = set() |
- for field in bucket_listing_fields: |
- get_fields.add(field) |
- bucket_listing_fields = self._GetToListFields( |
- get_fields=bucket_listing_fields) |
- bucket_listing_fields.update(['items/name', 'prefixes']) |
- get_fields.update(['name']) |
- # If we're making versioned requests, ensure generation and |
- # metageneration are also included. |
- if single_version_request or self.all_versions: |
- bucket_listing_fields.update(['items/generation', |
- 'items/metageneration']) |
- get_fields.update(['generation', 'metageneration']) |
- |
- # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then |
- # iterate over the expanded bucket strings and handle any object |
- # wildcarding. |
- for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']): |
- bucket_url_string = bucket_listing_ref.url_string |
- if self.wildcard_url.IsBucket(): |
- # IsBucket() guarantees there are no prefix or object wildcards, and |
- # thus this is a top-level listing of buckets. |
- if expand_top_level_buckets: |
- url = StorageUrlFromString(bucket_url_string) |
- for obj_or_prefix in self.gsutil_api.ListObjects( |
- url.bucket_name, delimiter='/', all_versions=self.all_versions, |
- provider=self.wildcard_url.scheme, |
- fields=bucket_listing_fields): |
- if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: |
- yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data, |
- with_version=self.all_versions) |
- else: # CloudApi.CsObjectOrPrefixType.PREFIX: |
- yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data) |
- else: |
- yield bucket_listing_ref |
- else: |
- # By default, assume a non-wildcarded URL is an object, not a prefix. |
- # This prevents unnecessary listings (which are slower, more expensive, |
- # and also subject to eventual consistency). |
- if (not ContainsWildcard(self.wildcard_url.url_string) and |
- self.wildcard_url.IsObject() and not self.all_versions): |
- try: |
- get_object = self.gsutil_api.GetObjectMetadata( |
- self.wildcard_url.bucket_name, |
- self.wildcard_url.object_name, |
- generation=self.wildcard_url.generation, |
- provider=self.wildcard_url.scheme, |
- fields=get_fields) |
- yield self._GetObjectRef( |
- self.wildcard_url.bucket_url_string, get_object, |
- with_version=(self.all_versions or single_version_request)) |
- return |
- except (NotFoundException, AccessDeniedException): |
- # It's possible this is a prefix - try to list instead. |
- pass |
- |
- # Expand iteratively by building prefix/delimiter bucket listing |
- # request, filtering the results per the current level's wildcard |
- # (if present), and continuing with the next component of the |
- # wildcard. See _BuildBucketFilterStrings() documentation for details. |
- if single_version_request: |
- url_string = '%s%s#%s' % (bucket_url_string, |
- self.wildcard_url.object_name, |
- self.wildcard_url.generation) |
- else: |
- # Rstrip any prefixes to correspond with rstripped prefix wildcard |
- # from _BuildBucketFilterStrings(). |
- url_string = '%s%s' % (bucket_url_string, |
- StripOneSlash(self.wildcard_url.object_name) |
- or '/') # Cover root object named '/' case. |
- urls_needing_expansion = [url_string] |
- while urls_needing_expansion: |
- url = StorageUrlFromString(urls_needing_expansion.pop(0)) |
- (prefix, delimiter, prefix_wildcard, suffix_wildcard) = ( |
- self._BuildBucketFilterStrings(url.object_name)) |
- prog = re.compile(fnmatch.translate(prefix_wildcard)) |
- |
- # List bucket for objects matching prefix up to delimiter. |
- for obj_or_prefix in self.gsutil_api.ListObjects( |
- url.bucket_name, prefix=prefix, delimiter=delimiter, |
- all_versions=self.all_versions or single_version_request, |
- provider=self.wildcard_url.scheme, |
- fields=bucket_listing_fields): |
- if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: |
- gcs_object = obj_or_prefix.data |
- if prog.match(gcs_object.name): |
- if not suffix_wildcard or ( |
- StripOneSlash(gcs_object.name) == suffix_wildcard): |
- if not single_version_request or ( |
- self._SingleVersionMatches(gcs_object.generation)): |
- yield self._GetObjectRef( |
- bucket_url_string, gcs_object, with_version=( |
- self.all_versions or single_version_request)) |
- else: # CloudApi.CsObjectOrPrefixType.PREFIX |
- prefix = obj_or_prefix.data |
- # If the prefix ends with a slash, remove it. Note that we only |
- # remove one slash so that we can successfully enumerate dirs |
- # containing multiple slashes. |
- rstripped_prefix = StripOneSlash(prefix) |
- if prog.match(rstripped_prefix): |
- if suffix_wildcard and rstripped_prefix != suffix_wildcard: |
- # There's more wildcard left to expand. |
- url_append_string = '%s%s' % ( |
- bucket_url_string, rstripped_prefix + '/' + |
- suffix_wildcard) |
- urls_needing_expansion.append(url_append_string) |
- else: |
- # No wildcard to expand, just yield the prefix |
- yield self._GetPrefixRef(bucket_url_string, prefix) |
- |
- def _BuildBucketFilterStrings(self, wildcard): |
- """Builds strings needed for querying a bucket and filtering results. |
- |
- This implements wildcard object name matching. |
- |
- Args: |
- wildcard: The wildcard string to match to objects. |
- |
- Returns: |
- (prefix, delimiter, prefix_wildcard, suffix_wildcard) |
- where: |
- prefix is the prefix to be sent in bucket GET request. |
- delimiter is the delimiter to be sent in bucket GET request. |
- prefix_wildcard is the wildcard to be used to filter bucket GET results. |
-        suffix_wildcard is the wildcard to be appended to filtered bucket GET
-          results for the next wildcard expansion iteration.
- For example, given the wildcard gs://bucket/abc/d*e/f*.txt we |
-      would build prefix=abc/d, delimiter=/, prefix_wildcard=d*e, and
- suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket |
- listing request will then produce a listing result set that can be |
- filtered using this prefix_wildcard; and we'd use this suffix_wildcard |
- to feed into the next call(s) to _BuildBucketFilterStrings(), for the |
- next iteration of listing/filtering. |
- |
- Raises: |
- AssertionError if wildcard doesn't contain any wildcard chars. |
- """ |
- # Generate a request prefix if the object name part of the wildcard starts |
- # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz'). |
- match = WILDCARD_REGEX.search(wildcard) |
- if not match: |
- # Input "wildcard" has no wildcard chars, so just return tuple that will |
- # cause a bucket listing to match the given input wildcard. Example: if |
- # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc, |
- # the next iteration will call _BuildBucketFilterStrings() with |
- # gs://bucket/dir/abc, and we will return prefix ='dir/abc', |
- # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''. |
- prefix = wildcard |
- delimiter = '/' |
- prefix_wildcard = wildcard |
- suffix_wildcard = '' |
- else: |
- if match.start() > 0: |
- # Wildcard does not occur at beginning of object name, so construct a |
- # prefix string to send to server. |
- prefix = wildcard[:match.start()] |
- wildcard_part = wildcard[match.start():] |
- else: |
- prefix = None |
- wildcard_part = wildcard |
- end = wildcard_part.find('/') |
- if end != -1: |
- wildcard_part = wildcard_part[:end+1] |
- # Remove trailing '/' so we will match gs://bucket/abc* as well as |
- # gs://bucket/abc*/ with the same wildcard regex. |
- prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part) |
- suffix_wildcard = wildcard[match.end():] |
- end = suffix_wildcard.find('/') |
- if end == -1: |
- suffix_wildcard = '' |
- else: |
- suffix_wildcard = suffix_wildcard[end+1:] |
-      # To implement recursive (**) wildcarding, if prefix_wildcard contains
-      # '**', don't send a delimiter, and append suffix_wildcard to the end
-      # of prefix_wildcard.
- if prefix_wildcard.find('**') != -1: |
- delimiter = None |
- prefix_wildcard += suffix_wildcard |
- suffix_wildcard = '' |
- else: |
- delimiter = '/' |
- # The following debug output is useful for tracing how the algorithm |
- # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt |
- if self.debug > 1: |
- sys.stderr.write( |
- 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, ' |
- 'prefix_wildcard=%s, suffix_wildcard=%s\n' % |
- (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard)) |
- return (prefix, delimiter, prefix_wildcard, suffix_wildcard) |
- |
- def _SingleVersionMatches(self, listed_generation): |
- decoded_generation = GenerationFromUrlAndString(self.wildcard_url, |
- listed_generation) |
- return str(self.wildcard_url.generation) == str(decoded_generation) |
- |
- def _ExpandBucketWildcards(self, bucket_fields=None): |
- """Expands bucket and provider wildcards. |
- |
- Builds a list of bucket url strings that can be iterated on. |
- |
- Args: |
- bucket_fields: If present, populate only these metadata fields for |
- buckets. Example value: ['acl', 'defaultObjectAcl'] |
- |
- Yields: |
-      BucketListingRefs of type BUCKET.
- """ |
- bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string) |
- if (bucket_fields and set(bucket_fields) == set(['id']) and |
- not ContainsWildcard(self.wildcard_url.bucket_name)): |
- # If we just want the name of a non-wildcarded bucket URL, |
- # don't make an RPC. |
- yield BucketListingBucket(bucket_url) |
-    elif (self.wildcard_url.IsBucket() and
-          not ContainsWildcard(self.wildcard_url.bucket_name)):
- # If we have a non-wildcarded bucket URL, get just that bucket. |
- yield BucketListingBucket( |
- bucket_url, root_object=self.gsutil_api.GetBucket( |
- self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme, |
- fields=bucket_fields)) |
- else: |
- regex = fnmatch.translate(self.wildcard_url.bucket_name) |
- prog = re.compile(regex) |
- |
- fields = self._GetToListFields(bucket_fields) |
- if fields: |
- fields.add('items/id') |
- for bucket in self.gsutil_api.ListBuckets( |
- fields=fields, project_id=self.project_id, |
- provider=self.wildcard_url.scheme): |
- if prog.match(bucket.id): |
- url = StorageUrlFromString( |
- '%s://%s/' % (self.wildcard_url.scheme, bucket.id)) |
- yield BucketListingBucket(url, root_object=bucket) |
- |
- def _GetToListFields(self, get_fields=None): |
- """Prepends 'items/' to the input fields and converts it to a set. |
- |
- This way field sets requested for GetBucket can be used in ListBucket calls. |
- Note that the input set must contain only bucket or object fields; listing |
- fields such as prefixes or nextPageToken should be added after calling |
- this function. |
- |
- Args: |
- get_fields: Iterable fields usable in GetBucket/GetObject calls. |
- |
- Returns: |
- Set of fields usable in ListBuckets/ListObjects calls. |
- """ |
- if get_fields: |
- list_fields = set() |
- for field in get_fields: |
- list_fields.add('items/' + field) |
- return list_fields |
- |
- def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False): |
- """Creates a BucketListingRef of type OBJECT from the arguments. |
- |
- Args: |
- bucket_url_string: Wildcardless string describing the containing bucket. |
- gcs_object: gsutil_api root Object for populating the BucketListingRef. |
- with_version: If true, return a reference with a versioned string. |
- |
- Returns: |
- BucketListingRef of type OBJECT. |
- """ |
- # Generation can be None in test mocks, so just return the |
- # live object for simplicity. |
- if with_version and gcs_object.generation is not None: |
- generation_str = GenerationFromUrlAndString(self.wildcard_url, |
- gcs_object.generation) |
- object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name, |
- generation_str) |
- else: |
- object_string = '%s%s' % (bucket_url_string, gcs_object.name) |
- object_url = StorageUrlFromString(object_string) |
- return BucketListingObject(object_url, root_object=gcs_object) |
- |
- def _GetPrefixRef(self, bucket_url_string, prefix): |
- """Creates a BucketListingRef of type PREFIX from the arguments. |
- |
- Args: |
- bucket_url_string: Wildcardless string describing the containing bucket. |
- prefix: gsutil_api Prefix for populating the BucketListingRef |
- |
- Returns: |
- BucketListingRef of type PREFIX. |
- """ |
- prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix)) |
- return BucketListingPrefix(prefix_url, root_object=prefix) |
- |
- def IterBuckets(self, bucket_fields=None): |
- """Iterates over the wildcard, returning refs for each expanded bucket. |
- |
-    This ignores the object part of the URL entirely and expands only the
-    bucket portion. It will yield BucketListingRefs of type BUCKET only.
- |
- Args: |
- bucket_fields: Iterable fields to include in bucket listings. |
- Ex. ['defaultObjectAcl', 'logging']. This function is |
- responsible for converting these to listing-style |
- format ['items/defaultObjectAcl', 'items/logging'], as |
- well as adding any fields necessary for listing such as |
-                     'items/id'. API implementation is responsible for
- adding pagination fields. If this is None, all fields are |
- returned. |
- |
- Yields: |
- BucketListingRef of type BUCKET, or empty iterator if no matches. |
- """ |
- for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields): |
- yield blr |
- |
- def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): |
- """Iterates over the wildcard, yielding bucket, prefix or object refs. |
- |
- Args: |
- bucket_listing_fields: If present, populate only these metadata |
- fields for listed objects. |
- expand_top_level_buckets: If true and the wildcard expands only to |
- Bucket(s), yields the expansion of each bucket |
- into a top-level listing of prefixes and objects |
- in that bucket instead of a BucketListingRef |
- to that bucket. |
- |
- Yields: |
- BucketListingRef, or empty iterator if no matches. |
- """ |
- for blr in self.__iter__( |
- bucket_listing_fields=bucket_listing_fields, |
- expand_top_level_buckets=expand_top_level_buckets): |
- yield blr |
- |
- def IterObjects(self, bucket_listing_fields=None): |
- """Iterates over the wildcard, yielding only object BucketListingRefs. |
- |
- Args: |
- bucket_listing_fields: If present, populate only these metadata |
- fields for listed objects. |
- |
- Yields: |
- BucketListingRefs of type OBJECT or empty iterator if no matches. |
- """ |
- for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields, |
- expand_top_level_buckets=True): |
- if blr.IsObject(): |
- yield blr |
- |
- |
-class FileWildcardIterator(WildcardIterator): |
- """WildcardIterator subclass for files and directories. |
- |
-  If you use recursive wildcards ('**'), only a single such wildcard is
-  supported. For example, you could use the wildcard '**/*.txt' to list all
-  .txt files in any subdirectory of the current directory, but you couldn't
-  use a wildcard like '**/abc/**/*.txt' (which would, if supported, let you
-  find .txt files in any subdirectory named 'abc').
- """ |
- |
- def __init__(self, wildcard_url, debug=0): |
- """Instantiates an iterator over BucketListingRefs matching wildcard URL. |
- |
- Args: |
- wildcard_url: FileUrl that contains the wildcard to iterate. |
- debug: Debug level (range 0..3). |
- """ |
- self.wildcard_url = wildcard_url |
- self.debug = debug |
- |
- def __iter__(self): |
- """Iterator that gets called when iterating over the file wildcard. |
- |
- In the case where no wildcard is present, returns a single matching file |
- or directory. |
- |
- Raises: |
- WildcardException: if invalid wildcard found. |
- |
- Yields: |
- BucketListingRef of type OBJECT (for files) or PREFIX (for directories) |
- """ |
- wildcard = self.wildcard_url.object_name |
- match = FLAT_LIST_REGEX.match(wildcard) |
- if match: |
- # Recursive wildcarding request ('.../**/...'). |
- # Example input: wildcard = '/tmp/tmp2pQJAX/**/*' |
- base_dir = match.group('before')[:-1] |
- remaining_wildcard = match.group('after') |
- # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and |
- # remaining_wildcard = '/*' |
- if remaining_wildcard.startswith('*'): |
- raise WildcardException('Invalid wildcard with more than 2 consecutive ' |
- '*s (%s)' % wildcard) |
- # If there was no remaining wildcard past the recursive wildcard, |
- # treat it as if it were a '*'. For example, file://tmp/** is equivalent |
- # to file://tmp/**/* |
- if not remaining_wildcard: |
- remaining_wildcard = '*' |
- # Skip slash(es). |
- remaining_wildcard = remaining_wildcard.lstrip(os.sep) |
- filepaths = self._IterDir(base_dir, remaining_wildcard) |
- else: |
- # Not a recursive wildcarding request. |
- filepaths = glob.iglob(wildcard) |
- for filepath in filepaths: |
- expanded_url = StorageUrlFromString(filepath) |
- if os.path.isdir(filepath): |
- yield BucketListingPrefix(expanded_url) |
- else: |
- yield BucketListingObject(expanded_url) |
- |
- def _IterDir(self, directory, wildcard): |
- """An iterator over the specified dir and wildcard.""" |
- # UTF8-encode directory before passing it to os.walk() so if there are |
- # non-valid UTF8 chars in the file name (e.g., that can happen if the file |
- # originated on Windows) os.walk() will not attempt to decode and then die |
- # with a "codec can't decode byte" error, and instead we can catch the error |
- # at yield time and print a more informative error message. |
- for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)): |
- for f in fnmatch.filter(filenames, wildcard): |
- try: |
- yield os.path.join(dirpath, f).decode(UTF8) |
- except UnicodeDecodeError: |
- # Note: We considered several ways to deal with this, but each had |
- # problems: |
- # 1. Raise an exception and try to catch in a higher layer (the |
- # gsutil cp command), so we can properly support the gsutil cp -c |
- # option. That doesn't work because raising an exception during |
- # iteration terminates the generator. |
- # 2. Accumulate a list of bad filenames and skip processing each |
- # during iteration, then raise at the end, with exception text |
- # printing the bad paths. That doesn't work because iteration is |
- # wrapped in PluralityCheckableIterator, so it's possible there |
- # are not-yet-performed copy operations at the time we reach the |
- # end of the iteration and raise the exception - which would cause |
- # us to skip copying validly named files. Moreover, the gsutil |
- # cp command loops over argv, so if you run the command gsutil cp |
- # -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1 |
- # would cause dir2 never to be visited. |
- # 3. Print the invalid pathname and skip it during iteration. That |
- # would work but would mean gsutil cp could exit with status 0 |
- # even though some files weren't copied. |
- # 4. Change the WildcardIterator to include an error status along with |
- # the result. That would solve the problem but would be a |
- # substantial change (WildcardIterator is used in many parts of |
- # gsutil), and we didn't feel that magnitude of change was |
- # warranted by this relatively uncommon corner case. |
- # Instead we chose to abort when one such file is encountered, and |
- # require the user to remove or rename the files and try again. |
- raise CommandException('\n'.join(textwrap.wrap( |
- 'Invalid Unicode path encountered (%s). gsutil cannot proceed ' |
- 'with such files present. Please remove or rename this file and ' |
- 'try again. NOTE: the path printed above replaces the ' |
- 'problematic characters with a hex-encoded printable ' |
- 'representation. For more details (including how to convert to a ' |
- 'gsutil-compatible encoding) see `gsutil help encoding`.' % |
- repr(os.path.join(dirpath, f))))) |
- |
- # pylint: disable=unused-argument |
- def IterObjects(self, bucket_listing_fields=None): |
- """Iterates over the wildcard, yielding only object (file) refs. |
- |
- Args: |
- bucket_listing_fields: Ignored as filesystems don't have buckets. |
- |
- Yields: |
- BucketListingRefs of type OBJECT or empty iterator if no matches. |
- """ |
- for bucket_listing_ref in self.IterAll(): |
- if bucket_listing_ref.IsObject(): |
- yield bucket_listing_ref |
- |
- # pylint: disable=unused-argument |
- def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): |
- """Iterates over the wildcard, yielding BucketListingRefs. |
- |
- Args: |
- bucket_listing_fields: Ignored; filesystems don't have buckets. |
- expand_top_level_buckets: Ignored; filesystems don't have buckets. |
- |
- Yields: |
- BucketListingRefs of type OBJECT (file) or PREFIX (directory), |
- or empty iterator if no matches. |
- """ |
- for bucket_listing_ref in self.__iter__(): |
- yield bucket_listing_ref |
- |
- def IterBuckets(self, unused_bucket_fields=None): |
- """Placeholder to allow polymorphic use of WildcardIterator. |
- |
- Args: |
- unused_bucket_fields: Ignored; filesystems don't have buckets. |
- |
- Raises: |
- WildcardException: in all cases. |
- """ |
- raise WildcardException( |
- 'Iterating over Buckets not possible for file wildcards') |
- |
- |
-class WildcardException(StandardError): |
- """Exception raised for invalid wildcard URLs.""" |
- |
- def __init__(self, reason): |
- StandardError.__init__(self) |
- self.reason = reason |
- |
- def __repr__(self): |
- return 'WildcardException: %s' % self.reason |
- |
- def __str__(self): |
- return 'WildcardException: %s' % self.reason |
- |
- |
-def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0, |
- project_id=None): |
- """Instantiate a WildcardIterator for the given URL string. |
- |
- Args: |
- url_str: URL string naming wildcard object(s) to iterate. |
- gsutil_api: Cloud storage interface. Passed in for thread safety, also |
- settable for testing/mocking. |
- all_versions: If true, the iterator yields all versions of objects |
- matching the wildcard. If false, yields just the live |
- object version. |
- debug: Debug level to control debug output for iterator. |
- project_id: Project id to use for bucket listings. |
- |
- Returns: |
- A WildcardIterator that handles the requested iteration. |
- """ |
- |
- url = StorageUrlFromString(url_str) |
- if url.IsFileUrl(): |
- return FileWildcardIterator(url, debug=debug) |
- else: # Cloud URL |
- return CloudWildcardIterator( |
- url, gsutil_api, all_versions=all_versions, debug=debug, |
- project_id=project_id) |