third_party/gsutil/gslib/wildcard_iterator.py - Issue 2280023003: depot_tools: Remove third_party/gsutil

Unified Diff: third_party/gsutil/gslib/wildcard_iterator.py

Issue 2280023003: depot_tools: Remove third_party/gsutil (Closed)

Patch Set: Created 4 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/gsutil/gslib/wildcard_iterator.py

diff --git a/third_party/gsutil/gslib/wildcard_iterator.py b/third_party/gsutil/gslib/wildcard_iterator.py

deleted file mode 100644

index 97e7bc7a9793673642abd87c23b3f7ab5d575506..0000000000000000000000000000000000000000

--- a/third_party/gsutil/gslib/wildcard_iterator.py

+++ /dev/null

@@ -1,498 +0,0 @@

-# Permission is hereby granted, free of charge, to any person obtaining a

-# copy of this software and associated documentation files (the

-# "Software"), to deal in the Software without restriction, including

-# without limitation the rights to use, copy, modify, merge, publish, dis-

-# tribute, sublicense, and/or sell copies of the Software, and to permit

-# persons to whom the Software is furnished to do so, subject to the fol-

-# lowing conditions:

-# The above copyright notice and this permission notice shall be included

-# in all copies or substantial portions of the Software.

-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS

-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-

-# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT

-# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,

-# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS

-# IN THE SOFTWARE.

-"""Implementation of wildcarding over StorageUris.

-StorageUri is an abstraction that Google introduced in the boto library,

-for representing storage provider-independent bucket and object names with

-a shorthand URI-like syntax (see boto/boto/storage_uri.py). The current

-class provides wildcarding support for StorageUri objects (including both

-bucket and file system objects), allowing one to express collections of

-objects with syntax like the following:

- gs://mybucket/images/*.png

- file:///tmp/???abc???

-We provide wildcarding support as part of gsutil rather than as part

-of boto because wildcarding is really part of shell command-like

-functionality.

-A comment about wildcard semantics: We support both single path component

-wildcards (e.g., using '*') and recursive wildcards (using '**'), for both

-file and cloud URIs. For example,

- gs://bucket/doc/*/*.html

-would enumerate HTML files one directory down from gs://bucket/doc, while

- gs://bucket/**/*.html

-would enumerate HTML files in all objects contained in the bucket.

-Note also that if you use file system wildcards it's likely your shell

-interprets the wildcarding before passing the command to gsutil. For example:

- % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse

-would likely be expanded by the shell into the following before running gsutil:

- % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse

-Note also that most shells don't support '**' wildcarding (I think only

-zsh does). If you want to use '**' wildcarding with such a shell you can

-single quote each wildcarded string, so it gets passed uninterpreted by the

-shell to gsutil (at which point gsutil will perform the wildcarding expansion):

- % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse

-"""

-import boto

-import fnmatch

-import glob

-import os

-import re

-import sys

-import urllib

-from boto.s3.prefix import Prefix

-from boto.storage_uri import BucketStorageUri

-from bucket_listing_ref import BucketListingRef

-# Regex to determine if a string contains any wildcards.

-WILDCARD_REGEX = re.compile('[*?\[\]]')

-WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'

-WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'

-class WildcardIterator(object):

- """Base class for wildcarding over StorageUris.

- This class implements support for iterating over StorageUris that

- contain wildcards.

- The base class is abstract; you should instantiate using the

- wildcard_iterator() static factory method, which chooses the right

- implementation depending on the StorageUri.

- """

- def __repr__(self):

- """Returns string representation of WildcardIterator."""

- return 'WildcardIterator(%s)' % self.wildcard_uri

-class CloudWildcardIterator(WildcardIterator):

- """WildcardIterator subclass for buckets and objects.

- Iterates over BucketListingRef matching the StorageUri wildcard. It's

- much more efficient to request the Key from the BucketListingRef (via

- GetKey()) than to request the StorageUri and then call uri.get_key()

- to retrieve the key, for cases where you want to get metadata that's

- available in the Bucket (for example to get the name and size of

- each object), because that information is available in the bucket GET

- results. If you were to iterate over URIs for such cases and then get

- the name and size info from each resulting StorageUri, it would cause

- an additional object GET request for each of the result URIs.

- """

- def __init__(self, wildcard_uri, proj_id_handler,

- bucket_storage_uri_class=BucketStorageUri, all_versions=False,

- headers=None, debug=0):

- """

- Instantiates an iterator over BucketListingRef matching given wildcard URI.

- Args:

- wildcard_uri: StorageUri that contains the wildcard to iterate.

- proj_id_handler: ProjectIdHandler to use for current command.

- bucket_storage_uri_class: BucketStorageUri interface.

- Settable for testing/mocking.

- headers: Dictionary containing optional HTTP headers to pass to boto.

- debug: Debug level to pass in to boto connection (range 0..3).

- """

- self.wildcard_uri = wildcard_uri

- # Make a copy of the headers so any updates we make during wildcard

- # expansion aren't left in the input params (specifically, so we don't

- # include the x-goog-project-id header needed by a subset of cases, in

- # the data returned to caller, which could then be used in other cases

- # where that header must not be passed).

- if headers is None:

- self.headers = {}

- else:

- self.headers = headers.copy()

- self.proj_id_handler = proj_id_handler

- self.bucket_storage_uri_class = bucket_storage_uri_class

- self.all_versions = all_versions

- self.debug = debug

- def __iter__(self):

- """Python iterator that gets called when iterating over cloud wildcard.

- Yields:

- BucketListingRef, or empty iterator if no matches.

- """

- # First handle bucket wildcarding, if any.

- if ContainsWildcard(self.wildcard_uri.bucket_name):

- regex = fnmatch.translate(self.wildcard_uri.bucket_name)

- bucket_uris = []

- prog = re.compile(regex)

- self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR,

- self.wildcard_uri,

- self.headers)

- for b in self.wildcard_uri.get_all_buckets(headers=self.headers):

- if prog.match(b.name):

- # Use str(b.name) because get_all_buckets() returns Unicode

- # string, which when used to construct x-goog-copy-src metadata

- # requests for object-to-object copies causes pathname '/' chars

- # to be entity-encoded (bucket%2Fdir instead of bucket/dir),

- # which causes the request to fail.

- uri_str = '%s://%s' % (self.wildcard_uri.scheme,

- urllib.quote_plus(str(b.name)))

- bucket_uris.append(

- boto.storage_uri(

- uri_str, debug=self.debug,

- bucket_storage_uri_class=self.bucket_storage_uri_class,

- suppress_consec_slashes=False))

- else:

- bucket_uris = [self.wildcard_uri.clone_replace_name('')]

- # Now iterate over bucket(s), and handle object wildcarding, if any.

- self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR,

- self.wildcard_uri,

- self.headers)

- for bucket_uri in bucket_uris:

- if self.wildcard_uri.names_bucket():

- # Bucket-only URI.

- yield BucketListingRef(bucket_uri, key=None, prefix=None,

- headers=self.headers)

- else:

- # URI contains an object name. If there's no wildcard just yield

- # the needed URI.

- if not ContainsWildcard(self.wildcard_uri.object_name):

- uri_to_yield = bucket_uri.clone_replace_name(

- self.wildcard_uri.object_name)

- yield BucketListingRef(uri_to_yield, key=None, prefix=None,

- headers=self.headers)

- else:

- # URI contains a wildcard. Expand iteratively by building

- # prefix/delimiter bucket listing request, filtering the results per

- # the current level's wildcard, and continuing with the next component

- # of the wildcard. See _BuildBucketFilterStrings() documentation

- # for details.

- #

- # Initialize the iteration with bucket name from bucket_uri but

- # object name from self.wildcard_uri. This is needed to handle cases

- # where both the bucket and object names contain wildcards.

- uris_needing_expansion = [

- bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]

- while len(uris_needing_expansion) > 0:

- uri = uris_needing_expansion.pop(0)

- (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (

- self._BuildBucketFilterStrings(uri.object_name))

- prog = re.compile(fnmatch.translate(prefix_wildcard))

- # List bucket for objects matching prefix up to delimiter.

- for key in bucket_uri.list_bucket(prefix=prefix,

- delimiter=delimiter,

- headers=self.headers,

- all_versions=self.all_versions):

- # Check that the prefix regex matches rstripped key.name (to

- # correspond with the rstripped prefix_wildcard from

- # _BuildBucketFilterStrings()).

- if prog.match(key.name.rstrip('/')):

- if suffix_wildcard and key.name.rstrip('/') != suffix_wildcard:

- if isinstance(key, Prefix):

- # There's more wildcard left to expand.

- uris_needing_expansion.append(

- uri.clone_replace_name(key.name.rstrip('/') + '/'

- + suffix_wildcard))

- else:

- # Done expanding.

- expanded_uri = uri.clone_replace_key(key)

- if isinstance(key, Prefix):

- yield BucketListingRef(expanded_uri, key=None, prefix=key,

- headers=self.headers)

- else:

- if self.all_versions:

- yield BucketListingRef(expanded_uri, key=key, prefix=None,

- headers=self.headers)

- else:

- # Yield BLR wrapping version-less URI.

- yield BucketListingRef(expanded_uri.clone_replace_name(

- expanded_uri.object_name), key=key, prefix=None,

- headers=self.headers)

- def _BuildBucketFilterStrings(self, wildcard):

- """

- Builds strings needed for querying a bucket and filtering results to

- implement wildcard object name matching.

- Args:

- wildcard: The wildcard string to match to objects.

- Returns:

- (prefix, delimiter, prefix_wildcard, suffix_wildcard)

- where:

- prefix is the prefix to be sent in bucket GET request.

- delimiter is the delimiter to be sent in bucket GET request.

- prefix_wildcard is the wildcard to be used to filter bucket GET results.

- suffix_wildcard is the wildcard to be appended to filtered bucket GET

- results for next wildcard expansion iteration.

- For example, given the wildcard gs://bucket/abc/d*e/f*.txt we

- would build prefix=abc/d, delimiter=/, prefix_wildcard=abc/d*e, and

- suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket

- listing request will then produce a listing result set that can be

- filtered using this prefix_wildcard; and we'd use this suffix_wildcard

- to feed into the next call(s) to _BuildBucketFilterStrings(), for the

- next iteration of listing/filtering.

- Raises:

- AssertionError if wildcard doesn't contain any wildcard chars.

- """

- # Generate a request prefix if the object name part of the wildcard starts

- # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').

- match = WILDCARD_REGEX.search(wildcard)

- if not match:

- # Input "wildcard" has no wildcard chars, so just return tuple that will

- # cause a bucket listing to match the given input wildcard. Example: if

- # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,

- # the next iteration will call _BuildBucketFilterStrings() with

- # gs://bucket/dir/abc, and we will return prefix='dir/abc',

- # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.

- prefix = wildcard

- delimiter = '/'

- prefix_wildcard = wildcard

- suffix_wildcard = ''

- else:

- if match.start() > 0:

- # Wildcard does not occur at beginning of object name, so construct a

- # prefix string to send to server.

- prefix = wildcard[:match.start()]

- wildcard_part = wildcard[match.start():]

- else:

- prefix = None

- wildcard_part = wildcard

- end = wildcard_part.find('/')

- if end != -1:

- wildcard_part = wildcard_part[:end+1]

- # Remove trailing '/' so we will match gs://bucket/abc* as well as

- # gs://bucket/abc*/ with the same wildcard regex.

- prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')

- suffix_wildcard = wildcard[match.end():]

- end = suffix_wildcard.find('/')

- if end == -1:

- suffix_wildcard = ''

- else:

- suffix_wildcard = suffix_wildcard[end+1:]

- # To implement recursive (**) wildcarding, if prefix_wildcard contains

- # '**', don't send a delimiter, and append suffix_wildcard to the end of

- # prefix_wildcard.

- if prefix_wildcard.find('**') != -1:

- delimiter = None

- prefix_wildcard = prefix_wildcard + suffix_wildcard

- suffix_wildcard = ''

- else:

- delimiter = '/'

- delim_pos = suffix_wildcard.find(delimiter)

- # The following debug output is useful for tracing how the algorithm

- # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt

- if self.debug > 1:

- sys.stderr.write(

- 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '

- 'prefix_wildcard=%s, suffix_wildcard=%s\n' %

- (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))

- return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

- def IterKeys(self):

- """

- Convenience iterator that runs underlying iterator and returns Key for each

- iteration.

- Yields:

- Subclass of boto.s3.key.Key, or empty iterator if no matches.

- Raises:

- WildcardException: for bucket-only uri.

- """

- for bucket_listing_ref in self. __iter__():

- if bucket_listing_ref.HasKey():

- yield bucket_listing_ref.GetKey()

- def IterUris(self):

- """

- Convenience iterator that runs underlying iterator and returns StorageUri

- for each iteration.

- Yields:

- StorageUri, or empty iterator if no matches.

- """

- for bucket_listing_ref in self. __iter__():

- yield bucket_listing_ref.GetUri()

- def IterUrisForKeys(self):

- """

- Convenience iterator that runs underlying iterator and returns the

- StorageUri for each iterated BucketListingRef that has a Key.

- Yields:

- StorageUri, or empty iterator if no matches.

- """

- for bucket_listing_ref in self. __iter__():

- if bucket_listing_ref.HasKey():

- yield bucket_listing_ref.GetUri()

-class FileWildcardIterator(WildcardIterator):

- """WildcardIterator subclass for files and directories.

- If you use recursive wildcards ('**') only a single such wildcard is

- supported. For example you could use the wildcard '**/*.txt' to list all .txt

- files in any subdirectory of the current directory, but you couldn't use a

- wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt

- files in any subdirectory named 'abc').

- """

- def __init__(self, wildcard_uri, headers=None, debug=0):

- """

- Instantiate an iterator over BucketListingRefs matching given wildcard URI.

- Args:

- wildcard_uri: StorageUri that contains the wildcard to iterate.

- headers: Dictionary containing optional HTTP headers to pass to boto.

- debug: Debug level to pass in to boto connection (range 0..3).

- """

- self.wildcard_uri = wildcard_uri

- self.headers = headers

- self.debug = debug

- def __iter__(self):

- wildcard = self.wildcard_uri.object_name

- match = re.search('\*\*', wildcard)

- if match:

- # Recursive wildcarding request ('.../**/...').

- # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'

- base_dir = wildcard[:match.start()-1]

- remaining_wildcard = wildcard[match.start()+2:]

- # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and

- # remaining_wildcard = '/*'

- if remaining_wildcard.startswith('*'):

- raise WildcardException('Invalid wildcard with more than 2 consecutive '

- '*s (%s)' % wildcard)

- # If there was no remaining wildcard past the recursive wildcard,

- # treat it as if it were a '*'. For example, file://tmp/** is equivalent

- # to file://tmp/**/*

- if not remaining_wildcard:

- remaining_wildcard = '*'

- # Skip slash(es).

- remaining_wildcard = remaining_wildcard.lstrip(os.sep)

- filepaths = []

- for dirpath, unused_dirnames, filenames in os.walk(base_dir):

- filepaths.extend(

- os.path.join(dirpath, f) for f in fnmatch.filter(filenames,

- remaining_wildcard)

- )

- else:

- # Not a recursive wildcarding request.

- filepaths = glob.glob(wildcard)

- for filepath in filepaths:

- expanded_uri = self.wildcard_uri.clone_replace_name(filepath)

- yield BucketListingRef(expanded_uri)

- def IterKeys(self):

- """

- Placeholder to allow polymorphic use of WildcardIterator.

- Raises:

- WildcardException: in all cases.

- """

- raise WildcardException(

- 'Iterating over Keys not possible for file wildcards')

- def IterUris(self):

- """

- Convenience iterator that runs underlying iterator and returns StorageUri

- for each iteration.

- Yields:

- StorageUri, or empty iterator if no matches.

- """

- for bucket_listing_ref in self. __iter__():

- yield bucket_listing_ref.GetUri()

-class WildcardException(StandardError):

- """Exception thrown for invalid wildcard URIs."""

- def __init__(self, reason):

- StandardError.__init__(self)

- self.reason = reason

- def __repr__(self):

- return 'WildcardException: %s' % self.reason

- def __str__(self):

- return 'WildcardException: %s' % self.reason

-def wildcard_iterator(uri_or_str, proj_id_handler,

- bucket_storage_uri_class=BucketStorageUri,

- all_versions=False,

- headers=None, debug=0):

- """Instantiate a WildcardIterator for the given StorageUri.

- Args:

- uri_or_str: StorageUri or URI string naming wildcard objects to iterate.

- proj_id_handler: ProjectIdHandler to use for current command.

- bucket_storage_uri_class: BucketStorageUri interface.

- Settable for testing/mocking.

- headers: Dictionary containing optional HTTP headers to pass to boto.

- debug: Debug level to pass in to boto connection (range 0..3).

- Returns:

- A WildcardIterator that handles the requested iteration.

- """

- if isinstance(uri_or_str, basestring):

- # Disable enforce_bucket_naming, to allow bucket names containing wildcard

- # chars.

- uri = boto.storage_uri(

- uri_or_str, debug=debug, validate=False,

- bucket_storage_uri_class=bucket_storage_uri_class,

- suppress_consec_slashes=False)

- else:

- uri = uri_or_str

- if uri.is_cloud_uri():

- return CloudWildcardIterator(

- uri, proj_id_handler,

- bucket_storage_uri_class=bucket_storage_uri_class,

- all_versions=all_versions,

- headers=headers,

- debug=debug)

- elif uri.is_file_uri():

- return FileWildcardIterator(uri, headers=headers, debug=debug)

- else:

- raise WildcardException('Unexpected type of StorageUri (%s)' % uri)

-def ContainsWildcard(uri_or_str):

- """Checks whether uri_or_str contains a wildcard.

- Args:

- uri_or_str: StorageUri or URI string to check.

- Returns:

- bool indicator.

- """

- if isinstance(uri_or_str, basestring):

- return bool(WILDCARD_REGEX.search(uri_or_str))

- else:

- return bool(WILDCARD_REGEX.search(uri_or_str.uri))

« no previous file with comments | « third_party/gsutil/gslib/util.py ('k') | third_party/gsutil/gsutil » ('j') | no next file with comments »