third_party/gsutil/gslib/wildcard_iterator.py - Issue 12685010: Added gsutil/gslib to depot_tools/third_party

Unified Diff: third_party/gsutil/gslib/wildcard_iterator.py

Issue 12685010: Added gsutil/gslib to depot_tools/third_party (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: Created 7 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/gsutil/gslib/wildcard_iterator.py

diff --git a/third_party/gsutil/gslib/wildcard_iterator.py b/third_party/gsutil/gslib/wildcard_iterator.py

new file mode 100644

index 0000000000000000000000000000000000000000..97e7bc7a9793673642abd87c23b3f7ab5d575506

--- /dev/null

+++ b/third_party/gsutil/gslib/wildcard_iterator.py

@@ -0,0 +1,498 @@

+# Permission is hereby granted, free of charge, to any person obtaining a

+# copy of this software and associated documentation files (the

+# "Software"), to deal in the Software without restriction, including

+# without limitation the rights to use, copy, modify, merge, publish, dis-

+# tribute, sublicense, and/or sell copies of the Software, and to permit

+# persons to whom the Software is furnished to do so, subject to the fol-

+# lowing conditions:

+# The above copyright notice and this permission notice shall be included

+# in all copies or substantial portions of the Software.

+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS

+# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-

+# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT

+# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,

+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS

+# IN THE SOFTWARE.

+"""Implementation of wildcarding over StorageUris.

+StorageUri is an abstraction that Google introduced in the boto library,

+for representing storage provider-independent bucket and object names with

+a shorthand URI-like syntax (see boto/boto/storage_uri.py). The current

+class provides wildcarding support for StorageUri objects (including both

+bucket and file system objects), allowing one to express collections of

+objects with syntax like the following:

+ gs://mybucket/images/*.png

+ file:///tmp/???abc???

+We provide wildcarding support as part of gsutil rather than as part

+of boto because wildcarding is really part of shell command-like

+functionality.

+A comment about wildcard semantics: We support both single path component

+wildcards (e.g., using '*') and recursive wildcards (using '**'), for both

+file and cloud URIs. For example,

+ gs://bucket/doc/*/*.html

+would enumerate HTML files one directory down from gs://bucket/doc, while

+ gs://bucket/**/*.html

+would enumerate HTML files at any depth within the bucket.

+Note also that if you use file system wildcards it's likely your shell

+interprets the wildcarding before passing the command to gsutil. For example:

+ % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse

+would likely be expanded by the shell into the following before running gsutil:

+ % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse

+Note also that most shells don't support '**' wildcarding (I think only

+zsh does). If you want to use '**' wildcarding with such a shell you can

+single quote each wildcarded string, so it gets passed uninterpreted by the

+shell to gsutil (at which point gsutil will perform the wildcarding expansion):

+ % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse

+"""

+import boto

+import fnmatch

+import glob

+import os

+import re

+import sys

+import urllib

+from boto.s3.prefix import Prefix

+from boto.storage_uri import BucketStorageUri

+from bucket_listing_ref import BucketListingRef

# Regex to determine if a string contains any wildcards. Written as a raw
# string so the backslashes that escape '[' and ']' reach the regex engine
# as-is rather than being treated as (invalid) string escape sequences.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')

# Identifiers passed to ProjectIdHandler.FillInProjectHeaderIfNeeded() to say
# which kind of wildcard expansion is being performed.
WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'
WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'

class WildcardIterator(object):
  """Common base for iterators that expand wildcarded StorageUris.

  Subclasses are expected to set self.wildcard_uri and to supply the actual
  iteration logic. Do not instantiate this class directly; call the
  module-level wildcard_iterator() factory function, which picks the right
  subclass for the given StorageUri.
  """

  def __repr__(self):
    """Returns string representation of WildcardIterator."""
    description = 'WildcardIterator(%s)' % self.wildcard_uri
    return description

class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets and objects.

  Iterates over BucketListingRef matching the StorageUri wildcard. It's
  much more efficient to request the Key from the BucketListingRef (via
  GetKey()) than to request the StorageUri and then call uri.get_key()
  to retrieve the key, for cases where you want to get metadata that's
  available in the Bucket (for example to get the name and size of
  each object), because that information is available in the bucket GET
  results. If you were to iterate over URIs for such cases and then get
  the name and size info from each resulting StorageUri, it would cause
  an additional object GET request for each of the result URIs.
  """

  def __init__(self, wildcard_uri, proj_id_handler,
               bucket_storage_uri_class=BucketStorageUri, all_versions=False,
               headers=None, debug=0):
    """Instantiates an iterator over BucketListingRef matching wildcard URI.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      proj_id_handler: ProjectIdHandler to use for current command.
      bucket_storage_uri_class: BucketStorageUri interface.
          Settable for testing/mocking.
      all_versions: Passed through to the bucket listing request; when true,
          yielded BucketListingRefs wrap the URI produced by
          clone_replace_key() instead of a version-less clone of it.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    # Make a copy of the headers so any updates we make during wildcard
    # expansion aren't left in the input params (specifically, so we don't
    # include the x-goog-project-id header needed by a subset of cases, in
    # the data returned to caller, which could then be used in other cases
    # where that header must not be passed).
    if headers is None:
      self.headers = {}
    else:
      self.headers = headers.copy()
    self.proj_id_handler = proj_id_handler
    self.bucket_storage_uri_class = bucket_storage_uri_class
    self.all_versions = all_versions
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over cloud wildcard.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    # First handle bucket wildcarding, if any.
    if ContainsWildcard(self.wildcard_uri.bucket_name):
      regex = fnmatch.translate(self.wildcard_uri.bucket_name)
      bucket_uris = []
      prog = re.compile(regex)
      self.proj_id_handler.FillInProjectHeaderIfNeeded(
          WILDCARD_BUCKET_ITERATOR, self.wildcard_uri, self.headers)
      for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
        if prog.match(b.name):
          # Use str(b.name) because get_all_buckets() returns Unicode
          # string, which when used to construct x-goog-copy-src metadata
          # requests for object-to-object copies causes pathname '/' chars
          # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
          # which causes the request to fail.
          uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                 urllib.quote_plus(str(b.name)))
          bucket_uris.append(
              boto.storage_uri(
                  uri_str, debug=self.debug,
                  bucket_storage_uri_class=self.bucket_storage_uri_class,
                  suppress_consec_slashes=False))
    else:
      bucket_uris = [self.wildcard_uri.clone_replace_name('')]

    # Now iterate over bucket(s), and handle object wildcarding, if any.
    self.proj_id_handler.FillInProjectHeaderIfNeeded(
        WILDCARD_OBJECT_ITERATOR, self.wildcard_uri, self.headers)
    for bucket_uri in bucket_uris:
      if self.wildcard_uri.names_bucket():
        # Bucket-only URI.
        yield BucketListingRef(bucket_uri, key=None, prefix=None,
                               headers=self.headers)
      else:
        # URI contains an object name. If there's no wildcard just yield
        # the needed URI.
        if not ContainsWildcard(self.wildcard_uri.object_name):
          uri_to_yield = bucket_uri.clone_replace_name(
              self.wildcard_uri.object_name)
          yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                                 headers=self.headers)
        else:
          # URI contains a wildcard. Expand iteratively by building
          # prefix/delimiter bucket listing request, filtering the results per
          # the current level's wildcard, and continuing with the next
          # component of the wildcard. See _BuildBucketFilterStrings()
          # documentation for details.
          #
          # Initialize the iteration with bucket name from bucket_uri but
          # object name from self.wildcard_uri. This is needed to handle cases
          # where both the bucket and object names contain wildcards.
          uris_needing_expansion = [
              bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
          while uris_needing_expansion:
            uri = uris_needing_expansion.pop(0)
            (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
                self._BuildBucketFilterStrings(uri.object_name))
            prog = re.compile(fnmatch.translate(prefix_wildcard))
            # List bucket for objects matching prefix up to delimiter.
            for key in bucket_uri.list_bucket(prefix=prefix,
                                              delimiter=delimiter,
                                              headers=self.headers,
                                              all_versions=self.all_versions):
              # Check that the prefix regex matches rstripped key.name (to
              # correspond with the rstripped prefix_wildcard from
              # _BuildBucketFilterStrings()).
              if prog.match(key.name.rstrip('/')):
                if suffix_wildcard and key.name.rstrip('/') != suffix_wildcard:
                  if isinstance(key, Prefix):
                    # There's more wildcard left to expand.
                    uris_needing_expansion.append(
                        uri.clone_replace_name(key.name.rstrip('/') + '/'
                                               + suffix_wildcard))
                else:
                  # Done expanding.
                  expanded_uri = uri.clone_replace_key(key)
                  if isinstance(key, Prefix):
                    yield BucketListingRef(expanded_uri, key=None, prefix=key,
                                           headers=self.headers)
                  else:
                    if self.all_versions:
                      yield BucketListingRef(expanded_uri, key=key,
                                             prefix=None,
                                             headers=self.headers)
                    else:
                      # Yield BLR wrapping version-less URI.
                      yield BucketListingRef(
                          expanded_uri.clone_replace_name(
                              expanded_uri.object_name),
                          key=key, prefix=None, headers=self.headers)

  def _BuildBucketFilterStrings(self, wildcard):
    """Builds strings needed to list a bucket and filter the results.

    Together the returned strings implement wildcard object name matching.

    Args:
      wildcard: The wildcard string to match to objects.

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request.
        delimiter is the delimiter to be sent in bucket GET request.
        prefix_wildcard is the wildcard to be used to filter bucket GET
          results.
        suffix_wildcard is wildcard to be appended to filtered bucket GET
          results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix=abc/d, delimiter=/, prefix_wildcard=abc/d*e, and
      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
      listing request will then produce a listing result set that can be
      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
      next iteration of listing/filtering.

      If the input contains no wildcard chars it is returned unchanged as
      both prefix and prefix_wildcard, with delimiter '/' and an empty
      suffix_wildcard.
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end+1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end+1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard contains
      # '**' don't send a delimiter, and append suffix_wildcard to the end of
      # prefix_wildcard so the whole remainder is matched in one listing.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard = prefix_wildcard + suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    if self.debug > 1:
      sys.stderr.write(
          'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
          'prefix_wildcard=%s, suffix_wildcard=%s\n' %
          (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

  def IterKeys(self):
    """Convenience iterator that yields the Key for each iterated result.

    BucketListingRefs for which HasKey() is false are skipped.

    Yields:
      Subclass of boto.s3.key.Key, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetKey()

  def IterUris(self):
    """Convenience iterator that yields the StorageUri for each result.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      yield bucket_listing_ref.GetUri()

  def IterUrisForKeys(self):
    """Convenience iterator yielding StorageUris of results that have a Key.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetUri()

class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """

  def __init__(self, wildcard_uri, headers=None, debug=0):
    """Instantiates an iterator over BucketListingRefs matching wildcard URI.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    self.headers = headers
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over file wildcard.

    Yields:
      BucketListingRef for each matching path, or empty iterator if no
      matches.

    Raises:
      WildcardException: if the wildcard has more than 2 consecutive '*'s.
    """
    wildcard = self.wildcard_uri.object_name
    match = re.search(r'\*\*', wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      filepaths = self._ExpandRecursiveWildcard(wildcard, match.start())
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.glob(wildcard)
    for filepath in filepaths:
      expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
      yield BucketListingRef(expanded_uri)

  def _ExpandRecursiveWildcard(self, wildcard, recursive_pos):
    """Returns the list of file paths matching a '**' wildcard.

    Args:
      wildcard: Wildcard path containing '**', e.g. '/tmp/tmp2pQJAX/**/*'.
      recursive_pos: Index in wildcard at which the '**' starts.

    Returns:
      List of paths of files found under the directory named before the '**'
      whose file name matches the part of the wildcard after the '**'.

    Raises:
      WildcardException: if the wildcard has more than 2 consecutive '*'s.
    """
    # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
    dir_prefix = wildcard[:recursive_pos]
    remaining_wildcard = wildcard[recursive_pos + 2:]
    # At this point for the above example dir_prefix = '/tmp/tmp2pQJAX/' and
    # remaining_wildcard = '/*'
    if remaining_wildcard.startswith('*'):
      raise WildcardException('Invalid wildcard with more than 2 consecutive '
                              '*s (%s)' % wildcard)
    if not dir_prefix:
      # Wildcard starts with '**' (e.g. '**/*.txt'): walk the current
      # directory. Slicing off "the separator before '**'" here would instead
      # chop the last character off the whole wildcard.
      base_dir = os.curdir
    else:
      # Drop the separator(s) preceding '**'; if nothing is left the prefix
      # was the file system root itself (e.g. '/**').
      base_dir = dir_prefix.rstrip(os.sep) or os.sep
    # If there was no remaining wildcard past the recursive wildcard,
    # treat it as if it were a '*'. For example, file://tmp/** is equivalent
    # to file://tmp/**/*
    if not remaining_wildcard:
      remaining_wildcard = '*'
    # Skip slash(es).
    remaining_wildcard = remaining_wildcard.lstrip(os.sep)
    filepaths = []
    for dirpath, unused_dirnames, filenames in os.walk(base_dir):
      filepaths.extend(
          os.path.join(dirpath, f)
          for f in fnmatch.filter(filenames, remaining_wildcard))
    return filepaths

  def IterKeys(self):
    """Placeholder to allow polymorphic use of WildcardIterator.

    Raises:
      WildcardException: in all cases.
    """
    raise WildcardException(
        'Iterating over Keys not possible for file wildcards')

  def IterUris(self):
    """Convenience iterator that yields the StorageUri for each result.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      yield bucket_listing_ref.GetUri()

class WildcardException(StandardError):
  """Exception thrown for invalid wildcard URIs.

  Attributes:
    reason: Human-readable explanation of what was wrong with the wildcard.
  """

  def __init__(self, reason):
    """Creates the exception.

    Args:
      reason: Human-readable explanation of what was wrong with the wildcard.
    """
    # Pass reason to the base class so that self.args is populated. Copying
    # and unpickling rebuild an exception by calling its class with
    # self.args, which fails for a required constructor argument if args is
    # left empty.
    StandardError.__init__(self, reason)
    self.reason = reason

  def __repr__(self):
    return str(self)

  def __str__(self):
    return 'WildcardException: %s' % self.reason

def wildcard_iterator(uri_or_str, proj_id_handler,
                      bucket_storage_uri_class=BucketStorageUri,
                      all_versions=False,
                      headers=None, debug=0):
  """Builds the WildcardIterator appropriate for a StorageUri.

  Args:
    uri_or_str: StorageUri or URI string naming wildcard objects to iterate.
    proj_id_handler: ProjectIdHandler to use for current command.
    bucket_storage_uri_class: BucketStorageUri interface.
        Settable for testing/mocking.
    all_versions: Forwarded to CloudWildcardIterator; not used for file URIs.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).

  Returns:
    A WildcardIterator that handles the requested iteration.

  Raises:
    WildcardException: if the URI is neither a cloud URI nor a file URI.
  """
  uri = uri_or_str
  if isinstance(uri_or_str, basestring):
    # Parse with validate=False so that bucket names containing wildcard
    # chars are accepted.
    uri = boto.storage_uri(
        uri_or_str, debug=debug, validate=False,
        bucket_storage_uri_class=bucket_storage_uri_class,
        suppress_consec_slashes=False)

  if uri.is_cloud_uri():
    return CloudWildcardIterator(
        uri, proj_id_handler,
        bucket_storage_uri_class=bucket_storage_uri_class,
        all_versions=all_versions,
        headers=headers,
        debug=debug)
  if uri.is_file_uri():
    return FileWildcardIterator(uri, headers=headers, debug=debug)
  raise WildcardException('Unexpected type of StorageUri (%s)' % uri)

def ContainsWildcard(uri_or_str):
  """Reports whether a URI contains any wildcard character.

  Args:
    uri_or_str: StorageUri or URI string to check. For a non-string argument
        its .uri attribute is the text that gets searched.

  Returns:
    True if WILDCARD_REGEX matches anywhere in the text, else False.
  """
  text = uri_or_str if isinstance(uri_or_str, basestring) else uri_or_str.uri
  return WILDCARD_REGEX.search(text) is not None

« third_party/gsutil/gslib/command_runner.py ('K') | « third_party/gsutil/gslib/util.py ('k') | no next file » | no next file with comments »