| Index: third_party/gsutil/gslib/wildcard_iterator.py
|
| diff --git a/third_party/gsutil/gslib/wildcard_iterator.py b/third_party/gsutil/gslib/wildcard_iterator.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..97e7bc7a9793673642abd87c23b3f7ab5d575506
|
| --- /dev/null
|
| +++ b/third_party/gsutil/gslib/wildcard_iterator.py
|
| @@ -0,0 +1,498 @@
|
| +# Copyright 2010 Google Inc. All Rights Reserved.
|
| +#
|
| +# Permission is hereby granted, free of charge, to any person obtaining a
|
| +# copy of this software and associated documentation files (the
|
| +# "Software"), to deal in the Software without restriction, including
|
| +# without limitation the rights to use, copy, modify, merge, publish, dis-
|
| +# tribute, sublicense, and/or sell copies of the Software, and to permit
|
| +# persons to whom the Software is furnished to do so, subject to the fol-
|
| +# lowing conditions:
|
| +#
|
| +# The above copyright notice and this permission notice shall be included
|
| +# in all copies or substantial portions of the Software.
|
| +#
|
| +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
|
| +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
|
| +# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
| +# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
| +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
| +# IN THE SOFTWARE.
|
| +
|
| +"""Implementation of wildcarding over StorageUris.
|
| +
|
| +StorageUri is an abstraction that Google introduced in the boto library,
|
| +for representing storage provider-independent bucket and object names with
|
| +a shorthand URI-like syntax (see boto/boto/storage_uri.py). The current
|
| +class provides wildcarding support for StorageUri objects (including both
|
| +bucket and file system objects), allowing one to express collections of
|
| +objects with syntax like the following:
|
| + gs://mybucket/images/*.png
|
| + file:///tmp/???abc???
|
| +
|
| +We provide wildcarding support as part of gsutil rather than as part
|
| +of boto because wildcarding is really part of shell command-like
|
| +functionality.
|
| +
|
| +A comment about wildcard semantics: We support both single path component
|
| +wildcards (e.g., using '*') and recursive wildcards (using '**'), for both
|
| +file and cloud URIs. For example,
|
| + gs://bucket/doc/*/*.html
|
| +would enumerate HTML files one directory down from gs://bucket/doc, while
|
| + gs://bucket/**/*.html
|
| +would enumerate HTML files in all objects contained in the bucket.
|
| +
|
| +Note also that if you use file system wildcards it's likely your shell
|
| +interprets the wildcarding before passing the command to gsutil. For example:
|
| + % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse
|
| +would likely be expanded by the shell into the following before running gsutil:
|
| + % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse
|
| +
|
| +Note also that most shells don't support '**' wildcarding (I think only
|
| +zsh does). If you want to use '**' wildcarding with such a shell you can
|
| +single quote each wildcarded string, so it gets passed uninterpreted by the
|
| +shell to gsutil (at which point gsutil will perform the wildcarding expansion):
|
| + % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse
|
| +"""
|
| +
|
| +import boto
|
| +import fnmatch
|
| +import glob
|
| +import os
|
| +import re
|
| +import sys
|
| +import urllib
|
| +
|
| +from boto.s3.prefix import Prefix
|
| +from boto.storage_uri import BucketStorageUri
|
| +from bucket_listing_ref import BucketListingRef
|
| +
|
# Regex to determine if a string contains any wildcards.
# Written as a raw string so the escaped brackets reach the regex engine
# verbatim, rather than relying on Python passing unrecognized string escapes
# ('\[' and '\]') through unchanged.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')

# Identifiers passed to ProjectIdHandler.FillInProjectHeaderIfNeeded() to say
# which kind of wildcard iteration is about to issue requests.
WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'
WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'
|
| +
|
| +
|
class WildcardIterator(object):
  """Common base for iterators that expand wildcarded StorageUris.

  This class is abstract: it carries no iteration logic of its own and only
  supplies a shared string representation. Obtain a usable iterator through
  the module-level wildcard_iterator() factory function, which picks the
  implementation appropriate for the given StorageUri.

  Subclasses are expected to set self.wildcard_uri.
  """

  def __repr__(self):
    """Returns 'WildcardIterator(<wildcard uri>)' for this iterator."""
    uri = self.wildcard_uri
    return 'WildcardIterator(%s)' % uri
|
| +
|
| +
|
class CloudWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for buckets and objects.

  Iterates over BucketListingRef matching the StorageUri wildcard. It's
  much more efficient to request the Key from the BucketListingRef (via
  GetKey()) than to request the StorageUri and then call uri.get_key()
  to retrieve the key, for cases where you want to get metadata that's
  available in the Bucket (for example to get the name and size of
  each object), because that information is available in the bucket GET
  results. If you were to iterate over URIs for such cases and then get
  the name and size info from each resulting StorageUri, it would cause
  an additional object GET request for each of the result URIs.
  """

  def __init__(self, wildcard_uri, proj_id_handler,
               bucket_storage_uri_class=BucketStorageUri, all_versions=False,
               headers=None, debug=0):
    """Instantiates an iterator over BucketListingRefs matching wildcard_uri.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      proj_id_handler: ProjectIdHandler to use for current command.
      bucket_storage_uri_class: BucketStorageUri interface.
                                Settable for testing/mocking.
      all_versions: Forwarded to each bucket listing request. When false,
                    object results are yielded wrapped around a version-less
                    clone of the expanded URI.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    # Make a copy of the headers so any updates we make during wildcard
    # expansion aren't left in the input params (specifically, so we don't
    # include the x-goog-project-id header needed by a subset of cases, in
    # the data returned to caller, which could then be used in other cases
    # where that header must not be passed).
    if headers is None:
      self.headers = {}
    else:
      self.headers = headers.copy()
    self.proj_id_handler = proj_id_handler
    self.bucket_storage_uri_class = bucket_storage_uri_class
    self.all_versions = all_versions
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over cloud wildcard.

    Expansion happens in two stages: first the bucket name is expanded (if it
    contains a wildcard), then, for each resulting bucket, the object name is
    expanded one wildcard component at a time via bucket listing requests.

    Yields:
      BucketListingRef, or empty iterator if no matches.
    """
    # First handle bucket wildcarding, if any.
    if ContainsWildcard(self.wildcard_uri.bucket_name):
      regex = fnmatch.translate(self.wildcard_uri.bucket_name)
      bucket_uris = []
      prog = re.compile(regex)
      self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR,
                                                       self.wildcard_uri,
                                                       self.headers)
      for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
        if prog.match(b.name):
          # Use str(b.name) because get_all_buckets() returns Unicode
          # string, which when used to construct x-goog-copy-src metadata
          # requests for object-to-object copies causes pathname '/' chars
          # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
          # which causes the request to fail.
          uri_str = '%s://%s' % (self.wildcard_uri.scheme,
                                 urllib.quote_plus(str(b.name)))
          bucket_uris.append(
              boto.storage_uri(
                  uri_str, debug=self.debug,
                  bucket_storage_uri_class=self.bucket_storage_uri_class,
                  suppress_consec_slashes=False))
    else:
      bucket_uris = [self.wildcard_uri.clone_replace_name('')]

    # Now iterate over bucket(s), and handle object wildcarding, if any.
    self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR,
                                                     self.wildcard_uri,
                                                     self.headers)
    for bucket_uri in bucket_uris:
      if self.wildcard_uri.names_bucket():
        # Bucket-only URI.
        yield BucketListingRef(bucket_uri, key=None, prefix=None,
                               headers=self.headers)
      else:
        # URI contains an object name. If there's no wildcard just yield
        # the needed URI.
        if not ContainsWildcard(self.wildcard_uri.object_name):
          uri_to_yield = bucket_uri.clone_replace_name(
              self.wildcard_uri.object_name)
          yield BucketListingRef(uri_to_yield, key=None, prefix=None,
                                 headers=self.headers)
        else:
          # URI contains a wildcard. Expand iteratively by building
          # prefix/delimiter bucket listing request, filtering the results per
          # the current level's wildcard, and continuing with the next component
          # of the wildcard. See _BuildBucketFilterStrings() documentation
          # for details.
          #
          # Initialize the iteration with bucket name from bucket_uri but
          # object name from self.wildcard_uri. This is needed to handle cases
          # where both the bucket and object names contain wildcards.
          uris_needing_expansion = [
              bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
          while uris_needing_expansion:
            # Work through pending URIs in FIFO order.
            uri = uris_needing_expansion.pop(0)
            (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
                self._BuildBucketFilterStrings(uri.object_name))
            prog = re.compile(fnmatch.translate(prefix_wildcard))
            # List bucket for objects matching prefix up to delimiter.
            for key in bucket_uri.list_bucket(prefix=prefix,
                                              delimiter=delimiter,
                                              headers=self.headers,
                                              all_versions=self.all_versions):
              # Check that the prefix regex matches rstripped key.name (to
              # correspond with the rstripped prefix_wildcard from
              # _BuildBucketFilterStrings()).
              if prog.match(key.name.rstrip('/')):
                if suffix_wildcard and key.name.rstrip('/') != suffix_wildcard:
                  if isinstance(key, Prefix):
                    # There's more wildcard left to expand.
                    uris_needing_expansion.append(
                        uri.clone_replace_name(key.name.rstrip('/') + '/'
                                               + suffix_wildcard))
                else:
                  # Done expanding.
                  expanded_uri = uri.clone_replace_key(key)

                  if isinstance(key, Prefix):
                    yield BucketListingRef(expanded_uri, key=None, prefix=key,
                                           headers=self.headers)
                  else:
                    if self.all_versions:
                      yield BucketListingRef(expanded_uri, key=key, prefix=None,
                                             headers=self.headers)
                    else:
                      # Yield BLR wrapping version-less URI.
                      yield BucketListingRef(expanded_uri.clone_replace_name(
                          expanded_uri.object_name), key=key, prefix=None,
                          headers=self.headers)

  def _BuildBucketFilterStrings(self, wildcard):
    """Builds strings for querying a bucket and filtering the results.

    Together the returned strings implement one step of wildcard object name
    matching.

    Args:
      wildcard: The wildcard string to match to objects.

    Returns:
      (prefix, delimiter, prefix_wildcard, suffix_wildcard)
      where:
        prefix is the prefix to be sent in bucket GET request (None if the
            wildcard starts with a wildcard char).
        delimiter is the delimiter to be sent in bucket GET request (None
            for recursive '**' wildcards).
        prefix_wildcard is the wildcard to be used to filter bucket GET results.
        suffix_wildcard is wildcard to be appended to filtered bucket GET
            results for next wildcard expansion iteration.
      For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
      would build prefix=abc/d, delimiter=/, prefix_wildcard=abc/d*e, and
      suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
      listing request will then produce a listing result set that can be
      filtered using this prefix_wildcard; and we'd use this suffix_wildcard
      to feed into the next call(s) to _BuildBucketFilterStrings(), for the
      next iteration of listing/filtering.

      If the input contains no wildcard chars at all, the result is
      (wildcard, '/', wildcard, '').
    """
    # Generate a request prefix if the object name part of the wildcard starts
    # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
    match = WILDCARD_REGEX.search(wildcard)
    if not match:
      # Input "wildcard" has no wildcard chars, so just return tuple that will
      # cause a bucket listing to match the given input wildcard. Example: if
      # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
      # the next iteration will call _BuildBucketFilterStrings() with
      # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
      # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
      prefix = wildcard
      delimiter = '/'
      prefix_wildcard = wildcard
      suffix_wildcard = ''
    else:
      if match.start() > 0:
        # Wildcard does not occur at beginning of object name, so construct a
        # prefix string to send to server.
        prefix = wildcard[:match.start()]
        wildcard_part = wildcard[match.start():]
      else:
        prefix = None
        wildcard_part = wildcard
      end = wildcard_part.find('/')
      if end != -1:
        wildcard_part = wildcard_part[:end+1]
      # Remove trailing '/' so we will match gs://bucket/abc* as well as
      # gs://bucket/abc*/ with the same wildcard regex.
      prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')
      suffix_wildcard = wildcard[match.end():]
      end = suffix_wildcard.find('/')
      if end == -1:
        suffix_wildcard = ''
      else:
        suffix_wildcard = suffix_wildcard[end+1:]
      # To implement recursive (**) wildcarding, if prefix_wildcard contains
      # '**' don't send a delimiter, and combine suffix_wildcard at end of
      # prefix_wildcard.
      if prefix_wildcard.find('**') != -1:
        delimiter = None
        prefix_wildcard = prefix_wildcard + suffix_wildcard
        suffix_wildcard = ''
      else:
        delimiter = '/'
    # The following debug output is useful for tracing how the algorithm
    # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
    if self.debug > 1:
      sys.stderr.write(
          'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
          'prefix_wildcard=%s, suffix_wildcard=%s\n' %
          (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
    return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

  def IterKeys(self):
    """Convenience iterator that yields the Key for each iterated result.

    Iterated BucketListingRefs for which HasKey() is false are skipped.

    Yields:
      Subclass of boto.s3.key.Key, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetKey()

  def IterUris(self):
    """Convenience iterator that yields the StorageUri for each result.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      yield bucket_listing_ref.GetUri()

  def IterUrisForKeys(self):
    """Convenience iterator over StorageUris of results that have a Key.

    Iterated BucketListingRefs for which HasKey() is false are skipped.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      if bucket_listing_ref.HasKey():
        yield bucket_listing_ref.GetUri()
|
| +
|
| +
|
class FileWildcardIterator(WildcardIterator):
  """WildcardIterator subclass for files and directories.

  If you use recursive wildcards ('**') only a single such wildcard is
  supported. For example you could use the wildcard '**/*.txt' to list all .txt
  files in any subdirectory of the current directory, but you couldn't use a
  wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
  files in any subdirectory named 'abc').
  """

  def __init__(self, wildcard_uri, headers=None, debug=0):
    """Instantiates an iterator over BucketListingRefs matching wildcard_uri.

    Args:
      wildcard_uri: StorageUri that contains the wildcard to iterate.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
    """
    self.wildcard_uri = wildcard_uri
    self.headers = headers
    self.debug = debug

  def __iter__(self):
    """Python iterator that gets called when iterating over file wildcard.

    A wildcard containing '**' is expanded by walking the directory tree
    below the path preceding the '**' and matching file names against the
    remainder of the wildcard; any other wildcard is handed to glob.

    Yields:
      BucketListingRef for each matching file path, or empty iterator if no
      matches.

    Raises:
      WildcardException: if the wildcard has more than 2 consecutive '*'s.
    """
    wildcard = self.wildcard_uri.object_name
    match = re.search(r'\*\*', wildcard)
    if match:
      # Recursive wildcarding request ('.../**/...').
      # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
      if match.start() == 0:
        # Wildcard begins with '**' (e.g., '**/*.txt'), so there is no leading
        # directory component: walk the current directory. Slicing with
        # match.start()-1 here would evaluate wildcard[:-1], which is not a
        # directory name at all.
        base_dir = os.curdir
      else:
        # Drop the separator character that precedes the '**'.
        base_dir = wildcard[:match.start()-1]
      remaining_wildcard = wildcard[match.start()+2:]
      # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
      # remaining_wildcard = '/*'
      if remaining_wildcard.startswith('*'):
        raise WildcardException('Invalid wildcard with more than 2 consecutive '
                                '*s (%s)' % wildcard)
      # If there was no remaining wildcard past the recursive wildcard,
      # treat it as if it were a '*'. For example, file://tmp/** is equivalent
      # to file://tmp/**/*
      if not remaining_wildcard:
        remaining_wildcard = '*'
      # Skip slash(es).
      remaining_wildcard = remaining_wildcard.lstrip(os.sep)
      filepaths = []
      for dirpath, unused_dirnames, filenames in os.walk(base_dir):
        filepaths.extend(
            os.path.join(dirpath, f) for f in fnmatch.filter(filenames,
                                                             remaining_wildcard)
        )
    else:
      # Not a recursive wildcarding request.
      filepaths = glob.glob(wildcard)
    for filepath in filepaths:
      expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
      yield BucketListingRef(expanded_uri)

  def IterKeys(self):
    """Placeholder to allow polymorphic use of WildcardIterator.

    Raises:
      WildcardException: in all cases.
    """
    raise WildcardException(
        'Iterating over Keys not possible for file wildcards')

  def IterUris(self):
    """Convenience iterator that yields the StorageUri for each result.

    Yields:
      StorageUri, or empty iterator if no matches.
    """
    for bucket_listing_ref in self:
      yield bucket_listing_ref.GetUri()
|
| +
|
| +
|
class WildcardException(StandardError):
  """Exception thrown for invalid wildcard URIs."""

  def __init__(self, reason):
    """Creates the exception.

    Args:
      reason: Human-readable description of what was wrong with the wildcard.
    """
    # Pass the reason to the base class as well so that e.args is populated;
    # generic exception handling (and pickling) relies on args rather than on
    # the custom 'reason' attribute.
    StandardError.__init__(self, reason)
    self.reason = reason

  def __repr__(self):
    return 'WildcardException: %s' % self.reason

  def __str__(self):
    return 'WildcardException: %s' % self.reason
|
| +
|
| +
|
def wildcard_iterator(uri_or_str, proj_id_handler,
                      bucket_storage_uri_class=BucketStorageUri,
                      all_versions=False,
                      headers=None, debug=0):
  """Builds the WildcardIterator implementation suited to the given URI.

  Args:
    uri_or_str: StorageUri or URI string naming wildcard objects to iterate.
    proj_id_handler: ProjectIdHandler to use for current command.
    bucket_storage_uri_class: BucketStorageUri interface.
                              Settable for testing/mocking.
    all_versions: Forwarded to CloudWildcardIterator; not used for file URIs.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).

  Returns:
    A WildcardIterator that handles the requested iteration.

  Raises:
    WildcardException: if the URI is neither a cloud URI nor a file URI.
  """
  uri = uri_or_str
  if isinstance(uri_or_str, basestring):
    # Parse with validate=False so that bucket names containing wildcard
    # chars are accepted.
    uri = boto.storage_uri(
        uri_or_str, debug=debug, validate=False,
        bucket_storage_uri_class=bucket_storage_uri_class,
        suppress_consec_slashes=False)

  if uri.is_cloud_uri():
    return CloudWildcardIterator(
        uri, proj_id_handler,
        bucket_storage_uri_class=bucket_storage_uri_class,
        all_versions=all_versions,
        headers=headers,
        debug=debug)

  if uri.is_file_uri():
    return FileWildcardIterator(uri, headers=headers, debug=debug)

  raise WildcardException('Unexpected type of StorageUri (%s)' % uri)
|
| +
|
| +
|
def ContainsWildcard(uri_or_str):
  """Tells whether the given URI (or URI string) has any wildcard chars.

  Args:
    uri_or_str: StorageUri or URI string to check. For a StorageUri, its
                'uri' attribute is the text that gets examined.

  Returns:
    True if a wildcard char ('*', '?', '[' or ']') is present, else False.
  """
  if isinstance(uri_or_str, basestring):
    text = uri_or_str
  else:
    text = uri_or_str.uri
  return WILDCARD_REGEX.search(text) is not None
|
|
|