Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(650)

Unified Diff: third_party/gsutil/gslib/wildcard_iterator.py

Issue 2280023003: depot_tools: Remove third_party/gsutil (Closed)
Patch Set: Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/gsutil/gslib/util.py ('k') | third_party/gsutil/gsutil » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/gsutil/gslib/wildcard_iterator.py
diff --git a/third_party/gsutil/gslib/wildcard_iterator.py b/third_party/gsutil/gslib/wildcard_iterator.py
deleted file mode 100644
index 97e7bc7a9793673642abd87c23b3f7ab5d575506..0000000000000000000000000000000000000000
--- a/third_party/gsutil/gslib/wildcard_iterator.py
+++ /dev/null
@@ -1,498 +0,0 @@
-# Copyright 2010 Google Inc. All Rights Reserved.
-#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the
-# "Software"), to deal in the Software without restriction, including
-# without limitation the rights to use, copy, modify, merge, publish, dis-
-# tribute, sublicense, and/or sell copies of the Software, and to permit
-# persons to whom the Software is furnished to do so, subject to the fol-
-# lowing conditions:
-#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
-# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
-# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
-
-"""Implementation of wildcarding over StorageUris.
-
-StorageUri is an abstraction that Google introduced in the boto library,
-for representing storage provider-independent bucket and object names with
-a shorthand URI-like syntax (see boto/boto/storage_uri.py) The current
-class provides wildcarding support for StorageUri objects (including both
-bucket and file system objects), allowing one to express collections of
-objects with syntax like the following:
- gs://mybucket/images/*.png
- file:///tmp/???abc???
-
-We provide wildcarding support as part of gsutil rather than as part
-of boto because wildcarding is really part of shell command-like
-functionality.
-
-A comment about wildcard semantics: We support both single path component
-wildcards (e.g., using '*') and recursive wildcards (using '**'), for both
-file and cloud URIs. For example,
- gs://bucket/doc/*/*.html
-would enumerate HTML files one directory down from gs://bucket/doc, while
- gs://bucket/**/*.html
-would enumerate HTML files in all objects contained in the bucket.
-
-Note also that if you use file system wildcards it's likely your shell
-interprets the wildcarding before passing the command to gsutil. For example:
- % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse
-would likely be expanded by the shell into the following before running gsutil:
- % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse
-
-Note also that most shells don't support '**' wildcarding (I think only
-zsh does). If you want to use '**' wildcarding with such a shell you can
-single quote each wildcarded string, so it gets passed uninterpreted by the
-shell to gsutil (at which point gsutil will perform the wildcarding expansion):
- % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse
-"""
-
-import boto
-import fnmatch
-import glob
-import os
-import re
-import sys
-import urllib
-
-from boto.s3.prefix import Prefix
-from boto.storage_uri import BucketStorageUri
-from bucket_listing_ref import BucketListingRef
-
-# Regex to determine if a string contains any wildcards.
-WILDCARD_REGEX = re.compile('[*?\[\]]')
-
-WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'
-WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'
-
-
-class WildcardIterator(object):
- """Base class for wildcarding over StorageUris.
-
- This class implements support for iterating over StorageUris that
- contain wildcards.
-
- The base class is abstract; you should instantiate using the
- wildcard_iterator() static factory method, which chooses the right
- implementation depending on the StorageUri.
- """
-
- def __repr__(self):
- """Returns string representation of WildcardIterator."""
- return 'WildcardIterator(%s)' % self.wildcard_uri
-
-
-class CloudWildcardIterator(WildcardIterator):
- """WildcardIterator subclass for buckets and objects.
-
- Iterates over BucketListingRef matching the StorageUri wildcard. It's
- much more efficient to request the Key from the BucketListingRef (via
- GetKey()) than to request the StorageUri and then call uri.get_key()
- to retrieve the key, for cases where you want to get metadata that's
- available in the Bucket (for example to get the name and size of
- each object), because that information is available in the bucket GET
- results. If you were to iterate over URIs for such cases and then get
- the name and size info from each resulting StorageUri, it would cause
- an additional object GET request for each of the result URIs.
- """
-
- def __init__(self, wildcard_uri, proj_id_handler,
- bucket_storage_uri_class=BucketStorageUri, all_versions=False,
- headers=None, debug=0):
- """
- Instantiates an iterator over BucketListingRef matching given wildcard URI.
-
- Args:
- wildcard_uri: StorageUri that contains the wildcard to iterate.
- proj_id_handler: ProjectIdHandler to use for current command.
- bucket_storage_uri_class: BucketStorageUri interface.
- Settable for testing/mocking.
- headers: Dictionary containing optional HTTP headers to pass to boto.
- debug: Debug level to pass in to boto connection (range 0..3).
- """
- self.wildcard_uri = wildcard_uri
- # Make a copy of the headers so any updates we make during wildcard
- # expansion aren't left in the input params (specifically, so we don't
- # include the x-goog-project-id header needed by a subset of cases, in
- # the data returned to caller, which could then be used in other cases
- # where that header must not be passed).
- if headers is None:
- self.headers = {}
- else:
- self.headers = headers.copy()
- self.proj_id_handler = proj_id_handler
- self.bucket_storage_uri_class = bucket_storage_uri_class
- self.all_versions = all_versions
- self.debug = debug
-
- def __iter__(self):
- """Python iterator that gets called when iterating over cloud wildcard.
-
- Yields:
- BucketListingRef, or empty iterator if no matches.
- """
- # First handle bucket wildcarding, if any.
- if ContainsWildcard(self.wildcard_uri.bucket_name):
- regex = fnmatch.translate(self.wildcard_uri.bucket_name)
- bucket_uris = []
- prog = re.compile(regex)
- self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR,
- self.wildcard_uri,
- self.headers)
- for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
- if prog.match(b.name):
- # Use str(b.name) because get_all_buckets() returns Unicode
- # string, which when used to construct x-goog-copy-src metadata
- # requests for object-to-object copies causes pathname '/' chars
- # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
- # which causes the request to fail.
- uri_str = '%s://%s' % (self.wildcard_uri.scheme,
- urllib.quote_plus(str(b.name)))
- bucket_uris.append(
- boto.storage_uri(
- uri_str, debug=self.debug,
- bucket_storage_uri_class=self.bucket_storage_uri_class,
- suppress_consec_slashes=False))
- else:
- bucket_uris = [self.wildcard_uri.clone_replace_name('')]
-
- # Now iterate over bucket(s), and handle object wildcarding, if any.
- self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR,
- self.wildcard_uri,
- self.headers)
- for bucket_uri in bucket_uris:
- if self.wildcard_uri.names_bucket():
- # Bucket-only URI.
- yield BucketListingRef(bucket_uri, key=None, prefix=None,
- headers=self.headers)
- else:
- # URI contains an object name. If there's no wildcard just yield
- # the needed URI.
- if not ContainsWildcard(self.wildcard_uri.object_name):
- uri_to_yield = bucket_uri.clone_replace_name(
- self.wildcard_uri.object_name)
- yield BucketListingRef(uri_to_yield, key=None, prefix=None,
- headers=self.headers)
- else:
- # URI contains a wildcard. Expand iteratively by building
- # prefix/delimiter bucket listing request, filtering the results per
- # the current level's wildcard, and continuing with the next component
- # of the wildcard. See _BuildBucketFilterStrings() documentation
- # for details.
- #
- # Initialize the iteration with bucket name from bucket_uri but
- # object name from self.wildcard_uri. This is needed to handle cases
- # where both the bucket and object names contain wildcards.
- uris_needing_expansion = [
- bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
- while len(uris_needing_expansion) > 0:
- uri = uris_needing_expansion.pop(0)
- (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
- self._BuildBucketFilterStrings(uri.object_name))
- prog = re.compile(fnmatch.translate(prefix_wildcard))
- # List bucket for objects matching prefix up to delimiter.
- for key in bucket_uri.list_bucket(prefix=prefix,
- delimiter=delimiter,
- headers=self.headers,
- all_versions=self.all_versions):
- # Check that the prefix regex matches rstripped key.name (to
- # correspond with the rstripped prefix_wildcard from
- # _BuildBucketFilterStrings()).
- if prog.match(key.name.rstrip('/')):
- if suffix_wildcard and key.name.rstrip('/') != suffix_wildcard:
- if isinstance(key, Prefix):
- # There's more wildcard left to expand.
- uris_needing_expansion.append(
- uri.clone_replace_name(key.name.rstrip('/') + '/'
- + suffix_wildcard))
- else:
- # Done expanding.
- expanded_uri = uri.clone_replace_key(key)
-
- if isinstance(key, Prefix):
- yield BucketListingRef(expanded_uri, key=None, prefix=key,
- headers=self.headers)
- else:
- if self.all_versions:
- yield BucketListingRef(expanded_uri, key=key, prefix=None,
- headers=self.headers)
- else:
- # Yield BLR wrapping version-less URI.
- yield BucketListingRef(expanded_uri.clone_replace_name(
- expanded_uri.object_name), key=key, prefix=None,
- headers=self.headers)
-
- def _BuildBucketFilterStrings(self, wildcard):
- """
- Builds strings needed for querying a bucket and filtering results to
- implement wildcard object name matching.
-
- Args:
- wildcard: The wildcard string to match to objects.
-
- Returns:
- (prefix, delimiter, prefix_wildcard, suffix_wildcard)
- where:
- prefix is the prefix to be sent in bucket GET request.
- delimiter is the delimiter to be sent in bucket GET request.
- prefix_wildcard is the wildcard to be used to filter bucket GET results.
- suffix_wildcard is wildcard to be appended to filtered bucket GET
- results for next wildcard expansion iteration.
- For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
- would build prefix= abc/d, delimiter=/, prefix_wildcard=d*e, and
- suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket
- listing request will then produce a listing result set that can be
- filtered using this prefix_wildcard; and we'd use this suffix_wildcard
- to feed into the next call(s) to _BuildBucketFilterStrings(), for the
- next iteration of listing/filtering.
-
- Raises:
- AssertionError if wildcard doesn't contain any wildcard chars.
- """
- # Generate a request prefix if the object name part of the wildcard starts
- # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').
- match = WILDCARD_REGEX.search(wildcard)
- if not match:
- # Input "wildcard" has no wildcard chars, so just return tuple that will
- # cause a bucket listing to match the given input wildcard. Example: if
- # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,
- # the next iteration will call _BuildBucketFilterStrings() with
- # gs://bucket/dir/abc, and we will return prefix ='dir/abc',
- # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.
- prefix = wildcard
- delimiter = '/'
- prefix_wildcard = wildcard
- suffix_wildcard = ''
- else:
- if match.start() > 0:
- # Wildcard does not occur at beginning of object name, so construct a
- # prefix string to send to server.
- prefix = wildcard[:match.start()]
- wildcard_part = wildcard[match.start():]
- else:
- prefix = None
- wildcard_part = wildcard
- end = wildcard_part.find('/')
- if end != -1:
- wildcard_part = wildcard_part[:end+1]
- # Remove trailing '/' so we will match gs://bucket/abc* as well as
- # gs://bucket/abc*/ with the same wildcard regex.
- prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')
- suffix_wildcard = wildcard[match.end():]
- end = suffix_wildcard.find('/')
- if end == -1:
- suffix_wildcard = ''
- else:
- suffix_wildcard = suffix_wildcard[end+1:]
- # To implement recursive (**) wildcarding, if prefix_wildcard
- # suffix_wildcard starts with '**' don't send a delimiter, and combine
- # suffix_wildcard at end of prefix_wildcard.
- if prefix_wildcard.find('**') != -1:
- delimiter = None
- prefix_wildcard = prefix_wildcard + suffix_wildcard
- suffix_wildcard = ''
- else:
- delimiter = '/'
- delim_pos = suffix_wildcard.find(delimiter)
- # The following debug output is useful for tracing how the algorithm
- # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
- if self.debug > 1:
- sys.stderr.write(
- 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
- 'prefix_wildcard=%s, suffix_wildcard=%s\n' %
- (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
- return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
-
- def IterKeys(self):
- """
- Convenience iterator that runs underlying iterator and returns Key for each
- iteration.
-
- Yields:
- Subclass of boto.s3.key.Key, or empty iterator if no matches.
-
- Raises:
- WildcardException: for bucket-only uri.
- """
- for bucket_listing_ref in self. __iter__():
- if bucket_listing_ref.HasKey():
- yield bucket_listing_ref.GetKey()
-
- def IterUris(self):
- """
- Convenience iterator that runs underlying iterator and returns StorageUri
- for each iteration.
-
- Yields:
- StorageUri, or empty iterator if no matches.
- """
- for bucket_listing_ref in self. __iter__():
- yield bucket_listing_ref.GetUri()
-
- def IterUrisForKeys(self):
- """
- Convenience iterator that runs underlying iterator and returns the
- StorageUri for each iterated BucketListingRef that has a Key.
-
- Yields:
- StorageUri, or empty iterator if no matches.
- """
- for bucket_listing_ref in self. __iter__():
- if bucket_listing_ref.HasKey():
- yield bucket_listing_ref.GetUri()
-
-
-class FileWildcardIterator(WildcardIterator):
- """WildcardIterator subclass for files and directories.
-
- If you use recursive wildcards ('**') only a single such wildcard is
- supported. For example you could use the wildcard '**/*.txt' to list all .txt
- files in any subdirectory of the current directory, but you couldn't use a
- wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
- files in any subdirectory named 'abc').
- """
-
- def __init__(self, wildcard_uri, headers=None, debug=0):
- """
- Instantiate an iterator over BucketListingRefs matching given wildcard URI.
-
- Args:
- wildcard_uri: StorageUri that contains the wildcard to iterate.
- headers: Dictionary containing optional HTTP headers to pass to boto.
- debug: Debug level to pass in to boto connection (range 0..3).
- """
- self.wildcard_uri = wildcard_uri
- self.headers = headers
- self.debug = debug
-
- def __iter__(self):
- wildcard = self.wildcard_uri.object_name
- match = re.search('\*\*', wildcard)
- if match:
- # Recursive wildcarding request ('.../**/...').
- # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
- base_dir = wildcard[:match.start()-1]
- remaining_wildcard = wildcard[match.start()+2:]
- # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
- # remaining_wildcard = '/*'
- if remaining_wildcard.startswith('*'):
- raise WildcardException('Invalid wildcard with more than 2 consecutive '
- '*s (%s)' % wildcard)
- # If there was no remaining wildcard past the recursive wildcard,
- # treat it as if it were a '*'. For example, file://tmp/** is equivalent
- # to file://tmp/**/*
- if not remaining_wildcard:
- remaining_wildcard = '*'
- # Skip slash(es).
- remaining_wildcard = remaining_wildcard.lstrip(os.sep)
- filepaths = []
- for dirpath, unused_dirnames, filenames in os.walk(base_dir):
- filepaths.extend(
- os.path.join(dirpath, f) for f in fnmatch.filter(filenames,
- remaining_wildcard)
- )
- else:
- # Not a recursive wildcarding request.
- filepaths = glob.glob(wildcard)
- for filepath in filepaths:
- expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
- yield BucketListingRef(expanded_uri)
-
- def IterKeys(self):
- """
- Placeholder to allow polymorphic use of WildcardIterator.
-
- Raises:
- WildcardException: in all cases.
- """
- raise WildcardException(
- 'Iterating over Keys not possible for file wildcards')
-
- def IterUris(self):
- """
- Convenience iterator that runs underlying iterator and returns StorageUri
- for each iteration.
-
- Yields:
- StorageUri, or empty iterator if no matches.
- """
- for bucket_listing_ref in self. __iter__():
- yield bucket_listing_ref.GetUri()
-
-
-class WildcardException(StandardError):
- """Exception thrown for invalid wildcard URIs."""
-
- def __init__(self, reason):
- StandardError.__init__(self)
- self.reason = reason
-
- def __repr__(self):
- return 'WildcardException: %s' % self.reason
-
- def __str__(self):
- return 'WildcardException: %s' % self.reason
-
-
-def wildcard_iterator(uri_or_str, proj_id_handler,
- bucket_storage_uri_class=BucketStorageUri,
- all_versions=False,
- headers=None, debug=0):
- """Instantiate a WildCardIterator for the given StorageUri.
-
- Args:
- uri_or_str: StorageUri or URI string naming wildcard objects to iterate.
- proj_id_handler: ProjectIdHandler to use for current command.
- bucket_storage_uri_class: BucketStorageUri interface.
- Settable for testing/mocking.
- headers: Dictionary containing optional HTTP headers to pass to boto.
- debug: Debug level to pass in to boto connection (range 0..3).
-
- Returns:
- A WildcardIterator that handles the requested iteration.
- """
-
- if isinstance(uri_or_str, basestring):
- # Disable enforce_bucket_naming, to allow bucket names containing wildcard
- # chars.
- uri = boto.storage_uri(
- uri_or_str, debug=debug, validate=False,
- bucket_storage_uri_class=bucket_storage_uri_class,
- suppress_consec_slashes=False)
- else:
- uri = uri_or_str
-
- if uri.is_cloud_uri():
- return CloudWildcardIterator(
- uri, proj_id_handler,
- bucket_storage_uri_class=bucket_storage_uri_class,
- all_versions=all_versions,
- headers=headers,
- debug=debug)
- elif uri.is_file_uri():
- return FileWildcardIterator(uri, headers=headers, debug=debug)
- else:
- raise WildcardException('Unexpected type of StorageUri (%s)' % uri)
-
-
-def ContainsWildcard(uri_or_str):
- """Checks whether uri_or_str contains a wildcard.
-
- Args:
- uri_or_str: StorageUri or URI string to check.
-
- Returns:
- bool indicator.
- """
- if isinstance(uri_or_str, basestring):
- return bool(WILDCARD_REGEX.search(uri_or_str))
- else:
- return bool(WILDCARD_REGEX.search(uri_or_str.uri))
« no previous file with comments | « third_party/gsutil/gslib/util.py ('k') | third_party/gsutil/gsutil » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698