| Index: third_party/gsutil/gslib/name_expansion.py
|
| diff --git a/third_party/gsutil/gslib/name_expansion.py b/third_party/gsutil/gslib/name_expansion.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..0d8b6cae9bbb4b1dec88942eeca0ed4c70328497
|
| --- /dev/null
|
| +++ b/third_party/gsutil/gslib/name_expansion.py
|
| @@ -0,0 +1,530 @@
|
| +# -*- coding: utf-8 -*-
|
| +# Copyright 2012 Google Inc. All Rights Reserved.
|
| +#
|
| +# Licensed under the Apache License, Version 2.0 (the "License");
|
| +# you may not use this file except in compliance with the License.
|
| +# You may obtain a copy of the License at
|
| +#
|
| +# http://www.apache.org/licenses/LICENSE-2.0
|
| +#
|
| +# Unless required by applicable law or agreed to in writing, software
|
| +# distributed under the License is distributed on an "AS IS" BASIS,
|
| +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| +# See the License for the specific language governing permissions and
|
| +# limitations under the License.
|
| +"""Name expansion iterator and result classes.
|
| +
|
| +Name expansion support for the various ways gsutil lets users refer to
|
| +collections of data (via explicit wildcarding as well as directory,
|
| +bucket, and bucket subdir implicit wildcarding). This class encapsulates
|
| +the various rules for determining how these expansions are done.
|
| +"""
|
| +
|
| +# Disable warnings for NameExpansionIteratorQueue functions; they implement
|
| +# an interface which does not follow lint guidelines.
|
| +# pylint: disable=invalid-name
|
| +
|
| +from __future__ import absolute_import
|
| +
|
| +import multiprocessing
|
| +import os
|
| +import sys
|
| +
|
| +from gslib.exception import CommandException
|
| +from gslib.plurality_checkable_iterator import PluralityCheckableIterator
|
| +import gslib.wildcard_iterator
|
| +from gslib.wildcard_iterator import StorageUrlFromString
|
| +
|
| +
|
class NameExpansionResult(object):
  """Holds one fully expanded result from iterating over NameExpansionIterator.

  The member data in this class need to be pickleable because
  NameExpansionResult instances are passed through Multiprocessing.Queue. In
  particular, don't include any boto state like StorageUri, since that pulls
  in a big tree of objects, some of which aren't pickleable (and even if
  they were, pickling/unpickling such a large object tree would result in
  significant overhead).

  The state held in this object is needed for handling the various naming cases
  (e.g., copying from a single source URL to a directory generates different
  dest URL names than copying multiple URLs to a directory, to be consistent
  with naming rules used by the Unix cp command). For more details see comments
  in _NameExpansionIterator.
  """

  def __init__(self, source_storage_url, is_multi_source_request,
               names_container, expanded_storage_url):
    """Instantiates a result from name expansion.

    Args:
      source_storage_url: StorageUrl that was being expanded.
      is_multi_source_request: Bool indicator whether the request names more
          than one source (multiple URL strings were supplied, or the source
          URL expanded to more than one result).
      names_container: Bool indicator whether the source URL names a container.
      expanded_storage_url: StorageUrl that was expanded.
    """
    self.source_storage_url = source_storage_url
    self.is_multi_source_request = is_multi_source_request
    self.names_container = names_container
    self.expanded_storage_url = expanded_storage_url

  def __repr__(self):
    """Returns the string form of the expanded storage URL."""
    # The attribute is stored without a leading underscore in __init__;
    # reading self._expanded_storage_url here raised AttributeError.
    return '%s' % self.expanded_storage_url
|
| +
|
| +
|
| +class _NameExpansionIterator(object):
|
| + """Class that iterates over all source URLs passed to the iterator.
|
| +
|
| + See details in __iter__ function doc.
|
| + """
|
| +
|
| + def __init__(self, command_name, debug, logger, gsutil_api, url_strs,
|
| + recursion_requested, all_versions=False,
|
| + cmd_supports_recursion=True, project_id=None,
|
| + continue_on_error=False):
|
| + """Creates a NameExpansionIterator.
|
| +
|
| + Args:
|
| + command_name: name of command being run.
|
| + debug: Debug level to pass to underlying iterators (range 0..3).
|
| + logger: logging.Logger object.
|
| + gsutil_api: Cloud storage interface. Settable for testing/mocking.
|
| + url_strs: PluralityCheckableIterator of URL strings needing expansion.
|
| + recursion_requested: True if -r specified on command-line. If so,
|
| + listings will be flattened so mapped-to results contain objects
|
| + spanning subdirectories.
|
| + all_versions: Bool indicating whether to iterate over all object versions.
|
| + cmd_supports_recursion: Bool indicating whether this command supports a
|
| + '-r' flag. Useful for printing helpful error messages.
|
| + project_id: Project id to use for bucket retrieval.
|
| + continue_on_error: If true, yield no-match exceptions encountered during
|
| + iteration instead of raising them.
|
| +
|
| + Examples of _NameExpansionIterator with recursion_requested=True:
|
| + - Calling with one of the url_strs being 'gs://bucket' will enumerate all
|
| + top-level objects, as will 'gs://bucket/' and 'gs://bucket/*'.
|
| + - 'gs://bucket/**' will enumerate all objects in the bucket.
|
| + - 'gs://bucket/abc' will enumerate either the single object abc or, if
|
| + abc is a subdirectory, all objects under abc and any of its
|
| + subdirectories.
|
| + - 'gs://bucket/abc/**' will enumerate all objects under abc or any of its
|
| + subdirectories.
|
| + - 'file:///tmp' will enumerate all files under /tmp, as will
|
| + 'file:///tmp/*'
|
| + - 'file:///tmp/**' will enumerate all files under /tmp or any of its
|
| + subdirectories.
|
| +
|
| + Example if recursion_requested=False:
|
| + calling with gs://bucket/abc/* lists matching objects
|
| + or subdirs, but not sub-subdirs or objects beneath subdirs.
|
| +
|
| + Note: In step-by-step comments below we give examples assuming there's a
|
| + gs://bucket with object paths:
|
| + abcd/o1.txt
|
| + abcd/o2.txt
|
| + xyz/o1.txt
|
| + xyz/o2.txt
|
| + and a directory file://dir with file paths:
|
| + dir/a.txt
|
| + dir/b.txt
|
| + dir/c/
|
| + """
|
| + self.command_name = command_name
|
| + self.debug = debug
|
| + self.logger = logger
|
| + self.gsutil_api = gsutil_api
|
| + self.url_strs = url_strs
|
| + self.recursion_requested = recursion_requested
|
| + self.all_versions = all_versions
|
| + # Check self.url_strs.HasPlurality() at start because its value can change
|
| + # if url_strs is itself an iterator.
|
| + self.url_strs.has_plurality = self.url_strs.HasPlurality()
|
| + self.cmd_supports_recursion = cmd_supports_recursion
|
| + self.project_id = project_id
|
| + self.continue_on_error = continue_on_error
|
| +
|
| + # Map holding wildcard strings to use for flat vs subdir-by-subdir listings.
|
| + # (A flat listing means show all objects expanded all the way down.)
|
| + self._flatness_wildcard = {True: '**', False: '*'}
|
| +
|
| + def __iter__(self):
|
| + """Iterates over all source URLs passed to the iterator.
|
| +
|
| + For each src url, expands wildcards, object-less bucket names,
|
| + subdir bucket names, and directory names, and generates a flat listing of
|
| + all the matching objects/files.
|
| +
|
| + You should instantiate this object using the static factory function
|
| + NameExpansionIterator, because consumers of this iterator need the
|
| + PluralityCheckableIterator wrapper built by that function.
|
| +
|
| + Yields:
|
| + gslib.name_expansion.NameExpansionResult.
|
| +
|
| + Raises:
|
| + CommandException: if errors encountered.
|
| + """
|
| + for url_str in self.url_strs:
|
| + storage_url = StorageUrlFromString(url_str)
|
| +
|
| + if storage_url.IsFileUrl() and storage_url.IsStream():
|
| + if self.url_strs.has_plurality:
|
| + raise CommandException('Multiple URL strings are not supported '
|
| + 'with streaming ("-") URLs.')
|
| + yield NameExpansionResult(storage_url, False, False, storage_url)
|
| + continue
|
| +
|
| + # Step 1: Expand any explicitly specified wildcards. The output from this
|
| + # step is an iterator of BucketListingRef.
|
| + # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd
|
| +
|
| + src_names_bucket = False
|
| + if (storage_url.IsCloudUrl() and storage_url.IsBucket()
|
| + and not self.recursion_requested):
|
| + # UNIX commands like rm and cp will omit directory references.
|
| + # If url_str refers only to buckets and we are not recursing,
|
| + # then produce references of type BUCKET, because they are guaranteed
|
| + # to pass through Step 2 and be omitted in Step 3.
|
| + post_step1_iter = PluralityCheckableIterator(
|
| + self.WildcardIterator(url_str).IterBuckets(
|
| + bucket_fields=['id']))
|
| + else:
|
| + # Get a list of objects and prefixes, expanding the top level for
|
| + # any listed buckets. If our source is a bucket, however, we need
|
| + # to treat all of the top level expansions as names_container=True.
|
| + post_step1_iter = PluralityCheckableIterator(
|
| + self.WildcardIterator(url_str).IterAll(
|
| + bucket_listing_fields=['name'],
|
| + expand_top_level_buckets=True))
|
| + if storage_url.IsCloudUrl() and storage_url.IsBucket():
|
| + src_names_bucket = True
|
| +
|
| + # Step 2: Expand bucket subdirs. The output from this
|
| + # step is an iterator of (names_container, BucketListingRef).
|
| + # Starting with gs://bucket/abcd this step would expand to:
|
| + # iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
|
| + subdir_exp_wildcard = self._flatness_wildcard[self.recursion_requested]
|
| + if self.recursion_requested:
|
| + post_step2_iter = _ImplicitBucketSubdirIterator(
|
| + self, post_step1_iter, subdir_exp_wildcard)
|
| + else:
|
| + post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
|
| + post_step2_iter = PluralityCheckableIterator(post_step2_iter)
|
| +
|
| + # Because we actually perform and check object listings here, this will
|
| + # raise if url_args includes a non-existent object. However,
|
| + # plurality_checkable_iterator will buffer the exception for us, not
|
| + # raising it until the iterator is actually asked to yield the first
|
| + # result.
|
| + if post_step2_iter.IsEmpty():
|
| + if self.continue_on_error:
|
| + try:
|
| + raise CommandException('No URLs matched: %s' % url_str)
|
| + except CommandException, e:
|
| + # Yield a specialized tuple of (exception, stack_trace) to
|
| + # the wrapping PluralityCheckableIterator.
|
| + yield (e, sys.exc_info()[2])
|
| + else:
|
| + raise CommandException('No URLs matched: %s' % url_str)
|
| +
|
| + # Step 3. Omit any directories, buckets, or bucket subdirectories for
|
| + # non-recursive expansions.
|
| + post_step3_iter = PluralityCheckableIterator(_OmitNonRecursiveIterator(
|
| + post_step2_iter, self.recursion_requested, self.command_name,
|
| + self.cmd_supports_recursion, self.logger))
|
| +
|
| + src_url_expands_to_multi = post_step3_iter.HasPlurality()
|
| + is_multi_source_request = (self.url_strs.has_plurality
|
| + or src_url_expands_to_multi)
|
| +
|
| + # Step 4. Expand directories and buckets. This step yields the iterated
|
| + # values. Starting with gs://bucket this step would expand to:
|
| + # [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
|
| + # Starting with file://dir this step would expand to:
|
| + # [dir/a.txt, dir/b.txt, dir/c/]
|
| + for (names_container, blr) in post_step3_iter:
|
| + src_names_container = src_names_bucket or names_container
|
| +
|
| + if blr.IsObject():
|
| + yield NameExpansionResult(
|
| + storage_url, is_multi_source_request, src_names_container,
|
| + blr.storage_url)
|
| + else:
|
| + # Use implicit wildcarding to do the enumeration.
|
| + # At this point we are guaranteed that:
|
| + # - Recursion has been requested because non-object entries are
|
| + # filtered in step 3 otherwise.
|
| + # - This is a prefix or bucket subdirectory because only
|
| + # non-recursive iterations product bucket references.
|
| + expanded_url = StorageUrlFromString(blr.url_string)
|
| + if expanded_url.IsFileUrl():
|
| + # Convert dir to implicit recursive wildcard.
|
| + url_to_iterate = '%s%s%s' % (blr, os.sep, subdir_exp_wildcard)
|
| + else:
|
| + # Convert subdir to implicit recursive wildcard.
|
| + url_to_iterate = expanded_url.CreatePrefixUrl(
|
| + wildcard_suffix=subdir_exp_wildcard)
|
| +
|
| + wc_iter = PluralityCheckableIterator(
|
| + self.WildcardIterator(url_to_iterate).IterObjects(
|
| + bucket_listing_fields=['name']))
|
| + src_url_expands_to_multi = (src_url_expands_to_multi
|
| + or wc_iter.HasPlurality())
|
| + is_multi_source_request = (self.url_strs.has_plurality
|
| + or src_url_expands_to_multi)
|
| + # This will be a flattened listing of all underlying objects in the
|
| + # subdir.
|
| + for blr in wc_iter:
|
| + yield NameExpansionResult(
|
| + storage_url, is_multi_source_request, True, blr.storage_url)
|
| +
|
| + def WildcardIterator(self, url_string):
|
| + """Helper to instantiate gslib.WildcardIterator.
|
| +
|
| + Args are same as gslib.WildcardIterator interface, but this method fills
|
| + in most of the values from instance state.
|
| +
|
| + Args:
|
| + url_string: URL string naming wildcard objects to iterate.
|
| +
|
| + Returns:
|
| + Wildcard iterator over URL string.
|
| + """
|
| + return gslib.wildcard_iterator.CreateWildcardIterator(
|
| + url_string, self.gsutil_api, debug=self.debug,
|
| + all_versions=self.all_versions,
|
| + project_id=self.project_id)
|
| +
|
| +
|
def NameExpansionIterator(command_name, debug, logger, gsutil_api, url_strs,
                          recursion_requested, all_versions=False,
                          cmd_supports_recursion=True, project_id=None,
                          continue_on_error=False):
  """Static factory function for instantiating _NameExpansionIterator.

  Wraps url_strs in a PluralityCheckableIterator (so callers may pass either
  an array or an iterator), builds the _NameExpansionIterator around it, wraps
  that in a second PluralityCheckableIterator, and verifies the result is
  non-empty before handing it back.

  Args:
    command_name: name of command being run.
    debug: Debug level to pass to underlying iterators (range 0..3).
    logger: logging.Logger object.
    gsutil_api: Cloud storage interface.  Settable for testing/mocking.
    url_strs: Iterable URL strings needing expansion.
    recursion_requested: True if -r specified on command-line.  If so,
        listings will be flattened so mapped-to results contain objects
        spanning subdirectories.
    all_versions: Bool indicating whether to iterate over all object versions.
    cmd_supports_recursion: Bool indicating whether this command supports a '-r'
        flag. Useful for printing helpful error messages.
    project_id: Project id to use for the current command.
    continue_on_error: If true, yield no-match exceptions encountered during
        iteration instead of raising them.

  Raises:
    CommandException if underlying iterator is empty.

  Returns:
    Name expansion iterator instance.

  For example semantics, see comments in _NameExpansionIterator.__init__.
  """
  checkable_url_strs = PluralityCheckableIterator(url_strs)
  optional_args = {
      'all_versions': all_versions,
      'cmd_supports_recursion': cmd_supports_recursion,
      'project_id': project_id,
      'continue_on_error': continue_on_error,
  }
  expansion = _NameExpansionIterator(
      command_name, debug, logger, gsutil_api, checkable_url_strs,
      recursion_requested, **optional_args)
  checkable_expansion = PluralityCheckableIterator(expansion)
  if not checkable_expansion.IsEmpty():
    return checkable_expansion
  raise CommandException('No URLs matched')
|
| +
|
| +
|
class NameExpansionIteratorQueue(object):
  """Wrapper around NameExpansionIterator with Multiprocessing.Queue interface.

  Only a blocking get() function can be called, and the block and timeout
  params on that function are ignored.  All other class functions raise
  NotImplementedError.

  This class is thread safe.
  """

  def __init__(self, name_expansion_iterator, final_value):
    """Instantiates the queue wrapper.

    Args:
      name_expansion_iterator: Iterator to drain; get() calls its IsEmpty()
          and next() methods.
      final_value: Value that get() returns once the iterator is empty.
    """
    self.name_expansion_iterator = name_expansion_iterator
    self.final_value = final_value
    # Lock that serializes access to the iterator in get().
    self.lock = multiprocessing.Manager().Lock()

  def qsize(self):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.qsize() not implemented')

  def empty(self):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.empty() not implemented')

  def full(self):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.full() not implemented')

  # pylint: disable=unused-argument
  def put(self, obj=None, block=None, timeout=None):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.put() not implemented')

  def put_nowait(self, obj):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.put_nowait() not implemented')

  # pylint: disable=unused-argument
  def get(self, block=None, timeout=None):
    """Returns the next expansion result, or final_value when exhausted.

    Args:
      block: Ignored.
      timeout: Ignored.

    Returns:
      The next value from the wrapped iterator, or self.final_value if the
      iterator reports that it is empty.
    """
    # Hold the lock across both the emptiness check and the fetch so the two
    # steps are not interleaved with another caller's.
    with self.lock:
      pending = self.name_expansion_iterator
      if not pending.IsEmpty():
        return pending.next()
      return self.final_value

  def get_nowait(self):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.get_nowait() not implemented')

  def get_no_wait(self):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.get_no_wait() not implemented')

  def close(self):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.close() not implemented')

  def join_thread(self):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.join_thread() not implemented')

  def cancel_join_thread(self):
    """Not supported; always raises NotImplementedError."""
    raise NotImplementedError(
        'NameExpansionIteratorQueue.cancel_join_thread() not implemented')
|
| +
|
| +
|
| +class _NonContainerTuplifyIterator(object):
|
| + """Iterator that produces the tuple (False, blr) for each iterated value.
|
| +
|
| + Used for cases where blr_iter iterates over a set of
|
| + BucketListingRefs known not to name containers.
|
| + """
|
| +
|
| + def __init__(self, blr_iter):
|
| + """Instantiates iterator.
|
| +
|
| + Args:
|
| + blr_iter: iterator of BucketListingRef.
|
| + """
|
| + self.blr_iter = blr_iter
|
| +
|
| + def __iter__(self):
|
| + for blr in self.blr_iter:
|
| + yield (False, blr)
|
| +
|
| +
|
| +class _OmitNonRecursiveIterator(object):
|
| + """Iterator wrapper for that omits certain values for non-recursive requests.
|
| +
|
| + This iterates over tuples of (names_container, BucketListingReference) and
|
| + omits directories, prefixes, and buckets from non-recurisve requests
|
| + so that we can properly calculate whether the source URL expands to multiple
|
| + URLs.
|
| +
|
| + For example, if we have a bucket containing two objects: bucket/foo and
|
| + bucket/foo/bar and we do a non-recursive iteration, only bucket/foo will be
|
| + yielded.
|
| + """
|
| +
|
| + def __init__(self, tuple_iter, recursion_requested, command_name,
|
| + cmd_supports_recursion, logger):
|
| + """Instanties the iterator.
|
| +
|
| + Args:
|
| + tuple_iter: Iterator over names_container, BucketListingReference
|
| + from step 2 in the NameExpansionIterator
|
| + recursion_requested: If false, omit buckets, dirs, and subdirs
|
| + command_name: Command name for user messages
|
| + cmd_supports_recursion: Command recursion support for user messages
|
| + logger: Log object for user messages
|
| + """
|
| + self.tuple_iter = tuple_iter
|
| + self.recursion_requested = recursion_requested
|
| + self.command_name = command_name
|
| + self.cmd_supports_recursion = cmd_supports_recursion
|
| + self.logger = logger
|
| +
|
| + def __iter__(self):
|
| + for (names_container, blr) in self.tuple_iter:
|
| + if not self.recursion_requested and not blr.IsObject():
|
| + # At this point we either have a bucket or a prefix,
|
| + # so if recursion is not requested, we're going to omit it.
|
| + expanded_url = StorageUrlFromString(blr.url_string)
|
| + if expanded_url.IsFileUrl():
|
| + desc = 'directory'
|
| + else:
|
| + desc = blr.type_name
|
| + if self.cmd_supports_recursion:
|
| + self.logger.info(
|
| + 'Omitting %s "%s". (Did you mean to do %s -r?)',
|
| + desc, blr.url_string, self.command_name)
|
| + else:
|
| + self.logger.info('Omitting %s "%s".', desc, blr.url_string)
|
| + else:
|
| + yield (names_container, blr)
|
| +
|
| +
|
| +class _ImplicitBucketSubdirIterator(object):
|
| + """Iterator wrapper that performs implicit bucket subdir expansion.
|
| +
|
| + Each iteration yields tuple (names_container, expanded BucketListingRefs)
|
| + where names_container is true if URL names a directory, bucket,
|
| + or bucket subdir.
|
| +
|
| + For example, iterating over [BucketListingRef("gs://abc")] would expand to:
|
| + [BucketListingRef("gs://abc/o1"), BucketListingRef("gs://abc/o2")]
|
| + if those subdir objects exist, and [BucketListingRef("gs://abc") otherwise.
|
| + """
|
| +
|
| + def __init__(self, name_exp_instance, blr_iter, subdir_exp_wildcard):
|
| + """Instantiates the iterator.
|
| +
|
| + Args:
|
| + name_exp_instance: calling instance of NameExpansion class.
|
| + blr_iter: iterator over BucketListingRef prefixes and objects.
|
| + subdir_exp_wildcard: wildcard for expanding subdirectories;
|
| + expected values are ** if the mapped-to results should contain
|
| + objects spanning subdirectories, or * if only one level should
|
| + be listed.
|
| + """
|
| + self.blr_iter = blr_iter
|
| + self.name_exp_instance = name_exp_instance
|
| + self.subdir_exp_wildcard = subdir_exp_wildcard
|
| +
|
| + def __iter__(self):
|
| + for blr in self.blr_iter:
|
| + if blr.IsPrefix():
|
| + # This is a bucket subdirectory, list objects according to the wildcard.
|
| + prefix_url = StorageUrlFromString(blr.url_string).CreatePrefixUrl(
|
| + wildcard_suffix=self.subdir_exp_wildcard)
|
| + implicit_subdir_iterator = PluralityCheckableIterator(
|
| + self.name_exp_instance.WildcardIterator(
|
| + prefix_url).IterAll(bucket_listing_fields=['name']))
|
| + if not implicit_subdir_iterator.IsEmpty():
|
| + for exp_blr in implicit_subdir_iterator:
|
| + yield (True, exp_blr)
|
| + else:
|
| + # Prefix that contains no objects, for example in the $folder$ case
|
| + # or an empty filesystem directory.
|
| + yield (False, blr)
|
| + elif blr.IsObject():
|
| + yield (False, blr)
|
| + else:
|
| + raise CommandException(
|
| + '_ImplicitBucketSubdirIterator got a bucket reference %s' % blr)
|
|
|