Chromium Code Reviews

Unified Diff: gslib/wildcard_iterator.py

Issue 698893003: Update checked in version of gsutil to version 4.6 (Closed) Base URL: http://dart.googlecode.com/svn/third_party/gsutil/
Patch Set: Created 6 years, 1 month ago
Index: gslib/wildcard_iterator.py
===================================================================
--- gslib/wildcard_iterator.py (revision 33376)
+++ gslib/wildcard_iterator.py (working copy)
@@ -1,245 +1,236 @@
+# -*- coding: utf-8 -*-
# Copyright 2010 Google Inc. All Rights Reserved.
#
-# Permission is hereby granted, free of charge, to any person obtaining a
-# copy of this software and associated documentation files (the
-# "Software"), to deal in the Software without restriction, including
-# without limitation the rights to use, copy, modify, merge, publish, dis-
-# tribute, sublicense, and/or sell copies of the Software, and to permit
-# persons to whom the Software is furnished to do so, subject to the fol-
-# lowing conditions:
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
#
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
+# http://www.apache.org/licenses/LICENSE-2.0
#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
-# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-
-# ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
-# SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wildcard iterator class and supporting functions."""
-"""Implementation of wildcarding over StorageUris.
+from __future__ import absolute_import
-StorageUri is an abstraction that Google introduced in the boto library,
-for representing storage provider-independent bucket and object names with
-a shorthand URI-like syntax (see boto/boto/storage_uri.py). The current
-class provides wildcarding support for StorageUri objects (including both
-bucket and file system objects), allowing one to express collections of
-objects with syntax like the following:
- gs://mybucket/images/*.png
- file:///tmp/???abc???
-
-We provide wildcarding support as part of gsutil rather than as part
-of boto because wildcarding is really part of shell command-like
-functionality.
-
-A comment about wildcard semantics: We support both single path component
-wildcards (e.g., using '*') and recursive wildcards (using '**'), for both
-file and cloud URIs. For example,
- gs://bucket/doc/*/*.html
-would enumerate HTML files one directory down from gs://bucket/doc, while
- gs://bucket/**/*.html
-would enumerate HTML files in all objects contained in the bucket.
-
-Note also that if you use file system wildcards it's likely your shell
-interprets the wildcarding before passing the command to gsutil. For example:
- % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse
-would likely be expanded by the shell into the following before running gsutil:
- % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse
-
-Note also that most shells don't support '**' wildcarding (I think only
-zsh does). If you want to use '**' wildcarding with such a shell you can
-single quote each wildcarded string, so it gets passed uninterpreted by the
-shell to gsutil (at which point gsutil will perform the wildcarding expansion):
- % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse
-"""
-
-import boto
import fnmatch
import glob
import os
import re
import sys
-import urllib
+import textwrap
-from boto.s3.prefix import Prefix
-from boto.storage_uri import BucketStorageUri
-from bucket_listing_ref import BucketListingRef
+from gslib.bucket_listing_ref import BucketListingBucket
+from gslib.bucket_listing_ref import BucketListingObject
+from gslib.bucket_listing_ref import BucketListingPrefix
+from gslib.cloud_api import AccessDeniedException
+from gslib.cloud_api import CloudApi
+from gslib.cloud_api import NotFoundException
+from gslib.exception import CommandException
+from gslib.storage_url import ContainsWildcard
+from gslib.storage_url import StorageUrlFromString
+from gslib.storage_url import StripOneSlash
+from gslib.storage_url import WILDCARD_REGEX
+from gslib.translation_helper import GenerationFromUrlAndString
+from gslib.util import UTF8
-# Regex to determine if a string contains any wildcards.
-WILDCARD_REGEX = re.compile('[*?\[\]]')
-WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'
-WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'
+FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
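# A quick sketch of how FLAT_LIST_REGEX behaves (standard Python re
# semantics; the sample path mirrors the example used later in this file):
#
#   m = FLAT_LIST_REGEX.match('/tmp/tmp2pQJAX/**/*')
#   m.group('before')  # -> '/tmp/tmp2pQJAX/'
#   m.group('after')   # -> '/*'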
class WildcardIterator(object):
- """Base class for wildcarding over StorageUris.
+ """Class for iterating over Google Cloud Storage strings containing wildcards.
- This class implements support for iterating over StorageUris that
- contain wildcards.
-
The base class is abstract; you should instantiate using the
wildcard_iterator() static factory method, which chooses the right
- implementation depending on the StorageUri.
+ implementation depending on the base string.
"""
+ # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both
+ # and make one return the other.
def __repr__(self):
"""Returns string representation of WildcardIterator."""
- return 'WildcardIterator(%s)' % self.wildcard_uri
+ return 'WildcardIterator(%s)' % self.wildcard_url.url_string
class CloudWildcardIterator(WildcardIterator):
- """WildcardIterator subclass for buckets and objects.
+ """WildcardIterator subclass for buckets, bucket subdirs and objects.
- Iterates over BucketListingRef matching the StorageUri wildcard. It's
- much more efficient to request the Key from the BucketListingRef (via
- GetKey()) than to request the StorageUri and then call uri.get_key()
- to retrieve the key, for cases where you want to get metadata that's
- available in the Bucket (for example to get the name and size of
- each object), because that information is available in the bucket GET
- results. If you were to iterate over URIs for such cases and then get
- the name and size info from each resulting StorageUri, it would cause
- an additional object GET request for each of the result URIs.
+ Iterates over BucketListingRef matching the Url string wildcard. It's
+ much more efficient to first get metadata that's available in the Bucket
+ (for example to get the name and size of each object), because that
+ information is available in the object list results.
"""
- def __init__(self, wildcard_uri, proj_id_handler,
- bucket_storage_uri_class=BucketStorageUri, all_versions=False,
- headers=None, debug=0):
- """
- Instantiates an iterator over BucketListingRef matching given wildcard URI.
+ def __init__(self, wildcard_url, gsutil_api, all_versions=False,
+ debug=0, project_id=None):
+ """Instantiates an iterator that matches the wildcard URL.
Args:
- wildcard_uri: StorageUri that contains the wildcard to iterate.
- proj_id_handler: ProjectIdHandler to use for current command.
- bucket_storage_uri_class: BucketStorageUri interface.
- Settable for testing/mocking.
- headers: Dictionary containing optional HTTP headers to pass to boto.
- debug: Debug level to pass in to boto connection (range 0..3).
+ wildcard_url: CloudUrl that contains the wildcard to iterate.
+ gsutil_api: Cloud storage interface. Passed in for thread safety, also
+ settable for testing/mocking.
+ all_versions: If true, the iterator yields all versions of objects
+ matching the wildcard. If false, yields just the live
+ object version.
+ debug: Debug level to control debug output for iterator.
+ project_id: Project ID to use for bucket listings.
"""
- self.wildcard_uri = wildcard_uri
- # Make a copy of the headers so any updates we make during wildcard
- # expansion aren't left in the input params (specifically, so we don't
- # include the x-goog-project-id header needed by a subset of cases, in
- # the data returned to caller, which could then be used in other cases
- # where that header must not be passed).
- if headers is None:
- self.headers = {}
- else:
- self.headers = headers.copy()
- self.proj_id_handler = proj_id_handler
- self.bucket_storage_uri_class = bucket_storage_uri_class
+ self.wildcard_url = wildcard_url
self.all_versions = all_versions
self.debug = debug
+ self.gsutil_api = gsutil_api
+ self.project_id = project_id
- def __iter__(self):
- """Python iterator that gets called when iterating over cloud wildcard.
+ def __iter__(self, bucket_listing_fields=None,
+ expand_top_level_buckets=False):
+ """Iterator that gets called when iterating over the cloud wildcard.
+ In the case where no wildcard is present, returns a single matching object,
+ single matching prefix, or one of each if both exist.
+
+ Args:
+ bucket_listing_fields: Iterable fields to include in bucket listings.
+ Ex. ['name', 'acl']. Iterator is
+ responsible for converting these to list-style
+ format ['items/name', 'items/acl'] as well as
+ adding any fields necessary for listing such as
+ prefixes. API implementation is responsible for
+ adding pagination fields. If this is None,
+ all fields are returned.
+ expand_top_level_buckets: If true, yield no BUCKET references. Instead,
+ expand buckets into top-level objects and
+ prefixes.
+
Yields:
- BucketListingRef, or empty iterator if no matches.
+ BucketListingRef of type BUCKET, OBJECT or PREFIX.
"""
- # First handle bucket wildcarding, if any.
- if ContainsWildcard(self.wildcard_uri.bucket_name):
- regex = fnmatch.translate(self.wildcard_uri.bucket_name)
- bucket_uris = []
- prog = re.compile(regex)
- self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR,
- self.wildcard_uri,
- self.headers)
- for b in self.wildcard_uri.get_all_buckets(headers=self.headers):
- if prog.match(b.name):
- # Use str(b.name) because get_all_buckets() returns Unicode
- # string, which when used to construct x-goog-copy-src metadata
- # requests for object-to-object copies causes pathname '/' chars
- # to be entity-encoded (bucket%2Fdir instead of bucket/dir),
- # which causes the request to fail.
- uri_str = '%s://%s' % (self.wildcard_uri.scheme,
- urllib.quote_plus(str(b.name)))
- # TODO: Move bucket_uris to a separate generator function that yields
- # values instead of pre-computing the list.
- bucket_uris.append(
- boto.storage_uri(
- uri_str, debug=self.debug,
- bucket_storage_uri_class=self.bucket_storage_uri_class,
- suppress_consec_slashes=False))
- else:
- bucket_uris = [self.wildcard_uri.clone_replace_name('')]
+ single_version_request = self.wildcard_url.HasGeneration()
- # Now iterate over bucket(s), and handle object wildcarding, if any.
- self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR,
- self.wildcard_uri,
- self.headers)
- for bucket_uri in bucket_uris:
- if self.wildcard_uri.names_bucket():
- # Bucket-only URI.
- yield BucketListingRef(bucket_uri, key=None, prefix=None,
- headers=self.headers)
+ # For wildcard expansion purposes, we need at a minimum the name of
+ # each object and prefix. If we're not using the default of requesting
+ # all fields, make sure at least these are requested. The Cloud API
+ # tolerates specifying the same field twice.
+ get_fields = None
+ if bucket_listing_fields:
+ get_fields = set()
+ for field in bucket_listing_fields:
+ get_fields.add(field)
+ bucket_listing_fields = self._GetToListFields(
+ get_fields=bucket_listing_fields)
+ bucket_listing_fields.update(['items/name', 'prefixes'])
+ get_fields.update(['name'])
+ # If we're making versioned requests, ensure generation and
+ # metageneration are also included.
+ if single_version_request or self.all_versions:
+ bucket_listing_fields.update(['items/generation',
+ 'items/metageneration'])
+ get_fields.update(['generation', 'metageneration'])
+
+ # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
+ # iterate over the expanded bucket strings and handle any object
+ # wildcarding.
+ for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
+ bucket_url_string = bucket_listing_ref.url_string
+ if self.wildcard_url.IsBucket():
+ # IsBucket() guarantees there are no prefix or object wildcards, and
+ # thus this is a top-level listing of buckets.
+ if expand_top_level_buckets:
+ url = StorageUrlFromString(bucket_url_string)
+ for obj_or_prefix in self.gsutil_api.ListObjects(
+ url.bucket_name, delimiter='/', all_versions=self.all_versions,
+ provider=self.wildcard_url.scheme,
+ fields=bucket_listing_fields):
+ if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
+ yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data,
+ with_version=self.all_versions)
+ else: # CloudApi.CsObjectOrPrefixType.PREFIX:
+ yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
+ else:
+ yield bucket_listing_ref
else:
- # URI contains an object name. If there's no wildcard just yield
- # the needed URI.
- if not ContainsWildcard(self.wildcard_uri.object_name):
- uri_to_yield = bucket_uri.clone_replace_name(
- self.wildcard_uri.object_name)
- yield BucketListingRef(uri_to_yield, key=None, prefix=None,
- headers=self.headers)
+ # By default, assume a non-wildcarded URL is an object, not a prefix.
+ # This prevents unnecessary listings (which are slower, more expensive,
+ # and also subject to eventual consistency).
+ if (not ContainsWildcard(self.wildcard_url.url_string) and
+ self.wildcard_url.IsObject() and not self.all_versions):
+ try:
+ get_object = self.gsutil_api.GetObjectMetadata(
+ self.wildcard_url.bucket_name,
+ self.wildcard_url.object_name,
+ generation=self.wildcard_url.generation,
+ provider=self.wildcard_url.scheme,
+ fields=get_fields)
+ yield self._GetObjectRef(
+ self.wildcard_url.bucket_url_string, get_object,
+ with_version=(self.all_versions or single_version_request))
+ return
+ except (NotFoundException, AccessDeniedException):
+ # It's possible this is a prefix - try to list instead.
+ pass
+
+ # Expand iteratively by building prefix/delimiter bucket listing
+ # request, filtering the results per the current level's wildcard
+ # (if present), and continuing with the next component of the
+ # wildcard. See _BuildBucketFilterStrings() documentation for details.
+ if single_version_request:
+ url_string = '%s%s#%s' % (bucket_url_string,
+ self.wildcard_url.object_name,
+ self.wildcard_url.generation)
else:
- # URI contains a wildcard. Expand iteratively by building
- # prefix/delimiter bucket listing request, filtering the results per
- # the current level's wildcard, and continuing with the next component
- # of the wildcard. See _BuildBucketFilterStrings() documentation
- # for details.
- #
- # Initialize the iteration with bucket name from bucket_uri but
- # object name from self.wildcard_uri. This is needed to handle cases
- # where both the bucket and object names contain wildcards.
- uris_needing_expansion = [
- bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]
- while len(uris_needing_expansion) > 0:
- uri = uris_needing_expansion.pop(0)
- (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
- self._BuildBucketFilterStrings(uri.object_name))
- prog = re.compile(fnmatch.translate(prefix_wildcard))
- # List bucket for objects matching prefix up to delimiter.
- for key in bucket_uri.list_bucket(prefix=prefix,
- delimiter=delimiter,
- headers=self.headers,
- all_versions=self.all_versions):
- # Check that the prefix regex matches rstripped key.name (to
- # correspond with the rstripped prefix_wildcard from
- # _BuildBucketFilterStrings()).
- keyname = key.name
- if isinstance(key, Prefix):
- keyname = keyname.rstrip('/')
- if prog.match(keyname):
- if suffix_wildcard and keyname != suffix_wildcard:
- if isinstance(key, Prefix):
- # There's more wildcard left to expand.
- uris_needing_expansion.append(
- uri.clone_replace_name(key.name.rstrip('/') + '/'
- + suffix_wildcard))
+ # Rstrip any prefixes to correspond with rstripped prefix wildcard
+ # from _BuildBucketFilterStrings().
+ url_string = '%s%s' % (bucket_url_string,
+ StripOneSlash(self.wildcard_url.object_name)
+ or '/') # Cover root object named '/' case.
+ urls_needing_expansion = [url_string]
+ while urls_needing_expansion:
+ url = StorageUrlFromString(urls_needing_expansion.pop(0))
+ (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
+ self._BuildBucketFilterStrings(url.object_name))
+ prog = re.compile(fnmatch.translate(prefix_wildcard))
+
+ # List bucket for objects matching prefix up to delimiter.
+ for obj_or_prefix in self.gsutil_api.ListObjects(
+ url.bucket_name, prefix=prefix, delimiter=delimiter,
+ all_versions=self.all_versions or single_version_request,
+ provider=self.wildcard_url.scheme,
+ fields=bucket_listing_fields):
+ if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
+ gcs_object = obj_or_prefix.data
+ if prog.match(gcs_object.name):
+ if not suffix_wildcard or (
+ StripOneSlash(gcs_object.name) == suffix_wildcard):
+ if not single_version_request or (
+ self._SingleVersionMatches(gcs_object.generation)):
+ yield self._GetObjectRef(
+ bucket_url_string, gcs_object, with_version=(
+ self.all_versions or single_version_request))
+ else: # CloudApi.CsObjectOrPrefixType.PREFIX
+ prefix = obj_or_prefix.data
+ # If the prefix ends with a slash, remove it. Note that we only
+ # remove one slash so that we can successfully enumerate dirs
+ # containing multiple slashes.
+ rstripped_prefix = StripOneSlash(prefix)
+ if prog.match(rstripped_prefix):
+ if suffix_wildcard and rstripped_prefix != suffix_wildcard:
+ # There's more wildcard left to expand.
+ url_append_string = '%s%s' % (
+ bucket_url_string, rstripped_prefix + '/' +
+ suffix_wildcard)
+ urls_needing_expansion.append(url_append_string)
else:
- # Done expanding.
- expanded_uri = uri.clone_replace_key(key)
+ # No wildcard to expand, just yield the prefix
+ yield self._GetPrefixRef(bucket_url_string, prefix)
- if isinstance(key, Prefix):
- yield BucketListingRef(expanded_uri, key=None, prefix=key,
- headers=self.headers)
- else:
- if self.all_versions:
- yield BucketListingRef(expanded_uri, key=key, prefix=None,
- headers=self.headers)
- else:
- # Yield BLR wrapping version-less URI.
- yield BucketListingRef(expanded_uri.clone_replace_name(
- expanded_uri.object_name), key=key, prefix=None,
- headers=self.headers)
-
def _BuildBucketFilterStrings(self, wildcard):
- """
- Builds strings needed for querying a bucket and filtering results to
- implement wildcard object name matching.
+ """Builds strings needed for querying a bucket and filtering results.
+ This implements wildcard object name matching.
+
Args:
wildcard: The wildcard string to match to objects.
@@ -290,7 +281,7 @@
wildcard_part = wildcard_part[:end+1]
# Remove trailing '/' so we will match gs://bucket/abc* as well as
# gs://bucket/abc*/ with the same wildcard regex.
- prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')
+ prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
suffix_wildcard = wildcard[match.end():]
end = suffix_wildcard.find('/')
if end == -1:
@@ -302,11 +293,10 @@
# suffix_wildcard at end of prefix_wildcard.
if prefix_wildcard.find('**') != -1:
delimiter = None
- prefix_wildcard = prefix_wildcard + suffix_wildcard
+ prefix_wildcard += suffix_wildcard
suffix_wildcard = ''
else:
delimiter = '/'
- delim_pos = suffix_wildcard.find(delimiter)
# The following debug output is useful for tracing how the algorithm
# walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
if self.debug > 1:
@@ -316,43 +306,163 @@
(wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
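# A sketch of the strings produced for the multi-part wildcard traced in the
# debug output above (expected behavior, inferred from that trace):
#
#   self._BuildBucketFilterStrings('abc/d*e/f*.txt')
#   # -> prefix='abc/d', delimiter='/',
#   #    prefix_wildcard='abc/d*e', suffix_wildcard='f*.txt'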
- def IterKeys(self):
- """
- Convenience iterator that runs underlying iterator and returns Key for each
- iteration.
+ def _SingleVersionMatches(self, listed_generation):
+ decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
+ listed_generation)
+ return str(self.wildcard_url.generation) == str(decoded_generation)
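# Example: for gs://mybucket/obj#1234, a listed generation of 1234 matches.
# Both sides are compared as strings so integer and string generations from
# different API layers agree.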
+ def _ExpandBucketWildcards(self, bucket_fields=None):
+ """Expands bucket and provider wildcards.
+
+ Builds a list of bucket url strings that can be iterated on.
+
+ Args:
+ bucket_fields: If present, populate only these metadata fields for
+ buckets. Example value: ['acl', 'defaultObjectAcl']
+
Yields:
- Subclass of boto.s3.key.Key, or empty iterator if no matches.
+ BucketListingReferences of type BUCKET.
+ """
+ bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
+ if (bucket_fields and set(bucket_fields) == set(['id']) and
+ not ContainsWildcard(self.wildcard_url.bucket_name)):
+ # If we just want the name of a non-wildcarded bucket URL,
+ # don't make an RPC.
+ yield BucketListingBucket(bucket_url)
+ elif (self.wildcard_url.IsBucket() and
+ not ContainsWildcard(self.wildcard_url.bucket_name)):
+ # If we have a non-wildcarded bucket URL, get just that bucket.
+ yield BucketListingBucket(
+ bucket_url, root_object=self.gsutil_api.GetBucket(
+ self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme,
+ fields=bucket_fields))
+ else:
+ regex = fnmatch.translate(self.wildcard_url.bucket_name)
+ prog = re.compile(regex)
- Raises:
- WildcardException: for bucket-only uri.
+ fields = self._GetToListFields(bucket_fields)
+ if fields:
+ fields.add('items/id')
+ for bucket in self.gsutil_api.ListBuckets(
+ fields=fields, project_id=self.project_id,
+ provider=self.wildcard_url.scheme):
+ if prog.match(bucket.id):
+ url = StorageUrlFromString(
+ '%s://%s/' % (self.wildcard_url.scheme, bucket.id))
+ yield BucketListingBucket(url, root_object=bucket)
+
+ def _GetToListFields(self, get_fields=None):
+ """Prepends 'items/' to the input fields and converts it to a set.
+
+ This way field sets requested for GetBucket can be used in ListBucket calls.
+ Note that the input set must contain only bucket or object fields; listing
+ fields such as prefixes or nextPageToken should be added after calling
+ this function.
+
+ Args:
+ get_fields: Iterable fields usable in GetBucket/GetObject calls.
+
+ Returns:
+ Set of fields usable in ListBuckets/ListObjects calls.
"""
- for bucket_listing_ref in self.__iter__():
- if bucket_listing_ref.HasKey():
- yield bucket_listing_ref.GetKey()
+ if get_fields:
+ list_fields = set()
+ for field in get_fields:
+ list_fields.add('items/' + field)
+ return list_fields
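# Example of the conversion (illustrative field names):
#
#   self._GetToListFields(get_fields=['name', 'acl'])
#   # -> set(['items/name', 'items/acl'])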
- def IterUris(self):
+ def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
+ """Creates a BucketListingRef of type OBJECT from the arguments.
+
+ Args:
+ bucket_url_string: Wildcardless string describing the containing bucket.
+ gcs_object: gsutil_api root Object for populating the BucketListingRef.
+ with_version: If true, return a reference with a versioned string.
+
+ Returns:
+ BucketListingRef of type OBJECT.
"""
- Convenience iterator that runs underlying iterator and returns StorageUri
- for each iteration.
+ # Generation can be None in test mocks, so just return the
+ # live object for simplicity.
+ if with_version and gcs_object.generation is not None:
+ generation_str = GenerationFromUrlAndString(self.wildcard_url,
+ gcs_object.generation)
+ object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
+ generation_str)
+ else:
+ object_string = '%s%s' % (bucket_url_string, gcs_object.name)
+ object_url = StorageUrlFromString(object_string)
+ return BucketListingObject(object_url, root_object=gcs_object)
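# Example: for object 'a/b.png' with generation 1234 in gs://mybucket/, the
# '%s%s#%s' template above gives 'gs://mybucket/a/b.png#1234' when
# with_version=True, and plain 'gs://mybucket/a/b.png' otherwise.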
+ def _GetPrefixRef(self, bucket_url_string, prefix):
+ """Creates a BucketListingRef of type PREFIX from the arguments.
+
+ Args:
+ bucket_url_string: Wildcardless string describing the containing bucket.
+ prefix: gsutil_api Prefix for populating the BucketListingRef
+
+ Returns:
+ BucketListingRef of type PREFIX.
+ """
+ prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
+ return BucketListingPrefix(prefix_url, root_object=prefix)
+
+ def IterBuckets(self, bucket_fields=None):
+ """Iterates over the wildcard, returning refs for each expanded bucket.
+
+ This ignores the object part of the URL entirely and expands only the
+ bucket portion. It will yield BucketListingRefs of type BUCKET only.
+
+ Args:
+ bucket_fields: Iterable fields to include in bucket listings.
+ Ex. ['defaultObjectAcl', 'logging']. This function is
+ responsible for converting these to listing-style
+ format ['items/defaultObjectAcl', 'items/logging'], as
+ well as adding any fields necessary for listing such as
+ 'items/id'. API implementation is responsible for
+ adding pagination fields. If this is None, all fields are
+ returned.
+
Yields:
- StorageUri, or empty iterator if no matches.
+ BucketListingRef of type BUCKET, or empty iterator if no matches.
"""
- for bucket_listing_ref in self.__iter__():
- yield bucket_listing_ref.GetUri()
+ for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
+ yield blr
- def IterUrisForKeys(self):
+ def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
+ """Iterates over the wildcard, yielding bucket, prefix or object refs.
+
+ Args:
+ bucket_listing_fields: If present, populate only these metadata
+ fields for listed objects.
+ expand_top_level_buckets: If true and the wildcard expands only to
+ Bucket(s), yields the expansion of each bucket
+ into a top-level listing of prefixes and objects
+ in that bucket instead of a BucketListingRef
+ to that bucket.
+
+ Yields:
+ BucketListingRef, or empty iterator if no matches.
"""
- Convenience iterator that runs underlying iterator and returns the
- StorageUri for each iterated BucketListingRef that has a Key.
+ for blr in self.__iter__(
+ bucket_listing_fields=bucket_listing_fields,
+ expand_top_level_buckets=expand_top_level_buckets):
+ yield blr
+ def IterObjects(self, bucket_listing_fields=None):
+ """Iterates over the wildcard, yielding only object BucketListingRefs.
+
+ Args:
+ bucket_listing_fields: If present, populate only these metadata
+ fields for listed objects.
+
Yields:
- StorageUri, or empty iterator if no matches.
+ BucketListingRefs of type OBJECT or empty iterator if no matches.
"""
- for bucket_listing_ref in self.__iter__():
- if bucket_listing_ref.HasKey():
- yield bucket_listing_ref.GetUri()
+ for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields,
+ expand_top_level_buckets=True):
+ if blr.IsObject():
+ yield blr
class FileWildcardIterator(WildcardIterator):
@@ -365,27 +475,35 @@
files in any subdirectory named 'abc').
"""
- def __init__(self, wildcard_uri, headers=None, debug=0):
- """
- Instantiate an iterator over BucketListingRefs matching given wildcard URI.
+ def __init__(self, wildcard_url, debug=0):
+ """Instantiates an iterator over BucketListingRefs matching wildcard URL.
Args:
- wildcard_uri: StorageUri that contains the wildcard to iterate.
- headers: Dictionary containing optional HTTP headers to pass to boto.
- debug: Debug level to pass in to boto connection (range 0..3).
+ wildcard_url: FileUrl that contains the wildcard to iterate.
+ debug: Debug level (range 0..3).
"""
- self.wildcard_uri = wildcard_uri
- self.headers = headers
+ self.wildcard_url = wildcard_url
self.debug = debug
def __iter__(self):
- wildcard = self.wildcard_uri.object_name
- match = re.search('\*\*', wildcard)
+ """Iterator that gets called when iterating over the file wildcard.
+
+ In the case where no wildcard is present, returns a single matching file
+ or directory.
+
+ Raises:
+ WildcardException: if invalid wildcard found.
+
+ Yields:
+ BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
+ """
+ wildcard = self.wildcard_url.object_name
+ match = FLAT_LIST_REGEX.match(wildcard)
if match:
# Recursive wildcarding request ('.../**/...').
# Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
- base_dir = wildcard[:match.start()-1]
- remaining_wildcard = wildcard[match.start()+2:]
+ base_dir = match.group('before')[:-1]
+ remaining_wildcard = match.group('after')
# At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
# remaining_wildcard = '/*'
if remaining_wildcard.startswith('*'):
@@ -398,44 +516,104 @@
remaining_wildcard = '*'
# Skip slash(es).
remaining_wildcard = remaining_wildcard.lstrip(os.sep)
- filepaths = self._iter_dir(base_dir, remaining_wildcard)
+ filepaths = self._IterDir(base_dir, remaining_wildcard)
else:
# Not a recursive wildcarding request.
filepaths = glob.iglob(wildcard)
for filepath in filepaths:
- expanded_uri = self.wildcard_uri.clone_replace_name(filepath)
- yield BucketListingRef(expanded_uri)
+ expanded_url = StorageUrlFromString(filepath)
+ if os.path.isdir(filepath):
+ yield BucketListingPrefix(expanded_url)
+ else:
+ yield BucketListingObject(expanded_url)
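# A hedged usage sketch (hypothetical POSIX path, mirroring the example in
# the comments above):
#
#   it = FileWildcardIterator(
#       StorageUrlFromString('file:///tmp/tmp2pQJAX/**/*'))
#   for blr in it:
#     print blr.url_string  # OBJECT refs for files, PREFIX refs for dirs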
- def _iter_dir(self, dir, wildcard):
+ def _IterDir(self, directory, wildcard):
"""An iterator over the specified dir and wildcard."""
- for dirpath, unused_dirnames, filenames in os.walk(dir):
+ # UTF8-encode directory before passing it to os.walk() so if there are
+ # non-valid UTF8 chars in the file name (e.g., that can happen if the file
+ # originated on Windows) os.walk() will not attempt to decode and then die
+ # with a "codec can't decode byte" error, and instead we can catch the error
+ # at yield time and print a more informative error message.
+ for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)):
for f in fnmatch.filter(filenames, wildcard):
- yield os.path.join(dirpath, f)
+ try:
+ yield os.path.join(dirpath, f).decode(UTF8)
+ except UnicodeDecodeError:
+ # Note: We considered several ways to deal with this, but each had
+ # problems:
+ # 1. Raise an exception and try to catch in a higher layer (the
+ # gsutil cp command), so we can properly support the gsutil cp -c
+ # option. That doesn't work because raising an exception during
+ # iteration terminates the generator.
+ # 2. Accumulate a list of bad filenames and skip processing each
+ # during iteration, then raise at the end, with exception text
+ # printing the bad paths. That doesn't work because iteration is
+ # wrapped in PluralityCheckableIterator, so it's possible there
+ # are not-yet-performed copy operations at the time we reach the
+ # end of the iteration and raise the exception - which would cause
+ # us to skip copying validly named files. Moreover, the gsutil
+ # cp command loops over argv, so if you run the command gsutil cp
+ # -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
+ # would cause dir2 never to be visited.
+ # 3. Print the invalid pathname and skip it during iteration. That
+ # would work but would mean gsutil cp could exit with status 0
+ # even though some files weren't copied.
+ # 4. Change the WildcardIterator to include an error status along with
+ # the result. That would solve the problem but would be a
+ # substantial change (WildcardIterator is used in many parts of
+ # gsutil), and we didn't feel that magnitude of change was
+ # warranted by this relatively uncommon corner case.
+ # Instead we chose to abort when one such file is encountered, and
+ # require the user to remove or rename the files and try again.
+ raise CommandException('\n'.join(textwrap.wrap(
+ 'Invalid Unicode path encountered (%s). gsutil cannot proceed '
+ 'with such files present. Please remove or rename this file and '
+ 'try again.' % repr(os.path.join(dirpath, f)))))
- def IterKeys(self):
+ # pylint: disable=unused-argument
+ def IterObjects(self, bucket_listing_fields=None):
+ """Iterates over the wildcard, yielding only object (file) refs.
+
+ Args:
+ bucket_listing_fields: Ignored as filesystems don't have buckets.
+
+ Yields:
+ BucketListingRefs of type OBJECT or empty iterator if no matches.
"""
- Placeholder to allow polymorphic use of WildcardIterator.
+ for bucket_listing_ref in self.IterAll():
+ if bucket_listing_ref.IsObject():
+ yield bucket_listing_ref
+ # pylint: disable=unused-argument
+ def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
+ """Iterates over the wildcard, yielding BucketListingRefs.
+
+ Args:
+ bucket_listing_fields: Ignored; filesystems don't have buckets.
+ expand_top_level_buckets: Ignored; filesystems don't have buckets.
+
+ Yields:
+ BucketListingRefs of type OBJECT (file) or PREFIX (directory),
+ or empty iterator if no matches.
+ """
+ for bucket_listing_ref in self.__iter__():
+ yield bucket_listing_ref
+
+ def IterBuckets(self, unused_bucket_fields=None):
+ """Placeholder to allow polymorphic use of WildcardIterator.
+
+ Args:
+ unused_bucket_fields: Ignored; filesystems don't have buckets.
+
Raises:
WildcardException: in all cases.
"""
raise WildcardException(
- 'Iterating over Keys not possible for file wildcards')
+ 'Iterating over Buckets not possible for file wildcards')
- def IterUris(self):
- """
- Convenience iterator that runs underlying iterator and returns StorageUri
- for each iteration.
- Yields:
- StorageUri, or empty iterator if no matches.
- """
- for bucket_listing_ref in self.__iter__():
- yield bucket_listing_ref.GetUri()
-
-
class WildcardException(StandardError):
- """Exception thrown for invalid wildcard URIs."""
+ """Exception raised for invalid wildcard URLs."""
def __init__(self, reason):
StandardError.__init__(self)
@@ -448,57 +626,28 @@
return 'WildcardException: %s' % self.reason
-def wildcard_iterator(uri_or_str, proj_id_handler,
- bucket_storage_uri_class=BucketStorageUri,
- all_versions=False,
- headers=None, debug=0):
- """Instantiate a WildCardIterator for the given StorageUri.
+def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0,
+ project_id=None):
+ """Instantiate a WildcardIterator for the given URL string.
Args:
- uri_or_str: StorageUri or URI string naming wildcard objects to iterate.
- proj_id_handler: ProjectIdHandler to use for current command.
- bucket_storage_uri_class: BucketStorageUri interface.
- Settable for testing/mocking.
- headers: Dictionary containing optional HTTP headers to pass to boto.
- debug: Debug level to pass in to boto connection (range 0..3).
+ url_str: URL string naming wildcard object(s) to iterate.
+ gsutil_api: Cloud storage interface. Passed in for thread safety, also
+ settable for testing/mocking.
+ all_versions: If true, the iterator yields all versions of objects
+ matching the wildcard. If false, yields just the live
+ object version.
+ debug: Debug level to control debug output for iterator.
+ project_id: Project id to use for bucket listings.
Returns:
A WildcardIterator that handles the requested iteration.
"""
- if isinstance(uri_or_str, basestring):
- # Disable enforce_bucket_naming, to allow bucket names containing wildcard
- # chars.
- uri = boto.storage_uri(
- uri_or_str, debug=debug, validate=False,
- bucket_storage_uri_class=bucket_storage_uri_class,
- suppress_consec_slashes=False)
- else:
- uri = uri_or_str
-
- if uri.is_cloud_uri():
+ url = StorageUrlFromString(url_str)
+ if url.IsFileUrl():
+ return FileWildcardIterator(url, debug=debug)
+ else: # Cloud URL
return CloudWildcardIterator(
- uri, proj_id_handler,
- bucket_storage_uri_class=bucket_storage_uri_class,
- all_versions=all_versions,
- headers=headers,
- debug=debug)
- elif uri.is_file_uri():
- return FileWildcardIterator(uri, headers=headers, debug=debug)
- else:
- raise WildcardException('Unexpected type of StorageUri (%s)' % uri)
-
-
-def ContainsWildcard(uri_or_str):
- """Checks whether uri_or_str contains a wildcard.
-
- Args:
- uri_or_str: StorageUri or URI string to check.
-
- Returns:
- bool indicator.
- """
- if isinstance(uri_or_str, basestring):
- return bool(WILDCARD_REGEX.search(uri_or_str))
- else:
- return bool(WILDCARD_REGEX.search(uri_or_str.uri))
+ url, gsutil_api, all_versions=all_versions, debug=debug,
+ project_id=project_id)
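# A hedged end-to-end sketch (assumes 'gsutil_api' is an initialized
# CloudApi implementation; gsutil normally constructs this itself):
#
#   for blr in CreateWildcardIterator(
#       'gs://mybucket/**/*.html', gsutil_api).IterAll(
#           bucket_listing_fields=['name']):
#     print blr.url_string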