OLD | NEW |
(Empty) | |
| 1 # -*- coding: utf-8 -*- |
| 2 # Copyright 2010 Google Inc. All Rights Reserved. |
| 3 # |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 # you may not use this file except in compliance with the License. |
| 6 # You may obtain a copy of the License at |
| 7 # |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 # |
| 10 # Unless required by applicable law or agreed to in writing, software |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 # See the License for the specific language governing permissions and |
| 14 # limitations under the License. |
| 15 """Wildcard iterator class and supporting functions.""" |
| 16 |
| 17 from __future__ import absolute_import |
| 18 |
| 19 import fnmatch |
| 20 import glob |
| 21 import os |
| 22 import re |
| 23 import sys |
| 24 import textwrap |
| 25 |
| 26 from gslib.bucket_listing_ref import BucketListingBucket |
| 27 from gslib.bucket_listing_ref import BucketListingObject |
| 28 from gslib.bucket_listing_ref import BucketListingPrefix |
| 29 from gslib.cloud_api import AccessDeniedException |
| 30 from gslib.cloud_api import CloudApi |
| 31 from gslib.cloud_api import NotFoundException |
| 32 from gslib.exception import CommandException |
| 33 from gslib.storage_url import ContainsWildcard |
| 34 from gslib.storage_url import StorageUrlFromString |
| 35 from gslib.storage_url import StripOneSlash |
| 36 from gslib.storage_url import WILDCARD_REGEX |
| 37 from gslib.translation_helper import GenerationFromUrlAndString |
| 38 from gslib.util import UTF8 |
| 39 |
| 40 |
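# Matches a path containing a recursive '**' wildcard, capturing the
# (non-greedy) text before the first '**' and everything after it.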
| 41 FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)') |
| 42 |
| 43 |
| 44 class WildcardIterator(object): |
| 45 """Class for iterating over Google Cloud Storage strings containing wildcards. |
| 46 |
| 47 The base class is abstract; you should instantiate using the
| 48 CreateWildcardIterator() factory function, which chooses the right
| 49 implementation depending on the URL string.
| 50 """ |
| 51 |
| 52 # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both |
| 53 # and make one return the other. |
| 54 def __repr__(self): |
| 55 """Returns string representation of WildcardIterator.""" |
| 56 return 'WildcardIterator(%s)' % self.wildcard_url.url_string |
| 57 |
| 58 |
| 59 class CloudWildcardIterator(WildcardIterator): |
| 60 """WildcardIterator subclass for buckets, bucket subdirs and objects. |
| 61 |
| 62 Iterates over BucketListingRefs matching the URL string wildcard. It is
| 63 much more efficient to get metadata that is available in the bucket
| 64 listing (for example, the name and size of each object) this way,
| 65 because that information is already included in the object list results.
| 66 """ |
| 67 |
| 68 def __init__(self, wildcard_url, gsutil_api, all_versions=False, |
| 69 debug=0, project_id=None): |
| 70 """Instantiates an iterator that matches the wildcard URL. |
| 71 |
| 72 Args: |
| 73 wildcard_url: CloudUrl that contains the wildcard to iterate. |
| 74 gsutil_api: Cloud storage interface. Passed in for thread safety, also |
| 75 settable for testing/mocking. |
| 76 all_versions: If true, the iterator yields all versions of objects |
| 77 matching the wildcard. If false, yields just the live |
| 78 object version. |
| 79 debug: Debug level to control debug output for iterator. |
| 80 project_id: Project ID to use for bucket listings. |
| 81 """ |
| 82 self.wildcard_url = wildcard_url |
| 83 self.all_versions = all_versions |
| 84 self.debug = debug |
| 85 self.gsutil_api = gsutil_api |
| 86 self.project_id = project_id |
| 87 |
| 88 def __iter__(self, bucket_listing_fields=None, |
| 89 expand_top_level_buckets=False): |
| 90 """Iterator that gets called when iterating over the cloud wildcard. |
| 91 |
| 92 In the case where no wildcard is present, returns a single matching object, |
| 93 single matching prefix, or one of each if both exist. |
| 94 |
| 95 Args: |
| 96 bucket_listing_fields: Iterable fields to include in bucket listings. |
| 97 Ex. ['name', 'acl']. Iterator is |
| 98 responsible for converting these to list-style |
| 99 format ['items/name', 'items/acl'] as well as |
| 100 adding any fields necessary for listing such as |
| 101 prefixes. API implementation is responsible for
| 102 adding pagination fields. If this is None, |
| 103 all fields are returned. |
| 104 expand_top_level_buckets: If true, yield no BUCKET references. Instead, |
| 105 expand buckets into top-level objects and |
| 106 prefixes. |
| 107 |
| 108 Yields: |
| 109 BucketListingRef of type BUCKET, OBJECT or PREFIX. |
| 110 """ |
| 111 single_version_request = self.wildcard_url.HasGeneration() |
| 112 |
| 113 # For wildcard expansion purposes, we need at a minimum the name of |
| 114 # each object and prefix. If we're not using the default of requesting |
| 115 # all fields, make sure at least these are requested. The Cloud API |
| 116 # tolerates specifying the same field twice. |
| 117 get_fields = None |
| 118 if bucket_listing_fields: |
| 119 get_fields = set() |
| 120 for field in bucket_listing_fields: |
| 121 get_fields.add(field) |
| 122 bucket_listing_fields = self._GetToListFields( |
| 123 get_fields=bucket_listing_fields) |
| 124 bucket_listing_fields.update(['items/name', 'prefixes']) |
| 125 get_fields.update(['name']) |
| 126 # If we're making versioned requests, ensure generation and |
| 127 # metageneration are also included. |
| 128 if single_version_request or self.all_versions: |
| 129 bucket_listing_fields.update(['items/generation', |
| 130 'items/metageneration']) |
| 131 get_fields.update(['generation', 'metageneration']) |
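# For example, bucket_listing_fields=['acl'] becomes
# set(['items/acl', 'items/name', 'prefixes']) and get_fields becomes
# set(['acl', 'name']), plus the generation fields for versioned requests.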
| 132 |
| 133 # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then |
| 134 # iterate over the expanded bucket strings and handle any object |
| 135 # wildcarding. |
| 136 for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']): |
| 137 bucket_url_string = bucket_listing_ref.url_string |
| 138 if self.wildcard_url.IsBucket(): |
| 139 # IsBucket() guarantees there are no prefix or object wildcards, and |
| 140 # thus this is a top-level listing of buckets. |
| 141 if expand_top_level_buckets: |
| 142 url = StorageUrlFromString(bucket_url_string) |
| 143 for obj_or_prefix in self.gsutil_api.ListObjects( |
| 144 url.bucket_name, delimiter='/', all_versions=self.all_versions, |
| 145 provider=self.wildcard_url.scheme, |
| 146 fields=bucket_listing_fields): |
| 147 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: |
| 148 yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data, |
| 149 with_version=self.all_versions) |
| 150 else: # CloudApi.CsObjectOrPrefixType.PREFIX: |
| 151 yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data) |
| 152 else: |
| 153 yield bucket_listing_ref |
| 154 else: |
| 155 # By default, assume a non-wildcarded URL is an object, not a prefix. |
| 156 # This prevents unnecessary listings (which are slower, more expensive, |
| 157 # and also subject to eventual consistency). |
| 158 if (not ContainsWildcard(self.wildcard_url.url_string) and |
| 159 self.wildcard_url.IsObject() and not self.all_versions): |
| 160 try: |
| 161 get_object = self.gsutil_api.GetObjectMetadata( |
| 162 self.wildcard_url.bucket_name, |
| 163 self.wildcard_url.object_name, |
| 164 generation=self.wildcard_url.generation, |
| 165 provider=self.wildcard_url.scheme, |
| 166 fields=get_fields) |
| 167 yield self._GetObjectRef( |
| 168 self.wildcard_url.bucket_url_string, get_object, |
| 169 with_version=(self.all_versions or single_version_request)) |
| 170 return |
| 171 except (NotFoundException, AccessDeniedException): |
| 172 # It's possible this is a prefix - try to list instead. |
| 173 pass |
| 174 |
| 175 # Expand iteratively by building prefix/delimiter bucket listing |
| 176 # request, filtering the results per the current level's wildcard |
| 177 # (if present), and continuing with the next component of the |
| 178 # wildcard. See _BuildBucketFilterStrings() documentation for details. |
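# For example, gs://bucket/abc/d*e/f*.txt is expanded by first listing
# with prefix='abc/d' and delimiter='/', keeping only results that match
# the current level's wildcard, and then re-enqueueing each matching
# prefix with the remaining suffix wildcard 'f*.txt' for the next pass.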
| 179 if single_version_request: |
| 180 url_string = '%s%s#%s' % (bucket_url_string, |
| 181 self.wildcard_url.object_name, |
| 182 self.wildcard_url.generation) |
| 183 else: |
| 184 # Rstrip any prefixes to correspond with rstripped prefix wildcard |
| 185 # from _BuildBucketFilterStrings(). |
| 186 url_string = '%s%s' % (bucket_url_string, |
| 187 StripOneSlash(self.wildcard_url.object_name) |
| 188 or '/') # Cover root object named '/' case. |
| 189 urls_needing_expansion = [url_string] |
| 190 while urls_needing_expansion: |
| 191 url = StorageUrlFromString(urls_needing_expansion.pop(0)) |
| 192 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = ( |
| 193 self._BuildBucketFilterStrings(url.object_name)) |
| 194 prog = re.compile(fnmatch.translate(prefix_wildcard)) |
| 195 |
| 196 # List bucket for objects matching prefix up to delimiter. |
| 197 for obj_or_prefix in self.gsutil_api.ListObjects( |
| 198 url.bucket_name, prefix=prefix, delimiter=delimiter, |
| 199 all_versions=self.all_versions or single_version_request, |
| 200 provider=self.wildcard_url.scheme, |
| 201 fields=bucket_listing_fields): |
| 202 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: |
| 203 gcs_object = obj_or_prefix.data |
| 204 if prog.match(gcs_object.name): |
| 205 if not suffix_wildcard or ( |
| 206 StripOneSlash(gcs_object.name) == suffix_wildcard): |
| 207 if not single_version_request or ( |
| 208 self._SingleVersionMatches(gcs_object.generation)): |
| 209 yield self._GetObjectRef( |
| 210 bucket_url_string, gcs_object, with_version=( |
| 211 self.all_versions or single_version_request)) |
| 212 else: # CloudApi.CsObjectOrPrefixType.PREFIX |
| 213 prefix = obj_or_prefix.data |
| 214 # If the prefix ends with a slash, remove it. Note that we only |
| 215 # remove one slash so that we can successfully enumerate dirs |
| 216 # containing multiple slashes. |
| 217 rstripped_prefix = StripOneSlash(prefix) |
| 218 if prog.match(rstripped_prefix): |
| 219 if suffix_wildcard and rstripped_prefix != suffix_wildcard: |
| 220 # There's more wildcard left to expand. |
| 221 url_append_string = '%s%s' % ( |
| 222 bucket_url_string, rstripped_prefix + '/' + |
| 223 suffix_wildcard) |
| 224 urls_needing_expansion.append(url_append_string) |
| 225 else: |
| 226 # No wildcard to expand, just yield the prefix |
| 227 yield self._GetPrefixRef(bucket_url_string, prefix) |
| 228 |
| 229 def _BuildBucketFilterStrings(self, wildcard): |
| 230 """Builds strings needed for querying a bucket and filtering results. |
| 231 |
| 232 This implements wildcard object name matching. |
| 233 |
| 234 Args: |
| 235 wildcard: The wildcard string to match to objects. |
| 236 |
| 237 Returns: |
| 238 (prefix, delimiter, prefix_wildcard, suffix_wildcard) |
| 239 where: |
| 240 prefix is the prefix to be sent in bucket GET request. |
| 241 delimiter is the delimiter to be sent in bucket GET request. |
| 242 prefix_wildcard is the wildcard to be used to filter bucket GET results. |
| 243 suffix_wildcard is the wildcard to be appended to filtered bucket GET
| 244 results for the next wildcard expansion iteration.
| 245 For example, given the wildcard gs://bucket/abc/d*e/f*.txt we
| 246 would build prefix='abc/d', delimiter='/', prefix_wildcard='abc/d*e',
| 247 and suffix_wildcard='f*.txt'. Using this prefix and delimiter for a bucket
| 248 listing request will then produce a listing result set that can be |
| 249 filtered using this prefix_wildcard; and we'd use this suffix_wildcard |
| 250 to feed into the next call(s) to _BuildBucketFilterStrings(), for the |
| 251 next iteration of listing/filtering. |
| 252 |
| 253 Raises: |
| 254 AssertionError if wildcard doesn't contain any wildcard chars. |
| 255 """ |
| 256 # Generate a request prefix if the object name part of the wildcard starts |
| 257 # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz'). |
| 258 match = WILDCARD_REGEX.search(wildcard) |
| 259 if not match: |
| 260 # Input "wildcard" has no wildcard chars, so just return tuple that will |
| 261 # cause a bucket listing to match the given input wildcard. Example: if |
| 262 # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc, |
| 263 # the next iteration will call _BuildBucketFilterStrings() with |
| 264 gs://bucket/dir/abc, and we will return prefix='dir/abc',
| 265 # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''. |
| 266 prefix = wildcard |
| 267 delimiter = '/' |
| 268 prefix_wildcard = wildcard |
| 269 suffix_wildcard = '' |
| 270 else: |
| 271 if match.start() > 0: |
| 272 # Wildcard does not occur at beginning of object name, so construct a |
| 273 # prefix string to send to server. |
| 274 prefix = wildcard[:match.start()] |
| 275 wildcard_part = wildcard[match.start():] |
| 276 else: |
| 277 prefix = None |
| 278 wildcard_part = wildcard |
| 279 end = wildcard_part.find('/') |
| 280 if end != -1: |
| 281 wildcard_part = wildcard_part[:end+1] |
| 282 # Remove trailing '/' so we will match gs://bucket/abc* as well as |
| 283 # gs://bucket/abc*/ with the same wildcard regex. |
| 284 prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part) |
| 285 suffix_wildcard = wildcard[match.end():] |
| 286 end = suffix_wildcard.find('/') |
| 287 if end == -1: |
| 288 suffix_wildcard = '' |
| 289 else: |
| 290 suffix_wildcard = suffix_wildcard[end+1:] |
| 291 # To implement recursive (**) wildcarding: if prefix_wildcard contains
| 292 # '**', don't send a delimiter, and append suffix_wildcard to the end
| 293 # of prefix_wildcard.
| 294 if prefix_wildcard.find('**') != -1: |
| 295 delimiter = None |
| 296 prefix_wildcard += suffix_wildcard |
| 297 suffix_wildcard = '' |
| 298 else: |
| 299 delimiter = '/' |
| 300 # The following debug output is useful for tracing how the algorithm |
| 301 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt |
| 302 if self.debug > 1: |
| 303 sys.stderr.write( |
| 304 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, ' |
| 305 'prefix_wildcard=%s, suffix_wildcard=%s\n' % |
| 306 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard)) |
| 307 return (prefix, delimiter, prefix_wildcard, suffix_wildcard) |
| 308 |
| 309 def _SingleVersionMatches(self, listed_generation): |
| 310 decoded_generation = GenerationFromUrlAndString(self.wildcard_url, |
| 311 listed_generation) |
| 312 return str(self.wildcard_url.generation) == str(decoded_generation) |
| 313 |
| 314 def _ExpandBucketWildcards(self, bucket_fields=None): |
| 315 """Expands bucket and provider wildcards. |
| 316 |
| 317 Builds a list of bucket url strings that can be iterated on. |
| 318 |
| 319 Args: |
| 320 bucket_fields: If present, populate only these metadata fields for |
| 321 buckets. Example value: ['acl', 'defaultObjectAcl'] |
| 322 |
| 323 Yields: |
| 324 BucketListingRefs of type BUCKET.
| 325 """ |
| 326 bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string) |
| 327 if (bucket_fields and set(bucket_fields) == set(['id']) and |
| 328 not ContainsWildcard(self.wildcard_url.bucket_name)): |
| 329 # If we just want the name of a non-wildcarded bucket URL, |
| 330 # don't make an RPC. |
| 331 yield BucketListingBucket(bucket_url) |
| 332 elif (self.wildcard_url.IsBucket() and
| 333 not ContainsWildcard(self.wildcard_url.bucket_name)): |
| 334 # If we have a non-wildcarded bucket URL, get just that bucket. |
| 335 yield BucketListingBucket( |
| 336 bucket_url, root_object=self.gsutil_api.GetBucket( |
| 337 self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme, |
| 338 fields=bucket_fields)) |
| 339 else: |
| 340 regex = fnmatch.translate(self.wildcard_url.bucket_name) |
| 341 prog = re.compile(regex) |
| 342 |
| 343 fields = self._GetToListFields(bucket_fields) |
| 344 if fields: |
| 345 fields.add('items/id') |
| 346 for bucket in self.gsutil_api.ListBuckets( |
| 347 fields=fields, project_id=self.project_id, |
| 348 provider=self.wildcard_url.scheme): |
| 349 if prog.match(bucket.id): |
| 350 url = StorageUrlFromString( |
| 351 '%s://%s/' % (self.wildcard_url.scheme, bucket.id)) |
| 352 yield BucketListingBucket(url, root_object=bucket) |
| 353 |
| 354 def _GetToListFields(self, get_fields=None): |
| 355 """Prepends 'items/' to the input fields and converts it to a set. |
| 356 |
| 357 This way field sets requested for GetBucket can be used in ListBucket calls. |
| 358 Note that the input set must contain only bucket or object fields; listing |
| 359 fields such as prefixes or nextPageToken should be added after calling |
| 360 this function. |
| 361 |
| 362 Args: |
| 363 get_fields: Iterable fields usable in GetBucket/GetObject calls. |
| 364 |
| 365 Returns: |
| 366 Set of fields usable in ListBuckets/ListObjects calls. |
| 367 """ |
| 368 if get_fields: |
| 369 list_fields = set() |
| 370 for field in get_fields: |
| 371 list_fields.add('items/' + field) |
| 372 return list_fields |
| 373 |
| 374 def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False): |
| 375 """Creates a BucketListingRef of type OBJECT from the arguments. |
| 376 |
| 377 Args: |
| 378 bucket_url_string: Wildcardless string describing the containing bucket. |
| 379 gcs_object: gsutil_api root Object for populating the BucketListingRef. |
| 380 with_version: If true, return a reference with a versioned string. |
| 381 |
| 382 Returns: |
| 383 BucketListingRef of type OBJECT. |
| 384 """ |
| 385 # Generation can be None in test mocks, so just return the |
| 386 # live object for simplicity. |
| 387 if with_version and gcs_object.generation is not None: |
| 388 generation_str = GenerationFromUrlAndString(self.wildcard_url, |
| 389 gcs_object.generation) |
| 390 object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name, |
| 391 generation_str) |
| 392 else: |
| 393 object_string = '%s%s' % (bucket_url_string, gcs_object.name) |
| 394 object_url = StorageUrlFromString(object_string) |
| 395 return BucketListingObject(object_url, root_object=gcs_object) |
| 396 |
| 397 def _GetPrefixRef(self, bucket_url_string, prefix): |
| 398 """Creates a BucketListingRef of type PREFIX from the arguments. |
| 399 |
| 400 Args: |
| 401 bucket_url_string: Wildcardless string describing the containing bucket. |
| 402 prefix: gsutil_api Prefix for populating the BucketListingRef |
| 403 |
| 404 Returns: |
| 405 BucketListingRef of type PREFIX. |
| 406 """ |
| 407 prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix)) |
| 408 return BucketListingPrefix(prefix_url, root_object=prefix) |
| 409 |
| 410 def IterBuckets(self, bucket_fields=None): |
| 411 """Iterates over the wildcard, returning refs for each expanded bucket. |
| 412 |
| 413 This ignores the object part of the URL entirely and expands only the |
| 414 bucket portion. It will yield BucketListingRefs of type BUCKET only.
| 415 |
| 416 Args: |
| 417 bucket_fields: Iterable fields to include in bucket listings. |
| 418 Ex. ['defaultObjectAcl', 'logging']. This function is |
| 419 responsible for converting these to listing-style |
| 420 format ['items/defaultObjectAcl', 'items/logging'], as |
| 421 well as adding any fields necessary for listing such as |
| 422 'items/id'. API implementation is responsible for
| 423 adding pagination fields. If this is None, all fields are |
| 424 returned. |
| 425 |
| 426 Yields: |
| 427 BucketListingRef of type BUCKET, or empty iterator if no matches. |
| 428 """ |
| 429 for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields): |
| 430 yield blr |
| 431 |
| 432 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): |
| 433 """Iterates over the wildcard, yielding bucket, prefix or object refs. |
| 434 |
| 435 Args: |
| 436 bucket_listing_fields: If present, populate only these metadata |
| 437 fields for listed objects. |
| 438 expand_top_level_buckets: If true and the wildcard expands only to |
| 439 Bucket(s), yields the expansion of each bucket |
| 440 into a top-level listing of prefixes and objects |
| 441 in that bucket instead of a BucketListingRef |
| 442 to that bucket. |
| 443 |
| 444 Yields: |
| 445 BucketListingRef, or empty iterator if no matches. |
| 446 """ |
| 447 for blr in self.__iter__( |
| 448 bucket_listing_fields=bucket_listing_fields, |
| 449 expand_top_level_buckets=expand_top_level_buckets): |
| 450 yield blr |
| 451 |
| 452 def IterObjects(self, bucket_listing_fields=None): |
| 453 """Iterates over the wildcard, yielding only object BucketListingRefs. |
| 454 |
| 455 Args: |
| 456 bucket_listing_fields: If present, populate only these metadata |
| 457 fields for listed objects. |
| 458 |
| 459 Yields: |
| 460 BucketListingRefs of type OBJECT or empty iterator if no matches. |
| 461 """ |
| 462 for blr in self.__iter__(bucket_listing_fields=bucket_listing_fields, |
| 463 expand_top_level_buckets=True): |
| 464 if blr.IsObject(): |
| 465 yield blr |
| 466 |
| 467 |
| 468 class FileWildcardIterator(WildcardIterator): |
| 469 """WildcardIterator subclass for files and directories. |
| 470 |
| 471 If you use recursive wildcards ('**') only a single such wildcard is |
| 472 supported. For example you could use the wildcard '**/*.txt' to list all .txt |
| 473 files in any subdirectory of the current directory, but you couldn't use a |
| 474 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt |
| 475 files in any subdirectory named 'abc'). |
| 476 """ |
| 477 |
| 478 def __init__(self, wildcard_url, debug=0): |
| 479 """Instantiates an iterator over BucketListingRefs matching wildcard URL. |
| 480 |
| 481 Args: |
| 482 wildcard_url: FileUrl that contains the wildcard to iterate. |
| 483 debug: Debug level (range 0..3). |
| 484 """ |
| 485 self.wildcard_url = wildcard_url |
| 486 self.debug = debug |
| 487 |
| 488 def __iter__(self): |
| 489 """Iterator that gets called when iterating over the file wildcard. |
| 490 |
| 491 In the case where no wildcard is present, returns a single matching file |
| 492 or directory. |
| 493 |
| 494 Raises: |
| 495 WildcardException: if invalid wildcard found. |
| 496 |
| 497 Yields: |
| 498 BucketListingRef of type OBJECT (for files) or PREFIX (for directories) |
| 499 """ |
| 500 wildcard = self.wildcard_url.object_name |
| 501 match = FLAT_LIST_REGEX.match(wildcard) |
| 502 if match: |
| 503 # Recursive wildcarding request ('.../**/...'). |
| 504 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*' |
| 505 base_dir = match.group('before')[:-1] |
| 506 remaining_wildcard = match.group('after') |
| 507 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and |
| 508 # remaining_wildcard = '/*' |
| 509 if remaining_wildcard.startswith('*'): |
| 510 raise WildcardException('Invalid wildcard with more than 2 consecutive ' |
| 511 '*s (%s)' % wildcard) |
| 512 # If there was no remaining wildcard past the recursive wildcard, |
| 513 # treat it as if it were a '*'. For example, file://tmp/** is equivalent |
| 514 # to file://tmp/**/* |
| 515 if not remaining_wildcard: |
| 516 remaining_wildcard = '*' |
| 517 # Skip slash(es). |
| 518 remaining_wildcard = remaining_wildcard.lstrip(os.sep) |
| 519 filepaths = self._IterDir(base_dir, remaining_wildcard) |
| 520 else: |
| 521 # Not a recursive wildcarding request. |
| 522 filepaths = glob.iglob(wildcard) |
| 523 for filepath in filepaths: |
| 524 expanded_url = StorageUrlFromString(filepath) |
| 525 if os.path.isdir(filepath): |
| 526 yield BucketListingPrefix(expanded_url) |
| 527 else: |
| 528 yield BucketListingObject(expanded_url) |
| 529 |
| 530 def _IterDir(self, directory, wildcard): |
| 531 """An iterator over the specified dir and wildcard.""" |
| 532 # UTF8-encode directory before passing it to os.walk() so if there are |
| 533 # non-valid UTF8 chars in the file name (e.g., that can happen if the file |
| 534 # originated on Windows) os.walk() will not attempt to decode and then die |
| 535 # with a "codec can't decode byte" error, and instead we can catch the error |
| 536 # at yield time and print a more informative error message. |
| 537 for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)): |
| 538 for f in fnmatch.filter(filenames, wildcard): |
| 539 try: |
| 540 yield os.path.join(dirpath, f).decode(UTF8) |
| 541 except UnicodeDecodeError: |
| 542 # Note: We considered several ways to deal with this, but each had |
| 543 # problems: |
| 544 # 1. Raise an exception and try to catch in a higher layer (the |
| 545 # gsutil cp command), so we can properly support the gsutil cp -c |
| 546 # option. That doesn't work because raising an exception during |
| 547 # iteration terminates the generator. |
| 548 # 2. Accumulate a list of bad filenames and skip processing each |
| 549 # during iteration, then raise at the end, with exception text |
| 550 # printing the bad paths. That doesn't work because iteration is |
| 551 # wrapped in PluralityCheckableIterator, so it's possible there |
| 552 # are not-yet-performed copy operations at the time we reach the |
| 553 # end of the iteration and raise the exception - which would cause |
| 554 # us to skip copying validly named files. Moreover, the gsutil |
| 555 # cp command loops over argv, so if you run the command gsutil cp |
| 556 # -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1 |
| 557 # would cause dir2 never to be visited. |
| 558 # 3. Print the invalid pathname and skip it during iteration. That |
| 559 # would work but would mean gsutil cp could exit with status 0 |
| 560 # even though some files weren't copied. |
| 561 # 4. Change the WildcardIterator to include an error status along with |
| 562 # the result. That would solve the problem but would be a |
| 563 # substantial change (WildcardIterator is used in many parts of |
| 564 # gsutil), and we didn't feel that magnitude of change was |
| 565 # warranted by this relatively uncommon corner case. |
| 566 # Instead we chose to abort when one such file is encountered, and |
| 567 # require the user to remove or rename the files and try again. |
| 568 raise CommandException('\n'.join(textwrap.wrap( |
| 569 'Invalid Unicode path encountered (%s). gsutil cannot proceed ' |
| 570 'with such files present. Please remove or rename this file and ' |
| 571 'try again. NOTE: the path printed above replaces the ' |
| 572 'problematic characters with a hex-encoded printable ' |
| 573 'representation. For more details (including how to convert to a ' |
| 574 'gsutil-compatible encoding) see `gsutil help encoding`.' % |
| 575 repr(os.path.join(dirpath, f))))) |
| 576 |
| 577 # pylint: disable=unused-argument |
| 578 def IterObjects(self, bucket_listing_fields=None): |
| 579 """Iterates over the wildcard, yielding only object (file) refs. |
| 580 |
| 581 Args: |
| 582 bucket_listing_fields: Ignored as filesystems don't have buckets. |
| 583 |
| 584 Yields: |
| 585 BucketListingRefs of type OBJECT or empty iterator if no matches. |
| 586 """ |
| 587 for bucket_listing_ref in self.IterAll(): |
| 588 if bucket_listing_ref.IsObject(): |
| 589 yield bucket_listing_ref |
| 590 |
| 591 # pylint: disable=unused-argument |
| 592 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): |
| 593 """Iterates over the wildcard, yielding BucketListingRefs. |
| 594 |
| 595 Args: |
| 596 bucket_listing_fields: Ignored; filesystems don't have buckets. |
| 597 expand_top_level_buckets: Ignored; filesystems don't have buckets. |
| 598 |
| 599 Yields: |
| 600 BucketListingRefs of type OBJECT (file) or PREFIX (directory), |
| 601 or empty iterator if no matches. |
| 602 """ |
| 603 for bucket_listing_ref in self.__iter__(): |
| 604 yield bucket_listing_ref |
| 605 |
| 606 def IterBuckets(self, unused_bucket_fields=None): |
| 607 """Placeholder to allow polymorphic use of WildcardIterator. |
| 608 |
| 609 Args: |
| 610 unused_bucket_fields: Ignored; filesystems don't have buckets. |
| 611 |
| 612 Raises: |
| 613 WildcardException: in all cases. |
| 614 """ |
| 615 raise WildcardException( |
| 616 'Iterating over Buckets not possible for file wildcards') |
| 617 |
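A brief illustrative sketch: expanding a local recursive wildcard with the
FileWildcardIterator above. The path shown is a placeholder, not taken from
this module.

  # Illustrative only -- the path is a placeholder.
  file_iterator = FileWildcardIterator(
      StorageUrlFromString('/tmp/photos/**/*.jpg'))
  for ref in file_iterator.IterObjects():
    print ref.url_string  # One BucketListingObject per matching file.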
| 618 |
| 619 class WildcardException(StandardError): |
| 620 """Exception raised for invalid wildcard URLs.""" |
| 621 |
| 622 def __init__(self, reason): |
| 623 StandardError.__init__(self) |
| 624 self.reason = reason |
| 625 |
| 626 def __repr__(self): |
| 627 return 'WildcardException: %s' % self.reason |
| 628 |
| 629 def __str__(self): |
| 630 return 'WildcardException: %s' % self.reason |
| 631 |
| 632 |
| 633 def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0, |
| 634 project_id=None): |
| 635 """Instantiate a WildcardIterator for the given URL string. |
| 636 |
| 637 Args: |
| 638 url_str: URL string naming wildcard object(s) to iterate. |
| 639 gsutil_api: Cloud storage interface. Passed in for thread safety, also |
| 640 settable for testing/mocking. |
| 641 all_versions: If true, the iterator yields all versions of objects |
| 642 matching the wildcard. If false, yields just the live |
| 643 object version. |
| 644 debug: Debug level to control debug output for iterator. |
| 645 project_id: Project ID to use for bucket listings.
| 646 |
| 647 Returns: |
| 648 A WildcardIterator that handles the requested iteration. |
| 649 """ |
| 650 |
| 651 url = StorageUrlFromString(url_str) |
| 652 if url.IsFileUrl(): |
| 653 return FileWildcardIterator(url, debug=debug) |
| 654 else: # Cloud URL |
| 655 return CloudWildcardIterator( |
| 656 url, gsutil_api, all_versions=all_versions, debug=debug, |
| 657 project_id=project_id) |
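A minimal usage sketch for the factory above, assuming an already-constructed
CloudApi implementation named gsutil_api (not defined here) and a placeholder
URL:

  # Illustrative only: 'gsutil_api' is an assumed CloudApi implementation.
  iterator = CreateWildcardIterator('gs://my-bucket/abc/*.txt', gsutil_api)
  for blr in iterator.IterAll(bucket_listing_fields=['name', 'size'],
                              expand_top_level_buckets=True):
    print blr.url_string  # Each matching object or prefix.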