third_party/gsutil/gslib/wildcard_iterator.py - Issue 12042069: Scripts to download files from google storage based on sha1 sums

Side by Side Diff: third_party/gsutil/gslib/wildcard_iterator.py

Issue 12042069: Scripts to download files from google storage based on sha1 sums (Closed) Base URL: https://chromium.googlesource.com/chromium/tools/depot_tools.git@master

Patch Set: Removed gsutil/tests and gsutil/docs Created 7 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 # Copyright 2010 Google Inc.

	2 #

	3 # Permission is hereby granted, free of charge, to any person obtaining a

	4 # copy of this software and associated documentation files (the

	5 # "Software"), to deal in the Software without restriction, including

	6 # without limitation the rights to use, copy, modify, merge, publish, dis-

	7 # tribute, sublicense, and/or sell copies of the Software, and to permit

	8 # persons to whom the Software is furnished to do so, subject to the fol-

	9 # lowing conditions:

	10 #

	11 # The above copyright notice and this permission notice shall be included

	12 # in all copies or substantial portions of the Software.

	13 #

	14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS

	15 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL-

	16 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT

	17 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,

	18 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

	19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS

	20 # IN THE SOFTWARE.

	21

	22 """Implementation of wildcarding over StorageUris.

	23

	24 StorageUri is an abstraction that Google introduced in the boto library,

	25 for representing storage provider-independent bucket and object names with

	26 a shorthand URI-like syntax (see boto/boto/storage_uri.py). The current

	27 class provides wildcarding support for StorageUri objects (including both

	28 bucket and file system objects), allowing one to express collections of

	29 objects with syntax like the following:

	30 gs://mybucket/images/*.png

	31 file:///tmp/???abc???

	32

	33 We provide wildcarding support as part of gsutil rather than as part

	34 of boto because wildcarding is really part of shell command-like

	35 functionality.

	36

	37 A comment about wildcard semantics: We support both single path component

	38 wildcards (e.g., using '*') and recursive wildcards (using '**'), for both

	39 file and cloud URIs. For example,

	40 gs://bucket/doc/*/*.html

	41 would enumerate HTML files one directory down from gs://bucket/doc, while

	42 gs://bucket/**/*.html

	43 would enumerate HTML files in all objects contained in the bucket.

	44

	45 Note also that if you use file system wildcards it's likely your shell

	46 interprets the wildcarding before passing the command to gsutil. For example:

	47 % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse

	48 would likely be expanded by the shell into the following before running gsutil:

	49 % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse

	50

	51 Note also that most shells don't support '**' wildcarding (I think only

	52 zsh does). If you want to use '**' wildcarding with such a shell you can

	53 single quote each wildcarded string, so it gets passed uninterpreted by the

	54 shell to gsutil (at which point gsutil will perform the wildcarding expansion):

	55 % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse

	56 """

	57

	58 import boto

	59 import fnmatch

	60 import glob

	61 import os

	62 import re

	63 import sys

	64 import urllib

	65

	66 from boto.s3.prefix import Prefix

	67 from boto.storage_uri import BucketStorageUri

	68 from bucket_listing_ref import BucketListingRef

	69

	70 # Regex to determine if a string contains any wildcards.

	71 WILDCARD_REGEX = re.compile('[*?\[\]]')

	72

	73 WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator'

	74 WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator'

	75

	76

	77 class WildcardIterator(object):

	78 """Base class for wildcarding over StorageUris.

	79

	80 This class implements support for iterating over StorageUris that

	81 contain wildcards.

	82

	83 The base class is abstract; you should instantiate using the

	84 wildcard_iterator() static factory method, which chooses the right

	85 implementation depending on the StorageUri.

	86 """

	87

	88 def __repr__(self):

	89 """Returns string representation of WildcardIterator."""

	90 return 'WildcardIterator(%s)' % self.wildcard_uri

	91

	92

	93 class CloudWildcardIterator(WildcardIterator):

	94 """WildcardIterator subclass for buckets and objects.

	95

	96 Iterates over BucketListingRef matching the StorageUri wildcard. It's

	97 much more efficient to request the Key from the BucketListingRef (via

	98 GetKey()) than to request the StorageUri and then call uri.get_key()

	99 to retrieve the key, for cases where you want to get metadata that's

	100 available in the Bucket (for example to get the name and size of

	101 each object), because that information is available in the bucket GET

	102 results. If you were to iterate over URIs for such cases and then get

	103 the name and size info from each resulting StorageUri, it would cause

	104 an additional object GET request for each of the result URIs.

	105 """

	106

	107 def __init__(self, wildcard_uri, proj_id_handler,

	108 bucket_storage_uri_class=BucketStorageUri, all_versions=False,

	109 headers=None, debug=0):

	110 """

	111 Instantiates an iterator over BucketListingRef matching given wildcard URI.

	112

	113 Args:

	114 wildcard_uri: StorageUri that contains the wildcard to iterate.

	115 proj_id_handler: ProjectIdHandler to use for current command.

	116 bucket_storage_uri_class: BucketStorageUri interface.

	117 Settable for testing/mocking.

	118 headers: Dictionary containing optional HTTP headers to pass to boto.

	119 debug: Debug level to pass in to boto connection (range 0..3).

	120 """

	121 self.wildcard_uri = wildcard_uri

	122 # Make a copy of the headers so any updates we make during wildcard

	123 # expansion aren't left in the input params (specifically, so we don't

	124 # include the x-goog-project-id header needed by a subset of cases, in

	125 # the data returned to caller, which could then be used in other cases

	126 # where that header must not be passed).

	127 if headers is None:

	128 self.headers = {}

	129 else:

	130 self.headers = headers.copy()

	131 self.proj_id_handler = proj_id_handler

	132 self.bucket_storage_uri_class = bucket_storage_uri_class

	133 self.all_versions = all_versions

	134 self.debug = debug

	135

	136 def __iter__(self):

	137 """Python iterator that gets called when iterating over cloud wildcard.

	138

	139 Yields:

	140 BucketListingRef, or empty iterator if no matches.

	141 """

	142 # First handle bucket wildcarding, if any.

	143 if ContainsWildcard(self.wildcard_uri.bucket_name):

	144 regex = fnmatch.translate(self.wildcard_uri.bucket_name)

	145 bucket_uris = []

	146 prog = re.compile(regex)

	147 self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR,

	148 self.wildcard_uri,

	149 self.headers)

	150 for b in self.wildcard_uri.get_all_buckets(headers=self.headers):

	151 if prog.match(b.name):

	152 # Use str(b.name) because get_all_buckets() returns Unicode

	153 # string, which when used to construct x-goog-copy-src metadata

	154 # requests for object-to-object copies causes pathname '/' chars

	155 # to be entity-encoded (bucket%2Fdir instead of bucket/dir),

	156 # which causes the request to fail.

	157 uri_str = '%s://%s' % (self.wildcard_uri.scheme,

	158 urllib.quote_plus(str(b.name)))

	159 bucket_uris.append(

	160 boto.storage_uri(

	161 uri_str, debug=self.debug,

	162 bucket_storage_uri_class=self.bucket_storage_uri_class,

	163 suppress_consec_slashes=False))

	164 else:

	165 bucket_uris = [self.wildcard_uri.clone_replace_name('')]

	166

	167 # Now iterate over bucket(s), and handle object wildcarding, if any.

	168 self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR,

	169 self.wildcard_uri,

	170 self.headers)

	171 for bucket_uri in bucket_uris:

	172 if self.wildcard_uri.names_bucket():

	173 # Bucket-only URI.

	174 yield BucketListingRef(bucket_uri, key=None, prefix=None,

	175 headers=self.headers)

	176 else:

	177 # URI contains an object name. If there's no wildcard just yield

	178 # the needed URI.

	179 if not ContainsWildcard(self.wildcard_uri.object_name):

	180 uri_to_yield = bucket_uri.clone_replace_name(

	181 self.wildcard_uri.object_name)

	182 yield BucketListingRef(uri_to_yield, key=None, prefix=None,

	183 headers=self.headers)

	184 else:

	185 # URI contains a wildcard. Expand iteratively by building

	186 # prefix/delimiter bucket listing request, filtering the results per

	187 # the current level's wildcard, and continuing with the next component

	188 # of the wildcard. See _BuildBucketFilterStrings() documentation

	189 # for details.

	190 #

	191 # Initialize the iteration with bucket name from bucket_uri but

	192 # object name from self.wildcard_uri. This is needed to handle cases

	193 # where both the bucket and object names contain wildcards.

	194 uris_needing_expansion = [

	195 bucket_uri.clone_replace_name(self.wildcard_uri.object_name)]

	196 while len(uris_needing_expansion) > 0:

	197 uri = uris_needing_expansion.pop(0)

	198 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (

	199 self._BuildBucketFilterStrings(uri.object_name))

	200 prog = re.compile(fnmatch.translate(prefix_wildcard))

	201 # List bucket for objects matching prefix up to delimiter.

	202 for key in bucket_uri.list_bucket(prefix=prefix,

	203 delimiter=delimiter,

	204 headers=self.headers,

	205 all_versions=self.all_versions):

	206 # Check that the prefix regex matches rstripped key.name (to

	207 # correspond with the rstripped prefix_wildcard from

	208 # _BuildBucketFilterStrings()).

	209 if prog.match(key.name.rstrip('/')):

	210 if suffix_wildcard and key.name.rstrip('/') != suffix_wildcard:

	211 if isinstance(key, Prefix):

	212 # There's more wildcard left to expand.

	213 uris_needing_expansion.append(

	214 uri.clone_replace_name(key.name.rstrip('/') + '/'

	215 + suffix_wildcard))

	216 else:

	217 # Done expanding.

	218 expanded_uri = uri.clone_replace_name(key.name)

	219 if isinstance(key, Prefix):

	220 yield BucketListingRef(expanded_uri, key=None, prefix=key,

	221 headers=self.headers)

	222 else:

	223 yield BucketListingRef(expanded_uri, key=key, prefix=None,

	224 headers=self.headers)

	225

	226 def _BuildBucketFilterStrings(self, wildcard):

	227 """

	228 Builds strings needed for querying a bucket and filtering results to

	229 implement wildcard object name matching.

	230

	231 Args:

	232 wildcard: The wildcard string to match to objects.

	233

	234 Returns:

	235 (prefix, delimiter, prefix_wildcard, suffix_wildcard)

	236 where:

	237 prefix is the prefix to be sent in bucket GET request.

	238 delimiter is the delimiter to be sent in bucket GET request.

	239 prefix_wildcard is the wildcard to be used to filter bucket GET results.

	240 suffix_wildcard is wildcard to be appended to filtered bucket GET

	241 results for next wildcard expansion iteration.

	242 For example, given the wildcard gs://bucket/abc/de/f.txt we

	243 would build prefix= abc/d, delimiter=/, prefix_wildcard=d*e, and

	244 suffix_wildcard=f*.txt. Using this prefix and delimiter for a bucket

	245 listing request will then produce a listing result set that can be

	246 filtered using this prefix_wildcard; and we'd use this suffix_wildcard

	247 to feed into the next call(s) to _BuildBucketFilterStrings(), for the

	248 next iteration of listing/filtering.

	249

	250 Raises:

	251 AssertionError if wildcard doesn't contain any wildcard chars.

	252 """

	253 # Generate a request prefix if the object name part of the wildcard starts

	254 # with a non-wildcard string (e.g., that's true for 'gs://bucket/abc*xyz').

	255 match = WILDCARD_REGEX.search(wildcard)

	256 if not match:

	257 # Input "wildcard" has no wildcard chars, so just return tuple that will

	258 # cause a bucket listing to match the given input wildcard. Example: if

	259 # previous iteration yielded gs://bucket/dir/ with suffix_wildcard abc,

	260 # the next iteration will call _BuildBucketFilterStrings() with

	261 # gs://bucket/dir/abc, and we will return prefix ='dir/abc',

	262 # delimiter='/', prefix_wildcard='dir/abc', and suffix_wildcard=''.

	263 prefix = wildcard

	264 delimiter = '/'

	265 prefix_wildcard = wildcard

	266 suffix_wildcard = ''

	267 else:

	268 if match.start() > 0:

	269 # Wildcard does not occur at beginning of object name, so construct a

	270 # prefix string to send to server.

	271 prefix = wildcard[:match.start()]

	272 wildcard_part = wildcard[match.start():]

	273 else:

	274 prefix = None

	275 wildcard_part = wildcard

	276 end = wildcard_part.find('/')

	277 if end != -1:

	278 wildcard_part = wildcard_part[:end+1]

	279 # Remove trailing '/' so we will match gs://bucket/abc* as well as

	280 # gs://bucket/abc*/ with the same wildcard regex.

	281 prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/')

	282 suffix_wildcard = wildcard[match.end():]

	283 end = suffix_wildcard.find('/')

	284 if end == -1:

	285 suffix_wildcard = ''

	286 else:

	287 suffix_wildcard = suffix_wildcard[end+1:]

	288 # To implement recursive (**) wildcarding, if prefix_wildcard

	289 # suffix_wildcard starts with '**' don't send a delimiter, and combine

	290 # suffix_wildcard at end of prefix_wildcard.

	291 if prefix_wildcard.find('**') != -1:

	292 delimiter = None

	293 prefix_wildcard = prefix_wildcard + suffix_wildcard

	294 suffix_wildcard = ''

	295 else:

	296 delimiter = '/'

	297 delim_pos = suffix_wildcard.find(delimiter)

	298 # The following debug output is useful for tracing how the algorithm

	299 # walks through a multi-part wildcard like gs://bucket/abc/de/f.txt

	300 if self.debug > 1:

	301 sys.stderr.write(

	302 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '

	303 'prefix_wildcard=%s, suffix_wildcard=%s\n' %

	304 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))

	305 return (prefix, delimiter, prefix_wildcard, suffix_wildcard)

	306

	307 def IterKeys(self):

	308 """

	309 Convenience iterator that runs underlying iterator and returns Key for each

	310 iteration.

	311

	312 Yields:

	313 Subclass of boto.s3.key.Key, or empty iterator if no matches.

	314

	315 Raises:

	316 WildcardException: for bucket-only uri.

	317 """

	318 for bucket_listing_ref in self. __iter__():

	319 if bucket_listing_ref.HasKey():

	320 yield bucket_listing_ref.GetKey()

	321

	322 def IterUris(self):

	323 """

	324 Convenience iterator that runs underlying iterator and returns StorageUri

	325 for each iteration.

	326

	327 Yields:

	328 StorageUri, or empty iterator if no matches.

	329 """

	330 for bucket_listing_ref in self. __iter__():

	331 yield bucket_listing_ref.GetUri()

	332

	333 def IterUrisForKeys(self):

	334 """

	335 Convenience iterator that runs underlying iterator and returns the

	336 StorageUri for each iterated BucketListingRef that has a Key.

	337

	338 Yields:

	339 StorageUri, or empty iterator if no matches.

	340 """

	341 for bucket_listing_ref in self. __iter__():

	342 if bucket_listing_ref.HasKey():

	343 yield bucket_listing_ref.GetUri()

	344

	345

	346 class FileWildcardIterator(WildcardIterator):

	347 """WildcardIterator subclass for files and directories.

	348

	349 If you use recursive wildcards ('**') only a single such wildcard is

	350 supported. For example you could use the wildcard '*/.txt' to list all .txt

	351 files in any subdirectory of the current directory, but you couldn't use a

	352 wildcard like '/abc//*.txt' (which would, if supported, let you find .txt

	353 files in any subdirectory named 'abc').

	354 """

	355

	356 def __init__(self, wildcard_uri, headers=None, debug=0):

	357 """

	358 Instantiate an iterator over BucketListingRefs matching given wildcard URI.

	359

	360 Args:

	361 wildcard_uri: StorageUri that contains the wildcard to iterate.

	362 headers: Dictionary containing optional HTTP headers to pass to boto.

	363 debug: Debug level to pass in to boto connection (range 0..3).

	364 """

	365 self.wildcard_uri = wildcard_uri

	366 self.headers = headers

	367 self.debug = debug

	368

	369 def __iter__(self):

	370 wildcard = self.wildcard_uri.object_name

	371 match = re.search('\\', wildcard)

	372 if match:

	373 # Recursive wildcarding request ('.../**/...').

	374 # Example input: wildcard = '/tmp/tmp2pQJAX/*/'

	375 base_dir = wildcard[:match.start()-1]

	376 remaining_wildcard = wildcard[match.start()+2:]

	377 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and

	378 # remaining_wildcard = '/*'

	379 if remaining_wildcard.startswith('*'):

	380 raise WildcardException('Invalid wildcard with more than 2 consecutive '

	381 '*s (%s)' % wildcard)

	382 # If there was no remaining wildcard past the recursive wildcard,

	383 # treat it as if it were a ''. For example, file://tmp/* is equivalent

	384 # to file://tmp/*/

	385 if not remaining_wildcard:

	386 remaining_wildcard = '*'

	387 # Skip slash(es).

	388 remaining_wildcard = remaining_wildcard.lstrip('/')

	389 filepaths = []

	390 for dirpath, unused_dirnames, filenames in os.walk(base_dir):

	391 filepaths.extend(

	392 os.path.join(dirpath, f) for f in fnmatch.filter(filenames,

	393 remaining_wildcard)

	394 )

	395 else:

	396 # Not a recursive wildcarding request.

	397 filepaths = glob.glob(wildcard)

	398 for filepath in filepaths:

	399 expanded_uri = self.wildcard_uri.clone_replace_name(filepath)

	400 yield BucketListingRef(expanded_uri)

	401

	402 def IterKeys(self):

	403 """

	404 Placeholder to allow polymorphic use of WildcardIterator.

	405

	406 Raises:

	407 WildcardException: in all cases.

	408 """

	409 raise WildcardException(

	410 'Iterating over Keys not possible for file wildcards')

	411

	412 def IterUris(self):

	413 """

	414 Convenience iterator that runs underlying iterator and returns StorageUri

	415 for each iteration.

	416

	417 Yields:

	418 StorageUri, or empty iterator if no matches.

	419 """

	420 for bucket_listing_ref in self. __iter__():

	421 yield bucket_listing_ref.GetUri()

	422

	423

	424 class WildcardException(StandardError):

	425 """Exception thrown for invalid wildcard URIs."""

	426

	427 def __init__(self, reason):

	428 StandardError.__init__(self)

	429 self.reason = reason

	430

	431 def __repr__(self):

	432 return 'WildcardException: %s' % self.reason

	433

	434 def __str__(self):

	435 return 'WildcardException: %s' % self.reason

	436

	437

	438 def wildcard_iterator(uri_or_str, proj_id_handler,

	439 bucket_storage_uri_class=BucketStorageUri,

	440 all_versions=False,

	441 headers=None, debug=0):

	442 """Instantiate a WildCardIterator for the given StorageUri.

	443

	444 Args:

	445 uri_or_str: StorageUri or URI string naming wildcard objects to iterate.

	446 proj_id_handler: ProjectIdHandler to use for current command.

	447 bucket_storage_uri_class: BucketStorageUri interface.

	448 Settable for testing/mocking.

	449 headers: Dictionary containing optional HTTP headers to pass to boto.

	450 debug: Debug level to pass in to boto connection (range 0..3).

	451

	452 Returns:

	453 A WildcardIterator that handles the requested iteration.

	454 """

	455

	456 if isinstance(uri_or_str, basestring):

	457 # Disable enforce_bucket_naming, to allow bucket names containing

	458 # wildcard chars.

	459 uri = boto.storage_uri(

	460 uri_or_str, debug=debug, validate=False,

	461 bucket_storage_uri_class=bucket_storage_uri_class,

	462 suppress_consec_slashes=False)

	463 else:

	464 uri = uri_or_str

	465

	466 if uri.is_cloud_uri():

	467 return CloudWildcardIterator(

	468 uri, proj_id_handler,

	469 bucket_storage_uri_class=bucket_storage_uri_class,

	470 all_versions=all_versions,

	471 headers=headers,

	472 debug=debug)

	473 elif uri.is_file_uri():

	474 return FileWildcardIterator(uri, headers=headers, debug=debug)

	475 else:

	476 raise WildcardException('Unexpected type of StorageUri (%s)' % uri)

	477

	478

	479 def ContainsWildcard(uri_or_str):

	480 """Checks whether uri_or_str contains a wildcard.

	481

	482 Args:

	483 uri_or_str: StorageUri or URI string to check.

	484

	485 Returns:

	486 bool indicator.

	487 """

	488 if isinstance(uri_or_str, basestring):

	489 return bool(WILDCARD_REGEX.search(uri_or_str))

	490 else:

	491 return bool(WILDCARD_REGEX.search(uri_or_str.uri))

OLD	NEW

« download_from_google_storage.py ('K') | « third_party/gsutil/gslib/util.py ('k') | third_party/gsutil/gsutil » ('j') | upload_to_google_storage.py » ('J')