Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(261)

Side by Side Diff: gslib/wildcard_iterator.py

Issue 698893003: Update checked in version of gsutil to version 4.6 (Closed) Base URL: http://dart.googlecode.com/svn/third_party/gsutil/
Patch Set: Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « gslib/util.py ('k') | gsutil » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # -*- coding: utf-8 -*-
1 # Copyright 2010 Google Inc. All Rights Reserved. 2 # Copyright 2010 Google Inc. All Rights Reserved.
2 # 3 #
3 # Permission is hereby granted, free of charge, to any person obtaining a 4 # Licensed under the Apache License, Version 2.0 (the "License");
4 # copy of this software and associated documentation files (the 5 # you may not use this file except in compliance with the License.
5 # "Software"), to deal in the Software without restriction, including 6 # You may obtain a copy of the License at
6 # without limitation the rights to use, copy, modify, merge, publish, dis-
7 # tribute, sublicense, and/or sell copies of the Software, and to permit
8 # persons to whom the Software is furnished to do so, subject to the fol-
9 # lowing conditions:
10 # 7 #
11 # The above copyright notice and this permission notice shall be included 8 # http://www.apache.org/licenses/LICENSE-2.0
12 # in all copies or substantial portions of the Software.
13 # 9 #
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 10 # Unless required by applicable law or agreed to in writing, software
15 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- 11 # distributed under the License is distributed on an "AS IS" BASIS,
16 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 13 # See the License for the specific language governing permissions and
18 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 14 # limitations under the License.
19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 15 """Wildcard iterator class and supporting functions."""
20 # IN THE SOFTWARE. 16
21 17 from __future__ import absolute_import
22 """Implementation of wildcarding over StorageUris. 18
23
24 StorageUri is an abstraction that Google introduced in the boto library,
25 for representing storage provider-independent bucket and object names with
26 a shorthand URI-like syntax (see boto/boto/storage_uri.py) The current
27 class provides wildcarding support for StorageUri objects (including both
28 bucket and file system objects), allowing one to express collections of
29 objects with syntax like the following:
30 gs://mybucket/images/*.png
31 file:///tmp/???abc???
32
33 We provide wildcarding support as part of gsutil rather than as part
34 of boto because wildcarding is really part of shell command-like
35 functionality.
36
37 A comment about wildcard semantics: We support both single path component
38 wildcards (e.g., using '*') and recursive wildcards (using '**'), for both
39 file and cloud URIs. For example,
40 gs://bucket/doc/*/*.html
41 would enumerate HTML files one directory down from gs://bucket/doc, while
42 gs://bucket/**/*.html
43 would enumerate HTML files in all objects contained in the bucket.
44
45 Note also that if you use file system wildcards it's likely your shell
46 interprets the wildcarding before passing the command to gsutil. For example:
47 % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse
48 would likely be expanded by the shell into the following before running gsutil:
49 % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse
50
51 Note also that most shells don't support '**' wildcarding (I think only
52 zsh does). If you want to use '**' wildcarding with such a shell you can
53 single quote each wildcarded string, so it gets passed uninterpreted by the
54 shell to gsutil (at which point gsutil will perform the wildcarding expansion):
55 % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse
56 """
57
58 import boto
59 import fnmatch 19 import fnmatch
60 import glob 20 import glob
61 import os 21 import os
62 import re 22 import re
63 import sys 23 import sys
64 import urllib 24 import textwrap
65 25
66 from boto.s3.prefix import Prefix 26 from gslib.bucket_listing_ref import BucketListingBucket
67 from boto.storage_uri import BucketStorageUri 27 from gslib.bucket_listing_ref import BucketListingObject
68 from bucket_listing_ref import BucketListingRef 28 from gslib.bucket_listing_ref import BucketListingPrefix
69 29 from gslib.cloud_api import AccessDeniedException
70 # Regex to determine if a string contains any wildcards. 30 from gslib.cloud_api import CloudApi
71 WILDCARD_REGEX = re.compile('[*?\[\]]') 31 from gslib.cloud_api import NotFoundException
72 32 from gslib.exception import CommandException
73 WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator' 33 from gslib.storage_url import ContainsWildcard
74 WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator' 34 from gslib.storage_url import StorageUrlFromString
35 from gslib.storage_url import StripOneSlash
36 from gslib.storage_url import WILDCARD_REGEX
37 from gslib.translation_helper import GenerationFromUrlAndString
38 from gslib.util import UTF8
39
40
41 FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
75 42
76 43
77 class WildcardIterator(object): 44 class WildcardIterator(object):
78 """Base class for wildcarding over StorageUris. 45 """Class for iterating over Google Cloud Storage strings containing wildcards.
79
80 This class implements support for iterating over StorageUris that
81 contain wildcards.
82 46
83 The base class is abstract; you should instantiate using the 47 The base class is abstract; you should instantiate using the
84 wildcard_iterator() static factory method, which chooses the right 48 wildcard_iterator() static factory method, which chooses the right
85 implementation depending on the StorageUri. 49 implementation depending on the base string.
86 """ 50 """
87 51
52 # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both
53 # and make one return the other.
88 def __repr__(self): 54 def __repr__(self):
89 """Returns string representation of WildcardIterator.""" 55 """Returns string representation of WildcardIterator."""
90 return 'WildcardIterator(%s)' % self.wildcard_uri 56 return 'WildcardIterator(%s)' % self.wildcard_url.url_string
91 57
92 58
93 class CloudWildcardIterator(WildcardIterator): 59 class CloudWildcardIterator(WildcardIterator):
94 """WildcardIterator subclass for buckets and objects. 60 """WildcardIterator subclass for buckets, bucket subdirs and objects.
95 61
96 Iterates over BucketListingRef matching the StorageUri wildcard. It's 62 Iterates over BucketListingRef matching the Url string wildcard. It's
97 much more efficient to request the Key from the BucketListingRef (via 63 much more efficient to first get metadata that's available in the Bucket
98 GetKey()) than to request the StorageUri and then call uri.get_key() 64 (for example to get the name and size of each object), because that
99 to retrieve the key, for cases where you want to get metadata that's 65 information is available in the object list results.
100 available in the Bucket (for example to get the name and size of
101 each object), because that information is available in the bucket GET
102 results. If you were to iterate over URIs for such cases and then get
103 the name and size info from each resulting StorageUri, it would cause
104 an additional object GET request for each of the result URIs.
105 """ 66 """
106 67
107 def __init__(self, wildcard_uri, proj_id_handler, 68 def __init__(self, wildcard_url, gsutil_api, all_versions=False,
108 bucket_storage_uri_class=BucketStorageUri, all_versions=False, 69 debug=0, project_id=None):
109 headers=None, debug=0): 70 """Instantiates an iterator that matches the wildcard URL.
71
72 Args:
73 wildcard_url: CloudUrl that contains the wildcard to iterate.
74 gsutil_api: Cloud storage interface. Passed in for thread safety, also
75 settable for testing/mocking.
76 all_versions: If true, the iterator yields all versions of objects
77 matching the wildcard. If false, yields just the live
78 object version.
79 debug: Debug level to control debug output for iterator.
80 project_id: Project ID to use for bucket listings.
110 """ 81 """
111 Instantiates an iterator over BucketListingRef matching given wildcard URI. 82 self.wildcard_url = wildcard_url
112
113 Args:
114 wildcard_uri: StorageUri that contains the wildcard to iterate.
115 proj_id_handler: ProjectIdHandler to use for current command.
116 bucket_storage_uri_class: BucketStorageUri interface.
117 Settable for testing/mocking.
118 headers: Dictionary containing optional HTTP headers to pass to boto.
119 debug: Debug level to pass in to boto connection (range 0..3).
120 """
121 self.wildcard_uri = wildcard_uri
122 # Make a copy of the headers so any updates we make during wildcard
123 # expansion aren't left in the input params (specifically, so we don't
124 # include the x-goog-project-id header needed by a subset of cases, in
125 # the data returned to caller, which could then be used in other cases
126 # where that header must not be passed).
127 if headers is None:
128 self.headers = {}
129 else:
130 self.headers = headers.copy()
131 self.proj_id_handler = proj_id_handler
132 self.bucket_storage_uri_class = bucket_storage_uri_class
133 self.all_versions = all_versions 83 self.all_versions = all_versions
134 self.debug = debug 84 self.debug = debug
135 85 self.gsutil_api = gsutil_api
136 def __iter__(self): 86 self.project_id = project_id
137 """Python iterator that gets called when iterating over cloud wildcard. 87
88 def __iter__(self, bucket_listing_fields=None,
89 expand_top_level_buckets=False):
90 """Iterator that gets called when iterating over the cloud wildcard.
91
92 In the case where no wildcard is present, returns a single matching object,
93 single matching prefix, or one of each if both exist.
94
95 Args:
96 bucket_listing_fields: Iterable fields to include in bucket listings.
97 Ex. ['name', 'acl']. Iterator is
98 responsible for converting these to list-style
99 format ['items/name', 'items/acl'] as well as
100 adding any fields necessary for listing such as
101 prefixes. API implemenation is responsible for
102 adding pagination fields. If this is None,
103 all fields are returned.
104 expand_top_level_buckets: If true, yield no BUCKET references. Instead,
105 expand buckets into top-level objects and
106 prefixes.
138 107
139 Yields: 108 Yields:
140 BucketListingRef, or empty iterator if no matches. 109 BucketListingRef of type BUCKET, OBJECT or PREFIX.
141 """ 110 """
142 # First handle bucket wildcarding, if any. 111 single_version_request = self.wildcard_url.HasGeneration()
143 if ContainsWildcard(self.wildcard_uri.bucket_name): 112
144 regex = fnmatch.translate(self.wildcard_uri.bucket_name) 113 # For wildcard expansion purposes, we need at a minimum the name of
145 bucket_uris = [] 114 # each object and prefix. If we're not using the default of requesting
146 prog = re.compile(regex) 115 # all fields, make sure at least these are requested. The Cloud API
147 self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR, 116 # tolerates specifying the same field twice.
148 self.wildcard_uri, 117 get_fields = None
149 self.headers) 118 if bucket_listing_fields:
150 for b in self.wildcard_uri.get_all_buckets(headers=self.headers): 119 get_fields = set()
151 if prog.match(b.name): 120 for field in bucket_listing_fields:
152 # Use str(b.name) because get_all_buckets() returns Unicode 121 get_fields.add(field)
153 # string, which when used to construct x-goog-copy-src metadata 122 bucket_listing_fields = self._GetToListFields(
154 # requests for object-to-object copies causes pathname '/' chars 123 get_fields=bucket_listing_fields)
155 # to be entity-encoded (bucket%2Fdir instead of bucket/dir), 124 bucket_listing_fields.update(['items/name', 'prefixes'])
156 # which causes the request to fail. 125 get_fields.update(['name'])
157 uri_str = '%s://%s' % (self.wildcard_uri.scheme, 126 # If we're making versioned requests, ensure generation and
158 urllib.quote_plus(str(b.name))) 127 # metageneration are also included.
159 # TODO: Move bucket_uris to a separate generator function that yields 128 if single_version_request or self.all_versions:
160 # values instead of pre-computing the list. 129 bucket_listing_fields.update(['items/generation',
161 bucket_uris.append( 130 'items/metageneration'])
162 boto.storage_uri( 131 get_fields.update(['generation', 'metageneration'])
163 uri_str, debug=self.debug, 132
164 bucket_storage_uri_class=self.bucket_storage_uri_class, 133 # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
165 suppress_consec_slashes=False)) 134 # iterate over the expanded bucket strings and handle any object
166 else: 135 # wildcarding.
167 bucket_uris = [self.wildcard_uri.clone_replace_name('')] 136 for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
168 137 bucket_url_string = bucket_listing_ref.url_string
169 # Now iterate over bucket(s), and handle object wildcarding, if any. 138 if self.wildcard_url.IsBucket():
170 self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR, 139 # IsBucket() guarantees there are no prefix or object wildcards, and
171 self.wildcard_uri, 140 # thus this is a top-level listing of buckets.
172 self.headers) 141 if expand_top_level_buckets:
173 for bucket_uri in bucket_uris: 142 url = StorageUrlFromString(bucket_url_string)
174 if self.wildcard_uri.names_bucket(): 143 for obj_or_prefix in self.gsutil_api.ListObjects(
175 # Bucket-only URI. 144 url.bucket_name, delimiter='/', all_versions=self.all_versions,
176 yield BucketListingRef(bucket_uri, key=None, prefix=None, 145 provider=self.wildcard_url.scheme,
177 headers=self.headers) 146 fields=bucket_listing_fields):
147 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
148 yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data,
149 with_version=self.all_versions)
150 else: # CloudApi.CsObjectOrPrefixType.PREFIX:
151 yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
152 else:
153 yield bucket_listing_ref
178 else: 154 else:
179 # URI contains an object name. If there's no wildcard just yield 155 # By default, assume a non-wildcarded URL is an object, not a prefix.
180 # the needed URI. 156 # This prevents unnecessary listings (which are slower, more expensive,
181 if not ContainsWildcard(self.wildcard_uri.object_name): 157 # and also subject to eventual consistency).
182 uri_to_yield = bucket_uri.clone_replace_name( 158 if (not ContainsWildcard(self.wildcard_url.url_string) and
183 self.wildcard_uri.object_name) 159 self.wildcard_url.IsObject() and not self.all_versions):
184 yield BucketListingRef(uri_to_yield, key=None, prefix=None, 160 try:
185 headers=self.headers) 161 get_object = self.gsutil_api.GetObjectMetadata(
162 self.wildcard_url.bucket_name,
163 self.wildcard_url.object_name,
164 generation=self.wildcard_url.generation,
165 provider=self.wildcard_url.scheme,
166 fields=get_fields)
167 yield self._GetObjectRef(
168 self.wildcard_url.bucket_url_string, get_object,
169 with_version=(self.all_versions or single_version_request))
170 return
171 except (NotFoundException, AccessDeniedException):
172 # It's possible this is a prefix - try to list instead.
173 pass
174
175 # Expand iteratively by building prefix/delimiter bucket listing
176 # request, filtering the results per the current level's wildcard
177 # (if present), and continuing with the next component of the
178 # wildcard. See _BuildBucketFilterStrings() documentation for details.
179 if single_version_request:
180 url_string = '%s%s#%s' % (bucket_url_string,
181 self.wildcard_url.object_name,
182 self.wildcard_url.generation)
186 else: 183 else:
187 # URI contains a wildcard. Expand iteratively by building 184 # Rstrip any prefixes to correspond with rstripped prefix wildcard
188 # prefix/delimiter bucket listing request, filtering the results per 185 # from _BuildBucketFilterStrings().
189 # the current level's wildcard, and continuing with the next component 186 url_string = '%s%s' % (bucket_url_string,
190 # of the wildcard. See _BuildBucketFilterStrings() documentation 187 StripOneSlash(self.wildcard_url.object_name)
191 # for details. 188 or '/') # Cover root object named '/' case.
192 # 189 urls_needing_expansion = [url_string]
193 # Initialize the iteration with bucket name from bucket_uri but 190 while urls_needing_expansion:
194 # object name from self.wildcard_uri. This is needed to handle cases 191 url = StorageUrlFromString(urls_needing_expansion.pop(0))
195 # where both the bucket and object names contain wildcards. 192 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
196 uris_needing_expansion = [ 193 self._BuildBucketFilterStrings(url.object_name))
197 bucket_uri.clone_replace_name(self.wildcard_uri.object_name)] 194 prog = re.compile(fnmatch.translate(prefix_wildcard))
198 while len(uris_needing_expansion) > 0: 195
199 uri = uris_needing_expansion.pop(0) 196 # List bucket for objects matching prefix up to delimiter.
200 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = ( 197 for obj_or_prefix in self.gsutil_api.ListObjects(
201 self._BuildBucketFilterStrings(uri.object_name)) 198 url.bucket_name, prefix=prefix, delimiter=delimiter,
202 prog = re.compile(fnmatch.translate(prefix_wildcard)) 199 all_versions=self.all_versions or single_version_request,
203 # List bucket for objects matching prefix up to delimiter. 200 provider=self.wildcard_url.scheme,
204 for key in bucket_uri.list_bucket(prefix=prefix, 201 fields=bucket_listing_fields):
205 delimiter=delimiter, 202 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
206 headers=self.headers, 203 gcs_object = obj_or_prefix.data
207 all_versions=self.all_versions): 204 if prog.match(gcs_object.name):
208 # Check that the prefix regex matches rstripped key.name (to 205 if not suffix_wildcard or (
209 # correspond with the rstripped prefix_wildcard from 206 StripOneSlash(gcs_object.name) == suffix_wildcard):
210 # _BuildBucketFilterStrings()). 207 if not single_version_request or (
211 keyname = key.name 208 self._SingleVersionMatches(gcs_object.generation)):
212 if isinstance(key, Prefix): 209 yield self._GetObjectRef(
213 keyname = keyname.rstrip('/') 210 bucket_url_string, gcs_object, with_version=(
214 if prog.match(keyname): 211 self.all_versions or single_version_request))
215 if suffix_wildcard and keyname != suffix_wildcard: 212 else: # CloudApi.CsObjectOrPrefixType.PREFIX
216 if isinstance(key, Prefix): 213 prefix = obj_or_prefix.data
217 # There's more wildcard left to expand. 214 # If the prefix ends with a slash, remove it. Note that we only
218 uris_needing_expansion.append( 215 # remove one slash so that we can successfully enumerate dirs
219 uri.clone_replace_name(key.name.rstrip('/') + '/' 216 # containing multiple slashes.
220 + suffix_wildcard)) 217 rstripped_prefix = StripOneSlash(prefix)
218 if prog.match(rstripped_prefix):
219 if suffix_wildcard and rstripped_prefix != suffix_wildcard:
220 # There's more wildcard left to expand.
221 url_append_string = '%s%s' % (
222 bucket_url_string, rstripped_prefix + '/' +
223 suffix_wildcard)
224 urls_needing_expansion.append(url_append_string)
221 else: 225 else:
222 # Done expanding. 226 # No wildcard to expand, just yield the prefix
223 expanded_uri = uri.clone_replace_key(key) 227 yield self._GetPrefixRef(bucket_url_string, prefix)
224
225 if isinstance(key, Prefix):
226 yield BucketListingRef(expanded_uri, key=None, prefix=key,
227 headers=self.headers)
228 else:
229 if self.all_versions:
230 yield BucketListingRef(expanded_uri, key=key, prefix=None,
231 headers=self.headers)
232 else:
233 # Yield BLR wrapping version-less URI.
234 yield BucketListingRef(expanded_uri.clone_replace_name(
235 expanded_uri.object_name), key=key, prefix=None,
236 headers=self.headers)
237 228
238 def _BuildBucketFilterStrings(self, wildcard): 229 def _BuildBucketFilterStrings(self, wildcard):
239 """ 230 """Builds strings needed for querying a bucket and filtering results.
240 Builds strings needed for querying a bucket and filtering results to 231
241 implement wildcard object name matching. 232 This implements wildcard object name matching.
242 233
243 Args: 234 Args:
244 wildcard: The wildcard string to match to objects. 235 wildcard: The wildcard string to match to objects.
245 236
246 Returns: 237 Returns:
247 (prefix, delimiter, prefix_wildcard, suffix_wildcard) 238 (prefix, delimiter, prefix_wildcard, suffix_wildcard)
248 where: 239 where:
249 prefix is the prefix to be sent in bucket GET request. 240 prefix is the prefix to be sent in bucket GET request.
250 delimiter is the delimiter to be sent in bucket GET request. 241 delimiter is the delimiter to be sent in bucket GET request.
251 prefix_wildcard is the wildcard to be used to filter bucket GET results. 242 prefix_wildcard is the wildcard to be used to filter bucket GET results.
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
283 prefix = wildcard[:match.start()] 274 prefix = wildcard[:match.start()]
284 wildcard_part = wildcard[match.start():] 275 wildcard_part = wildcard[match.start():]
285 else: 276 else:
286 prefix = None 277 prefix = None
287 wildcard_part = wildcard 278 wildcard_part = wildcard
288 end = wildcard_part.find('/') 279 end = wildcard_part.find('/')
289 if end != -1: 280 if end != -1:
290 wildcard_part = wildcard_part[:end+1] 281 wildcard_part = wildcard_part[:end+1]
291 # Remove trailing '/' so we will match gs://bucket/abc* as well as 282 # Remove trailing '/' so we will match gs://bucket/abc* as well as
292 # gs://bucket/abc*/ with the same wildcard regex. 283 # gs://bucket/abc*/ with the same wildcard regex.
293 prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/') 284 prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
294 suffix_wildcard = wildcard[match.end():] 285 suffix_wildcard = wildcard[match.end():]
295 end = suffix_wildcard.find('/') 286 end = suffix_wildcard.find('/')
296 if end == -1: 287 if end == -1:
297 suffix_wildcard = '' 288 suffix_wildcard = ''
298 else: 289 else:
299 suffix_wildcard = suffix_wildcard[end+1:] 290 suffix_wildcard = suffix_wildcard[end+1:]
300 # To implement recursive (**) wildcarding, if prefix_wildcard 291 # To implement recursive (**) wildcarding, if prefix_wildcard
301 # suffix_wildcard starts with '**' don't send a delimiter, and combine 292 # suffix_wildcard starts with '**' don't send a delimiter, and combine
302 # suffix_wildcard at end of prefix_wildcard. 293 # suffix_wildcard at end of prefix_wildcard.
303 if prefix_wildcard.find('**') != -1: 294 if prefix_wildcard.find('**') != -1:
304 delimiter = None 295 delimiter = None
305 prefix_wildcard = prefix_wildcard + suffix_wildcard 296 prefix_wildcard += suffix_wildcard
306 suffix_wildcard = '' 297 suffix_wildcard = ''
307 else: 298 else:
308 delimiter = '/' 299 delimiter = '/'
309 delim_pos = suffix_wildcard.find(delimiter)
310 # The following debug output is useful for tracing how the algorithm 300 # The following debug output is useful for tracing how the algorithm
311 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt 301 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
312 if self.debug > 1: 302 if self.debug > 1:
313 sys.stderr.write( 303 sys.stderr.write(
314 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, ' 304 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
315 'prefix_wildcard=%s, suffix_wildcard=%s\n' % 305 'prefix_wildcard=%s, suffix_wildcard=%s\n' %
316 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard)) 306 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
317 return (prefix, delimiter, prefix_wildcard, suffix_wildcard) 307 return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
318 308
319 def IterKeys(self): 309 def _SingleVersionMatches(self, listed_generation):
320 """ 310 decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
321 Convenience iterator that runs underlying iterator and returns Key for each 311 listed_generation)
322 iteration. 312 return str(self.wildcard_url.generation) == str(decoded_generation)
313
  def _ExpandBucketWildcards(self, bucket_fields=None):
    """Expands bucket and provider wildcards.

    Builds a list of bucket url strings that can be iterated on.

    Args:
      bucket_fields: If present, populate only these metadata fields for
                     buckets.  Example value: ['acl', 'defaultObjectAcl']

    Yields:
      BucketListingReferences of type BUCKET.
    """
    bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
    if (bucket_fields and set(bucket_fields) == set(['id']) and
        not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we just want the name of a non-wildcarded bucket URL,
      # don't make an RPC.
      yield BucketListingBucket(bucket_url)
    elif (self.wildcard_url.IsBucket() and
          not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we have a non-wildcarded bucket URL, get just that bucket.
      yield BucketListingBucket(
          bucket_url, root_object=self.gsutil_api.GetBucket(
              self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme,
              fields=bucket_fields))
    else:
      # Wildcarded bucket name: list all buckets in the project and filter
      # the names against the (fnmatch-translated) wildcard regex.
      regex = fnmatch.translate(self.wildcard_url.bucket_name)
      prog = re.compile(regex)

      fields = self._GetToListFields(bucket_fields)
      if fields:
        # 'items/id' is needed for the name match below.
        fields.add('items/id')
      for bucket in self.gsutil_api.ListBuckets(
          fields=fields, project_id=self.project_id,
          provider=self.wildcard_url.scheme):
        if prog.match(bucket.id):
          url = StorageUrlFromString(
              '%s://%s/' % (self.wildcard_url.scheme, bucket.id))
          yield BucketListingBucket(url, root_object=bucket)
353
354 def _GetToListFields(self, get_fields=None):
355 """Prepends 'items/' to the input fields and converts it to a set.
356
357 This way field sets requested for GetBucket can be used in ListBucket calls.
358 Note that the input set must contain only bucket or object fields; listing
359 fields such as prefixes or nextPageToken should be added after calling
360 this function.
361
362 Args:
363 get_fields: Iterable fields usable in GetBucket/GetObject calls.
364
365 Returns:
366 Set of fields usable in ListBuckets/ListObjects calls.
329 """ 367 """
330 for bucket_listing_ref in self. __iter__(): 368 if get_fields:
331 if bucket_listing_ref.HasKey(): 369 list_fields = set()
332 yield bucket_listing_ref.GetKey() 370 for field in get_fields:
371 list_fields.add('items/' + field)
372 return list_fields
333 373
334 def IterUris(self): 374 def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
375 """Creates a BucketListingRef of type OBJECT from the arguments.
376
377 Args:
378 bucket_url_string: Wildcardless string describing the containing bucket.
379 gcs_object: gsutil_api root Object for populating the BucketListingRef.
380 with_version: If true, return a reference with a versioned string.
381
382 Returns:
383 BucketListingRef of type OBJECT.
335 """ 384 """
336 Convenience iterator that runs underlying iterator and returns StorageUri 385 # Generation can be None in test mocks, so just return the
337 for each iteration. 386 # live object for simplicity.
387 if with_version and gcs_object.generation is not None:
388 generation_str = GenerationFromUrlAndString(self.wildcard_url,
389 gcs_object.generation)
390 object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
391 generation_str)
392 else:
393 object_string = '%s%s' % (bucket_url_string, gcs_object.name)
394 object_url = StorageUrlFromString(object_string)
395 return BucketListingObject(object_url, root_object=gcs_object)
396
397 def _GetPrefixRef(self, bucket_url_string, prefix):
398 """Creates a BucketListingRef of type PREFIX from the arguments.
399
400 Args:
401 bucket_url_string: Wildcardless string describing the containing bucket.
402 prefix: gsutil_api Prefix for populating the BucketListingRef
403
404 Returns:
405 BucketListingRef of type PREFIX.
406 """
407 prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
408 return BucketListingPrefix(prefix_url, root_object=prefix)
409
410 def IterBuckets(self, bucket_fields=None):
411 """Iterates over the wildcard, returning refs for each expanded bucket.
412
413 This ignores the object part of the URL entirely and expands only the
414 the bucket portion. It will yield BucketListingRefs of type BUCKET only.
415
416 Args:
417 bucket_fields: Iterable fields to include in bucket listings.
418 Ex. ['defaultObjectAcl', 'logging']. This function is
419 responsible for converting these to listing-style
420 format ['items/defaultObjectAcl', 'items/logging'], as
421 well as adding any fields necessary for listing such as
422 'items/id'. API implemenation is responsible for
423 adding pagination fields. If this is None, all fields are
424 returned.
338 425
339 Yields: 426 Yields:
340 StorageUri, or empty iterator if no matches. 427 BucketListingRef of type BUCKET, or empty iterator if no matches.
341 """ 428 """
342 for bucket_listing_ref in self. __iter__(): 429 for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
343 yield bucket_listing_ref.GetUri() 430 yield blr
344 431
345 def IterUrisForKeys(self): 432 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
346 """ 433 """Iterates over the wildcard, yielding bucket, prefix or object refs.
347 Convenience iterator that runs underlying iterator and returns the 434
348 StorageUri for each iterated BucketListingRef that has a Key. 435 Args:
436 bucket_listing_fields: If present, populate only these metadata
437 fields for listed objects.
438 expand_top_level_buckets: If true and the wildcard expands only to
439 Bucket(s), yields the expansion of each bucket
440 into a top-level listing of prefixes and objects
441 in that bucket instead of a BucketListingRef
442 to that bucket.
349 443
350 Yields: 444 Yields:
351 StorageUri, or empty iterator if no matches. 445 BucketListingRef, or empty iterator if no matches.
352 """ 446 """
353 for bucket_listing_ref in self. __iter__(): 447 for blr in self. __iter__(
354 if bucket_listing_ref.HasKey(): 448 bucket_listing_fields=bucket_listing_fields,
355 yield bucket_listing_ref.GetUri() 449 expand_top_level_buckets=expand_top_level_buckets):
450 yield blr
451
452 def IterObjects(self, bucket_listing_fields=None):
453 """Iterates over the wildcard, yielding only object BucketListingRefs.
454
455 Args:
456 bucket_listing_fields: If present, populate only these metadata
457 fields for listed objects.
458
459 Yields:
460 BucketListingRefs of type OBJECT or empty iterator if no matches.
461 """
462 for blr in self. __iter__(bucket_listing_fields=bucket_listing_fields,
463 expand_top_level_buckets=True):
464 if blr.IsObject():
465 yield blr
356 466
357 467
358 class FileWildcardIterator(WildcardIterator): 468 class FileWildcardIterator(WildcardIterator):
359 """WildcardIterator subclass for files and directories. 469 """WildcardIterator subclass for files and directories.
360 470
361 If you use recursive wildcards ('**') only a single such wildcard is 471 If you use recursive wildcards ('**') only a single such wildcard is
362 supported. For example you could use the wildcard '**/*.txt' to list all .txt 472 supported. For example you could use the wildcard '**/*.txt' to list all .txt
363 files in any subdirectory of the current directory, but you couldn't use a 473 files in any subdirectory of the current directory, but you couldn't use a
364 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt 474 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
365 files in any subdirectory named 'abc'). 475 files in any subdirectory named 'abc').
366 """ 476 """
367 477
368 def __init__(self, wildcard_uri, headers=None, debug=0): 478 def __init__(self, wildcard_url, debug=0):
369 """ 479 """Instantiates an iterator over BucketListingRefs matching wildcard URL.
370 Instantiate an iterator over BucketListingRefs matching given wildcard URI.
371 480
372 Args: 481 Args:
373 wildcard_uri: StorageUri that contains the wildcard to iterate. 482 wildcard_url: FileUrl that contains the wildcard to iterate.
374 headers: Dictionary containing optional HTTP headers to pass to boto. 483 debug: Debug level (range 0..3).
375 debug: Debug level to pass in to boto connection (range 0..3).
376 """ 484 """
377 self.wildcard_uri = wildcard_uri 485 self.wildcard_url = wildcard_url
378 self.headers = headers
379 self.debug = debug 486 self.debug = debug
380 487
381 def __iter__(self): 488 def __iter__(self):
382 wildcard = self.wildcard_uri.object_name 489 """Iterator that gets called when iterating over the file wildcard.
383 match = re.search('\*\*', wildcard) 490
491 In the case where no wildcard is present, returns a single matching file
492 or directory.
493
494 Raises:
495 WildcardException: if invalid wildcard found.
496
497 Yields:
498 BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
499 """
500 wildcard = self.wildcard_url.object_name
501 match = FLAT_LIST_REGEX.match(wildcard)
384 if match: 502 if match:
385 # Recursive wildcarding request ('.../**/...'). 503 # Recursive wildcarding request ('.../**/...').
386 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*' 504 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
387 base_dir = wildcard[:match.start()-1] 505 base_dir = match.group('before')[:-1]
388 remaining_wildcard = wildcard[match.start()+2:] 506 remaining_wildcard = match.group('after')
389 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and 507 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
390 # remaining_wildcard = '/*' 508 # remaining_wildcard = '/*'
391 if remaining_wildcard.startswith('*'): 509 if remaining_wildcard.startswith('*'):
392 raise WildcardException('Invalid wildcard with more than 2 consecutive ' 510 raise WildcardException('Invalid wildcard with more than 2 consecutive '
393 '*s (%s)' % wildcard) 511 '*s (%s)' % wildcard)
394 # If there was no remaining wildcard past the recursive wildcard, 512 # If there was no remaining wildcard past the recursive wildcard,
395 # treat it as if it were a '*'. For example, file://tmp/** is equivalent 513 # treat it as if it were a '*'. For example, file://tmp/** is equivalent
396 # to file://tmp/**/* 514 # to file://tmp/**/*
397 if not remaining_wildcard: 515 if not remaining_wildcard:
398 remaining_wildcard = '*' 516 remaining_wildcard = '*'
399 # Skip slash(es). 517 # Skip slash(es).
400 remaining_wildcard = remaining_wildcard.lstrip(os.sep) 518 remaining_wildcard = remaining_wildcard.lstrip(os.sep)
401 filepaths = self._iter_dir(base_dir, remaining_wildcard) 519 filepaths = self._IterDir(base_dir, remaining_wildcard)
402 else: 520 else:
403 # Not a recursive wildcarding request. 521 # Not a recursive wildcarding request.
404 filepaths = glob.iglob(wildcard) 522 filepaths = glob.iglob(wildcard)
405 for filepath in filepaths: 523 for filepath in filepaths:
406 expanded_uri = self.wildcard_uri.clone_replace_name(filepath) 524 expanded_url = StorageUrlFromString(filepath)
407 yield BucketListingRef(expanded_uri) 525 if os.path.isdir(filepath):
526 yield BucketListingPrefix(expanded_url)
527 else:
528 yield BucketListingObject(expanded_url)
408 529
409 def _iter_dir(self, dir, wildcard): 530 def _IterDir(self, directory, wildcard):
410 """An iterator over the specified dir and wildcard.""" 531 """An iterator over the specified dir and wildcard."""
411 for dirpath, unused_dirnames, filenames in os.walk(dir): 532 # UTF8-encode directory before passing it to os.walk() so if there are
533 # non-valid UTF8 chars in the file name (e.g., that can happen if the file
534 # originated on Windows) os.walk() will not attempt to decode and then die
535 # with a "codec can't decode byte" error, and instead we can catch the error
536 # at yield time and print a more informative error message.
537 for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)):
412 for f in fnmatch.filter(filenames, wildcard): 538 for f in fnmatch.filter(filenames, wildcard):
413 yield os.path.join(dirpath, f) 539 try:
540 yield os.path.join(dirpath, f).decode(UTF8)
541 except UnicodeDecodeError:
542 # Note: We considered several ways to deal with this, but each had
543 # problems:
544 # 1. Raise an exception and try to catch in a higher layer (the
545 # gsutil cp command), so we can properly support the gsutil cp -c
546 # option. That doesn't work because raising an exception during
547 # iteration terminates the generator.
548 # 2. Accumulate a list of bad filenames and skip processing each
549 # during iteration, then raise at the end, with exception text
550 # printing the bad paths. That doesn't work because iteration is
551 # wrapped in PluralityCheckableIterator, so it's possible there
552 # are not-yet-performed copy operations at the time we reach the
553 # end of the iteration and raise the exception - which would cause
554 # us to skip copying validly named files. Moreover, the gsutil
555 # cp command loops over argv, so if you run the command gsutil cp
556 # -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
557 # would cause dir2 never to be visited.
558 # 3. Print the invalid pathname and skip it during iteration. That
559 # would work but would mean gsutil cp could exit with status 0
560 # even though some files weren't copied.
561 # 4. Change the WildcardIterator to include an error status along with
562 # the result. That would solve the problem but would be a
563 # substantial change (WildcardIterator is used in many parts of
564 # gsutil), and we didn't feel that magnitude of change was
565 # warranted by this relatively uncommon corner case.
566 # Instead we chose to abort when one such file is encountered, and
567 # require the user to remove or rename the files and try again.
568 raise CommandException('\n'.join(textwrap.wrap(
569 'Invalid Unicode path encountered (%s). gsutil cannot proceed '
570 'with such files present. Please remove or rename this file and '
571 'try again.' % repr(os.path.join(dirpath, f)))))
414 572
415 def IterKeys(self): 573 # pylint: disable=unused-argument
574 def IterObjects(self, bucket_listing_fields=None):
575 """Iterates over the wildcard, yielding only object (file) refs.
576
577 Args:
578 bucket_listing_fields: Ignored as filesystems don't have buckets.
579
580 Yields:
581 BucketListingRefs of type OBJECT or empty iterator if no matches.
416 """ 582 """
417 Placeholder to allow polymorphic use of WildcardIterator. 583 for bucket_listing_ref in self.IterAll():
584 if bucket_listing_ref.IsObject():
585 yield bucket_listing_ref
586
587 # pylint: disable=unused-argument
588 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
589 """Iterates over the wildcard, yielding BucketListingRefs.
590
591 Args:
592 bucket_listing_fields: Ignored; filesystems don't have buckets.
593 expand_top_level_buckets: Ignored; filesystems don't have buckets.
594
595 Yields:
596 BucketListingRefs of type OBJECT (file) or PREFIX (directory),
597 or empty iterator if no matches.
598 """
599 for bucket_listing_ref in self.__iter__():
600 yield bucket_listing_ref
601
602 def IterBuckets(self, unused_bucket_fields=None):
603 """Placeholder to allow polymorphic use of WildcardIterator.
604
605 Args:
606 unused_bucket_fields: Ignored; filesystems don't have buckets.
418 607
419 Raises: 608 Raises:
420 WildcardException: in all cases. 609 WildcardException: in all cases.
421 """ 610 """
422 raise WildcardException( 611 raise WildcardException(
423 'Iterating over Keys not possible for file wildcards') 612 'Iterating over Buckets not possible for file wildcards')
424
425 def IterUris(self):
426 """
427 Convenience iterator that runs underlying iterator and returns StorageUri
428 for each iteration.
429
430 Yields:
431 StorageUri, or empty iterator if no matches.
432 """
433 for bucket_listing_ref in self. __iter__():
434 yield bucket_listing_ref.GetUri()
435 613
436 614
437 class WildcardException(StandardError): 615 class WildcardException(StandardError):
438 """Exception thrown for invalid wildcard URIs.""" 616 """Exception raised for invalid wildcard URLs."""
439 617
440 def __init__(self, reason): 618 def __init__(self, reason):
441 StandardError.__init__(self) 619 StandardError.__init__(self)
442 self.reason = reason 620 self.reason = reason
443 621
444 def __repr__(self): 622 def __repr__(self):
445 return 'WildcardException: %s' % self.reason 623 return 'WildcardException: %s' % self.reason
446 624
447 def __str__(self): 625 def __str__(self):
448 return 'WildcardException: %s' % self.reason 626 return 'WildcardException: %s' % self.reason
449 627
450 628
451 def wildcard_iterator(uri_or_str, proj_id_handler, 629 def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0,
452 bucket_storage_uri_class=BucketStorageUri, 630 project_id=None):
453 all_versions=False, 631 """Instantiate a WildcardIterator for the given URL string.
454 headers=None, debug=0):
455 """Instantiate a WildCardIterator for the given StorageUri.
456 632
457 Args: 633 Args:
458 uri_or_str: StorageUri or URI string naming wildcard objects to iterate. 634 url_str: URL string naming wildcard object(s) to iterate.
459 proj_id_handler: ProjectIdHandler to use for current command. 635 gsutil_api: Cloud storage interface. Passed in for thread safety, also
460 bucket_storage_uri_class: BucketStorageUri interface. 636 settable for testing/mocking.
461 Settable for testing/mocking. 637 all_versions: If true, the iterator yields all versions of objects
462 headers: Dictionary containing optional HTTP headers to pass to boto. 638 matching the wildcard. If false, yields just the live
463 debug: Debug level to pass in to boto connection (range 0..3). 639 object version.
640 debug: Debug level to control debug output for iterator.
641 project_id: Project id to use for bucket listings.
464 642
465 Returns: 643 Returns:
466 A WildcardIterator that handles the requested iteration. 644 A WildcardIterator that handles the requested iteration.
467 """ 645 """
468 646
469 if isinstance(uri_or_str, basestring): 647 url = StorageUrlFromString(url_str)
470 # Disable enforce_bucket_naming, to allow bucket names containing wildcard 648 if url.IsFileUrl():
471 # chars. 649 return FileWildcardIterator(url, debug=debug)
472 uri = boto.storage_uri( 650 else: # Cloud URL
473 uri_or_str, debug=debug, validate=False,
474 bucket_storage_uri_class=bucket_storage_uri_class,
475 suppress_consec_slashes=False)
476 else:
477 uri = uri_or_str
478
479 if uri.is_cloud_uri():
480 return CloudWildcardIterator( 651 return CloudWildcardIterator(
481 uri, proj_id_handler, 652 url, gsutil_api, all_versions=all_versions, debug=debug,
482 bucket_storage_uri_class=bucket_storage_uri_class, 653 project_id=project_id)
483 all_versions=all_versions,
484 headers=headers,
485 debug=debug)
486 elif uri.is_file_uri():
487 return FileWildcardIterator(uri, headers=headers, debug=debug)
488 else:
489 raise WildcardException('Unexpected type of StorageUri (%s)' % uri)
490
491
492 def ContainsWildcard(uri_or_str):
493 """Checks whether uri_or_str contains a wildcard.
494
495 Args:
496 uri_or_str: StorageUri or URI string to check.
497
498 Returns:
499 bool indicator.
500 """
501 if isinstance(uri_or_str, basestring):
502 return bool(WILDCARD_REGEX.search(uri_or_str))
503 else:
504 return bool(WILDCARD_REGEX.search(uri_or_str.uri))
OLDNEW
« no previous file with comments | « gslib/util.py ('k') | gsutil » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698