Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(261)

Side by Side Diff: gslib/wildcard_iterator.py

Issue 698893003: Update checked in version of gsutil to version 4.6 (Closed) Base URL: http://dart.googlecode.com/svn/third_party/gsutil/
Patch Set: Created 6 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « gslib/util.py ('k') | gsutil » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 # -*- coding: utf-8 -*-
1 # Copyright 2010 Google Inc. All Rights Reserved. 2 # Copyright 2010 Google Inc. All Rights Reserved.
2 # 3 #
3 # Permission is hereby granted, free of charge, to any person obtaining a 4 # Licensed under the Apache License, Version 2.0 (the "License");
4 # copy of this software and associated documentation files (the 5 # you may not use this file except in compliance with the License.
5 # "Software"), to deal in the Software without restriction, including 6 # You may obtain a copy of the License at
6 # without limitation the rights to use, copy, modify, merge, publish, dis-
7 # tribute, sublicense, and/or sell copies of the Software, and to permit
8 # persons to whom the Software is furnished to do so, subject to the fol-
9 # lowing conditions:
10 # 7 #
11 # The above copyright notice and this permission notice shall be included 8 # http://www.apache.org/licenses/LICENSE-2.0
12 # in all copies or substantial portions of the Software.
13 # 9 #
14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 10 # Unless required by applicable law or agreed to in writing, software
15 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- 11 # distributed under the License is distributed on an "AS IS" BASIS,
16 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 13 # See the License for the specific language governing permissions and
18 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 14 # limitations under the License.
19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 15 """Wildcard iterator class and supporting functions."""
20 # IN THE SOFTWARE. 16
21 17 from __future__ import absolute_import
22 """Implementation of wildcarding over StorageUris. 18
23
24 StorageUri is an abstraction that Google introduced in the boto library,
25 for representing storage provider-independent bucket and object names with
26 a shorthand URI-like syntax (see boto/boto/storage_uri.py) The current
27 class provides wildcarding support for StorageUri objects (including both
28 bucket and file system objects), allowing one to express collections of
29 objects with syntax like the following:
30 gs://mybucket/images/*.png
31 file:///tmp/???abc???
32
33 We provide wildcarding support as part of gsutil rather than as part
34 of boto because wildcarding is really part of shell command-like
35 functionality.
36
37 A comment about wildcard semantics: We support both single path component
38 wildcards (e.g., using '*') and recursive wildcards (using '**'), for both
39 file and cloud URIs. For example,
40 gs://bucket/doc/*/*.html
41 would enumerate HTML files one directory down from gs://bucket/doc, while
42 gs://bucket/**/*.html
43 would enumerate HTML files in all objects contained in the bucket.
44
45 Note also that if you use file system wildcards it's likely your shell
46 interprets the wildcarding before passing the command to gsutil. For example:
47 % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse
48 would likely be expanded by the shell into the following before running gsutil:
49 % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse
50
51 Note also that most shells don't support '**' wildcarding (I think only
52 zsh does). If you want to use '**' wildcarding with such a shell you can
53 single quote each wildcarded string, so it gets passed uninterpreted by the
54 shell to gsutil (at which point gsutil will perform the wildcarding expansion):
55 % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse
56 """
57
58 import boto
59 import fnmatch 19 import fnmatch
60 import glob 20 import glob
61 import os 21 import os
62 import re 22 import re
63 import sys 23 import sys
64 import urllib 24 import textwrap
65 25
66 from boto.s3.prefix import Prefix 26 from gslib.bucket_listing_ref import BucketListingBucket
67 from boto.storage_uri import BucketStorageUri 27 from gslib.bucket_listing_ref import BucketListingObject
68 from bucket_listing_ref import BucketListingRef 28 from gslib.bucket_listing_ref import BucketListingPrefix
69 29 from gslib.cloud_api import AccessDeniedException
70 # Regex to determine if a string contains any wildcards. 30 from gslib.cloud_api import CloudApi
71 WILDCARD_REGEX = re.compile('[*?\[\]]') 31 from gslib.cloud_api import NotFoundException
72 32 from gslib.exception import CommandException
73 WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator' 33 from gslib.storage_url import ContainsWildcard
74 WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator' 34 from gslib.storage_url import StorageUrlFromString
35 from gslib.storage_url import StripOneSlash
36 from gslib.storage_url import WILDCARD_REGEX
37 from gslib.translation_helper import GenerationFromUrlAndString
38 from gslib.util import UTF8
39
40
41 FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)')
75 42
76 43
77 class WildcardIterator(object): 44 class WildcardIterator(object):
78 """Base class for wildcarding over StorageUris. 45 """Class for iterating over Google Cloud Storage strings containing wildcards.
79
80 This class implements support for iterating over StorageUris that
81 contain wildcards.
82 46
83 The base class is abstract; you should instantiate using the 47 The base class is abstract; you should instantiate using the
84 wildcard_iterator() static factory method, which chooses the right 48 wildcard_iterator() static factory method, which chooses the right
85 implementation depending on the StorageUri. 49 implementation depending on the base string.
86 """ 50 """
87 51
52 # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both
53 # and make one return the other.
88 def __repr__(self): 54 def __repr__(self):
89 """Returns string representation of WildcardIterator.""" 55 """Returns string representation of WildcardIterator."""
90 return 'WildcardIterator(%s)' % self.wildcard_uri 56 return 'WildcardIterator(%s)' % self.wildcard_url.url_string
91 57
92 58
93 class CloudWildcardIterator(WildcardIterator): 59 class CloudWildcardIterator(WildcardIterator):
94 """WildcardIterator subclass for buckets and objects. 60 """WildcardIterator subclass for buckets, bucket subdirs and objects.
95 61
96 Iterates over BucketListingRef matching the StorageUri wildcard. It's 62 Iterates over BucketListingRef matching the Url string wildcard. It's
97 much more efficient to request the Key from the BucketListingRef (via 63 much more efficient to first get metadata that's available in the Bucket
98 GetKey()) than to request the StorageUri and then call uri.get_key() 64 (for example to get the name and size of each object), because that
99 to retrieve the key, for cases where you want to get metadata that's 65 information is available in the object list results.
100 available in the Bucket (for example to get the name and size of
101 each object), because that information is available in the bucket GET
102 results. If you were to iterate over URIs for such cases and then get
103 the name and size info from each resulting StorageUri, it would cause
104 an additional object GET request for each of the result URIs.
105 """ 66 """
106 67
107 def __init__(self, wildcard_uri, proj_id_handler, 68 def __init__(self, wildcard_url, gsutil_api, all_versions=False,
108 bucket_storage_uri_class=BucketStorageUri, all_versions=False, 69 debug=0, project_id=None):
109 headers=None, debug=0): 70 """Instantiates an iterator that matches the wildcard URL.
71
72 Args:
73 wildcard_url: CloudUrl that contains the wildcard to iterate.
74 gsutil_api: Cloud storage interface. Passed in for thread safety, also
75 settable for testing/mocking.
76 all_versions: If true, the iterator yields all versions of objects
77 matching the wildcard. If false, yields just the live
78 object version.
79 debug: Debug level to control debug output for iterator.
80 project_id: Project ID to use for bucket listings.
110 """ 81 """
111 Instantiates an iterator over BucketListingRef matching given wildcard URI. 82 self.wildcard_url = wildcard_url
112
113 Args:
114 wildcard_uri: StorageUri that contains the wildcard to iterate.
115 proj_id_handler: ProjectIdHandler to use for current command.
116 bucket_storage_uri_class: BucketStorageUri interface.
117 Settable for testing/mocking.
118 headers: Dictionary containing optional HTTP headers to pass to boto.
119 debug: Debug level to pass in to boto connection (range 0..3).
120 """
121 self.wildcard_uri = wildcard_uri
122 # Make a copy of the headers so any updates we make during wildcard
123 # expansion aren't left in the input params (specifically, so we don't
124 # include the x-goog-project-id header needed by a subset of cases, in
125 # the data returned to caller, which could then be used in other cases
126 # where that header must not be passed).
127 if headers is None:
128 self.headers = {}
129 else:
130 self.headers = headers.copy()
131 self.proj_id_handler = proj_id_handler
132 self.bucket_storage_uri_class = bucket_storage_uri_class
133 self.all_versions = all_versions 83 self.all_versions = all_versions
134 self.debug = debug 84 self.debug = debug
135 85 self.gsutil_api = gsutil_api
136 def __iter__(self): 86 self.project_id = project_id
137 """Python iterator that gets called when iterating over cloud wildcard. 87
88 def __iter__(self, bucket_listing_fields=None,
89 expand_top_level_buckets=False):
90 """Iterator that gets called when iterating over the cloud wildcard.
91
92 In the case where no wildcard is present, returns a single matching object,
93 single matching prefix, or one of each if both exist.
94
95 Args:
96 bucket_listing_fields: Iterable fields to include in bucket listings.
97 Ex. ['name', 'acl']. Iterator is
98 responsible for converting these to list-style
99 format ['items/name', 'items/acl'] as well as
100 adding any fields necessary for listing such as
101 prefixes. API implemenation is responsible for
102 adding pagination fields. If this is None,
103 all fields are returned.
104 expand_top_level_buckets: If true, yield no BUCKET references. Instead,
105 expand buckets into top-level objects and
106 prefixes.
138 107
139 Yields: 108 Yields:
140 BucketListingRef, or empty iterator if no matches. 109 BucketListingRef of type BUCKET, OBJECT or PREFIX.
141 """ 110 """
142 # First handle bucket wildcarding, if any. 111 single_version_request = self.wildcard_url.HasGeneration()
143 if ContainsWildcard(self.wildcard_uri.bucket_name): 112
144 regex = fnmatch.translate(self.wildcard_uri.bucket_name) 113 # For wildcard expansion purposes, we need at a minimum the name of
145 bucket_uris = [] 114 # each object and prefix. If we're not using the default of requesting
146 prog = re.compile(regex) 115 # all fields, make sure at least these are requested. The Cloud API
147 self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR, 116 # tolerates specifying the same field twice.
148 self.wildcard_uri, 117 get_fields = None
149 self.headers) 118 if bucket_listing_fields:
150 for b in self.wildcard_uri.get_all_buckets(headers=self.headers): 119 get_fields = set()
151 if prog.match(b.name): 120 for field in bucket_listing_fields:
152 # Use str(b.name) because get_all_buckets() returns Unicode 121 get_fields.add(field)
153 # string, which when used to construct x-goog-copy-src metadata 122 bucket_listing_fields = self._GetToListFields(
154 # requests for object-to-object copies causes pathname '/' chars 123 get_fields=bucket_listing_fields)
155 # to be entity-encoded (bucket%2Fdir instead of bucket/dir), 124 bucket_listing_fields.update(['items/name', 'prefixes'])
156 # which causes the request to fail. 125 get_fields.update(['name'])
157 uri_str = '%s://%s' % (self.wildcard_uri.scheme, 126 # If we're making versioned requests, ensure generation and
158 urllib.quote_plus(str(b.name))) 127 # metageneration are also included.
159 # TODO: Move bucket_uris to a separate generator function that yields 128 if single_version_request or self.all_versions:
160 # values instead of pre-computing the list. 129 bucket_listing_fields.update(['items/generation',
161 bucket_uris.append( 130 'items/metageneration'])
162 boto.storage_uri( 131 get_fields.update(['generation', 'metageneration'])
163 uri_str, debug=self.debug, 132
164 bucket_storage_uri_class=self.bucket_storage_uri_class, 133 # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then
165 suppress_consec_slashes=False)) 134 # iterate over the expanded bucket strings and handle any object
166 else: 135 # wildcarding.
167 bucket_uris = [self.wildcard_uri.clone_replace_name('')] 136 for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']):
168 137 bucket_url_string = bucket_listing_ref.url_string
169 # Now iterate over bucket(s), and handle object wildcarding, if any. 138 if self.wildcard_url.IsBucket():
170 self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR, 139 # IsBucket() guarantees there are no prefix or object wildcards, and
171 self.wildcard_uri, 140 # thus this is a top-level listing of buckets.
172 self.headers) 141 if expand_top_level_buckets:
173 for bucket_uri in bucket_uris: 142 url = StorageUrlFromString(bucket_url_string)
174 if self.wildcard_uri.names_bucket(): 143 for obj_or_prefix in self.gsutil_api.ListObjects(
175 # Bucket-only URI. 144 url.bucket_name, delimiter='/', all_versions=self.all_versions,
176 yield BucketListingRef(bucket_uri, key=None, prefix=None, 145 provider=self.wildcard_url.scheme,
177 headers=self.headers) 146 fields=bucket_listing_fields):
147 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
148 yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data,
149 with_version=self.all_versions)
150 else: # CloudApi.CsObjectOrPrefixType.PREFIX:
151 yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data)
152 else:
153 yield bucket_listing_ref
178 else: 154 else:
179 # URI contains an object name. If there's no wildcard just yield 155 # By default, assume a non-wildcarded URL is an object, not a prefix.
180 # the needed URI. 156 # This prevents unnecessary listings (which are slower, more expensive,
181 if not ContainsWildcard(self.wildcard_uri.object_name): 157 # and also subject to eventual consistency).
182 uri_to_yield = bucket_uri.clone_replace_name( 158 if (not ContainsWildcard(self.wildcard_url.url_string) and
183 self.wildcard_uri.object_name) 159 self.wildcard_url.IsObject() and not self.all_versions):
184 yield BucketListingRef(uri_to_yield, key=None, prefix=None, 160 try:
185 headers=self.headers) 161 get_object = self.gsutil_api.GetObjectMetadata(
162 self.wildcard_url.bucket_name,
163 self.wildcard_url.object_name,
164 generation=self.wildcard_url.generation,
165 provider=self.wildcard_url.scheme,
166 fields=get_fields)
167 yield self._GetObjectRef(
168 self.wildcard_url.bucket_url_string, get_object,
169 with_version=(self.all_versions or single_version_request))
170 return
171 except (NotFoundException, AccessDeniedException):
172 # It's possible this is a prefix - try to list instead.
173 pass
174
175 # Expand iteratively by building prefix/delimiter bucket listing
176 # request, filtering the results per the current level's wildcard
177 # (if present), and continuing with the next component of the
178 # wildcard. See _BuildBucketFilterStrings() documentation for details.
179 if single_version_request:
180 url_string = '%s%s#%s' % (bucket_url_string,
181 self.wildcard_url.object_name,
182 self.wildcard_url.generation)
186 else: 183 else:
187 # URI contains a wildcard. Expand iteratively by building 184 # Rstrip any prefixes to correspond with rstripped prefix wildcard
188 # prefix/delimiter bucket listing request, filtering the results per 185 # from _BuildBucketFilterStrings().
189 # the current level's wildcard, and continuing with the next component 186 url_string = '%s%s' % (bucket_url_string,
190 # of the wildcard. See _BuildBucketFilterStrings() documentation 187 StripOneSlash(self.wildcard_url.object_name)
191 # for details. 188 or '/') # Cover root object named '/' case.
192 # 189 urls_needing_expansion = [url_string]
193 # Initialize the iteration with bucket name from bucket_uri but 190 while urls_needing_expansion:
194 # object name from self.wildcard_uri. This is needed to handle cases 191 url = StorageUrlFromString(urls_needing_expansion.pop(0))
195 # where both the bucket and object names contain wildcards. 192 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = (
196 uris_needing_expansion = [ 193 self._BuildBucketFilterStrings(url.object_name))
197 bucket_uri.clone_replace_name(self.wildcard_uri.object_name)] 194 prog = re.compile(fnmatch.translate(prefix_wildcard))
198 while len(uris_needing_expansion) > 0: 195
199 uri = uris_needing_expansion.pop(0) 196 # List bucket for objects matching prefix up to delimiter.
200 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = ( 197 for obj_or_prefix in self.gsutil_api.ListObjects(
201 self._BuildBucketFilterStrings(uri.object_name)) 198 url.bucket_name, prefix=prefix, delimiter=delimiter,
202 prog = re.compile(fnmatch.translate(prefix_wildcard)) 199 all_versions=self.all_versions or single_version_request,
203 # List bucket for objects matching prefix up to delimiter. 200 provider=self.wildcard_url.scheme,
204 for key in bucket_uri.list_bucket(prefix=prefix, 201 fields=bucket_listing_fields):
205 delimiter=delimiter, 202 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT:
206 headers=self.headers, 203 gcs_object = obj_or_prefix.data
207 all_versions=self.all_versions): 204 if prog.match(gcs_object.name):
208 # Check that the prefix regex matches rstripped key.name (to 205 if not suffix_wildcard or (
209 # correspond with the rstripped prefix_wildcard from 206 StripOneSlash(gcs_object.name) == suffix_wildcard):
210 # _BuildBucketFilterStrings()). 207 if not single_version_request or (
211 keyname = key.name 208 self._SingleVersionMatches(gcs_object.generation)):
212 if isinstance(key, Prefix): 209 yield self._GetObjectRef(
213 keyname = keyname.rstrip('/') 210 bucket_url_string, gcs_object, with_version=(
214 if prog.match(keyname): 211 self.all_versions or single_version_request))
215 if suffix_wildcard and keyname != suffix_wildcard: 212 else: # CloudApi.CsObjectOrPrefixType.PREFIX
216 if isinstance(key, Prefix): 213 prefix = obj_or_prefix.data
217 # There's more wildcard left to expand. 214 # If the prefix ends with a slash, remove it. Note that we only
218 uris_needing_expansion.append( 215 # remove one slash so that we can successfully enumerate dirs
219 uri.clone_replace_name(key.name.rstrip('/') + '/' 216 # containing multiple slashes.
220 + suffix_wildcard)) 217 rstripped_prefix = StripOneSlash(prefix)
218 if prog.match(rstripped_prefix):
219 if suffix_wildcard and rstripped_prefix != suffix_wildcard:
220 # There's more wildcard left to expand.
221 url_append_string = '%s%s' % (
222 bucket_url_string, rstripped_prefix + '/' +
223 suffix_wildcard)
224 urls_needing_expansion.append(url_append_string)
221 else: 225 else:
222 # Done expanding. 226 # No wildcard to expand, just yield the prefix
223 expanded_uri = uri.clone_replace_key(key) 227 yield self._GetPrefixRef(bucket_url_string, prefix)
224
225 if isinstance(key, Prefix):
226 yield BucketListingRef(expanded_uri, key=None, prefix=key,
227 headers=self.headers)
228 else:
229 if self.all_versions:
230 yield BucketListingRef(expanded_uri, key=key, prefix=None,
231 headers=self.headers)
232 else:
233 # Yield BLR wrapping version-less URI.
234 yield BucketListingRef(expanded_uri.clone_replace_name(
235 expanded_uri.object_name), key=key, prefix=None,
236 headers=self.headers)
237 228
238 def _BuildBucketFilterStrings(self, wildcard): 229 def _BuildBucketFilterStrings(self, wildcard):
239 """ 230 """Builds strings needed for querying a bucket and filtering results.
240 Builds strings needed for querying a bucket and filtering results to 231
241 implement wildcard object name matching. 232 This implements wildcard object name matching.
242 233
243 Args: 234 Args:
244 wildcard: The wildcard string to match to objects. 235 wildcard: The wildcard string to match to objects.
245 236
246 Returns: 237 Returns:
247 (prefix, delimiter, prefix_wildcard, suffix_wildcard) 238 (prefix, delimiter, prefix_wildcard, suffix_wildcard)
248 where: 239 where:
249 prefix is the prefix to be sent in bucket GET request. 240 prefix is the prefix to be sent in bucket GET request.
250 delimiter is the delimiter to be sent in bucket GET request. 241 delimiter is the delimiter to be sent in bucket GET request.
251 prefix_wildcard is the wildcard to be used to filter bucket GET results. 242 prefix_wildcard is the wildcard to be used to filter bucket GET results.
(...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after
283 prefix = wildcard[:match.start()] 274 prefix = wildcard[:match.start()]
284 wildcard_part = wildcard[match.start():] 275 wildcard_part = wildcard[match.start():]
285 else: 276 else:
286 prefix = None 277 prefix = None
287 wildcard_part = wildcard 278 wildcard_part = wildcard
288 end = wildcard_part.find('/') 279 end = wildcard_part.find('/')
289 if end != -1: 280 if end != -1:
290 wildcard_part = wildcard_part[:end+1] 281 wildcard_part = wildcard_part[:end+1]
291 # Remove trailing '/' so we will match gs://bucket/abc* as well as 282 # Remove trailing '/' so we will match gs://bucket/abc* as well as
292 # gs://bucket/abc*/ with the same wildcard regex. 283 # gs://bucket/abc*/ with the same wildcard regex.
293 prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/') 284 prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part)
294 suffix_wildcard = wildcard[match.end():] 285 suffix_wildcard = wildcard[match.end():]
295 end = suffix_wildcard.find('/') 286 end = suffix_wildcard.find('/')
296 if end == -1: 287 if end == -1:
297 suffix_wildcard = '' 288 suffix_wildcard = ''
298 else: 289 else:
299 suffix_wildcard = suffix_wildcard[end+1:] 290 suffix_wildcard = suffix_wildcard[end+1:]
300 # To implement recursive (**) wildcarding, if prefix_wildcard 291 # To implement recursive (**) wildcarding, if prefix_wildcard
301 # suffix_wildcard starts with '**' don't send a delimiter, and combine 292 # suffix_wildcard starts with '**' don't send a delimiter, and combine
302 # suffix_wildcard at end of prefix_wildcard. 293 # suffix_wildcard at end of prefix_wildcard.
303 if prefix_wildcard.find('**') != -1: 294 if prefix_wildcard.find('**') != -1:
304 delimiter = None 295 delimiter = None
305 prefix_wildcard = prefix_wildcard + suffix_wildcard 296 prefix_wildcard += suffix_wildcard
306 suffix_wildcard = '' 297 suffix_wildcard = ''
307 else: 298 else:
308 delimiter = '/' 299 delimiter = '/'
309 delim_pos = suffix_wildcard.find(delimiter)
310 # The following debug output is useful for tracing how the algorithm 300 # The following debug output is useful for tracing how the algorithm
311 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt 301 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt
312 if self.debug > 1: 302 if self.debug > 1:
313 sys.stderr.write( 303 sys.stderr.write(
314 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, ' 304 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, '
315 'prefix_wildcard=%s, suffix_wildcard=%s\n' % 305 'prefix_wildcard=%s, suffix_wildcard=%s\n' %
316 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard)) 306 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard))
317 return (prefix, delimiter, prefix_wildcard, suffix_wildcard) 307 return (prefix, delimiter, prefix_wildcard, suffix_wildcard)
318 308
319 def IterKeys(self): 309 def _SingleVersionMatches(self, listed_generation):
320 """ 310 decoded_generation = GenerationFromUrlAndString(self.wildcard_url,
321 Convenience iterator that runs underlying iterator and returns Key for each 311 listed_generation)
322 iteration. 312 return str(self.wildcard_url.generation) == str(decoded_generation)
313
  def _ExpandBucketWildcards(self, bucket_fields=None):
    """Expands bucket and provider wildcards.

    Builds a list of bucket url strings that can be iterated on.

    Args:
      bucket_fields: If present, populate only these metadata fields for
                     buckets.  Example value: ['acl', 'defaultObjectAcl']

    Yields:
      BucketListingReferences of type BUCKET.
    """
    bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string)
    if (bucket_fields and set(bucket_fields) == set(['id']) and
        not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we just want the name of a non-wildcarded bucket URL,
      # don't make an RPC.
      yield BucketListingBucket(bucket_url)
    elif (self.wildcard_url.IsBucket() and
          not ContainsWildcard(self.wildcard_url.bucket_name)):
      # If we have a non-wildcarded bucket URL, get just that bucket.
      yield BucketListingBucket(
          bucket_url, root_object=self.gsutil_api.GetBucket(
              self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme,
              fields=bucket_fields))
    else:
      # Wildcarded bucket name: list all buckets in the project and filter
      # the names against the (fnmatch-translated) wildcard regex.
      regex = fnmatch.translate(self.wildcard_url.bucket_name)
      prog = re.compile(regex)

      fields = self._GetToListFields(bucket_fields)
      if fields:
        # 'items/id' is needed for the name match below.
        fields.add('items/id')
      for bucket in self.gsutil_api.ListBuckets(
          fields=fields, project_id=self.project_id,
          provider=self.wildcard_url.scheme):
        if prog.match(bucket.id):
          url = StorageUrlFromString(
              '%s://%s/' % (self.wildcard_url.scheme, bucket.id))
          yield BucketListingBucket(url, root_object=bucket)
353
354 def _GetToListFields(self, get_fields=None):
355 """Prepends 'items/' to the input fields and converts it to a set.
356
357 This way field sets requested for GetBucket can be used in ListBucket calls.
358 Note that the input set must contain only bucket or object fields; listing
359 fields such as prefixes or nextPageToken should be added after calling
360 this function.
361
362 Args:
363 get_fields: Iterable fields usable in GetBucket/GetObject calls.
364
365 Returns:
366 Set of fields usable in ListBuckets/ListObjects calls.
329 """ 367 """
330 for bucket_listing_ref in self. __iter__(): 368 if get_fields:
331 if bucket_listing_ref.HasKey(): 369 list_fields = set()
332 yield bucket_listing_ref.GetKey() 370 for field in get_fields:
371 list_fields.add('items/' + field)
372 return list_fields
333 373
334 def IterUris(self): 374 def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False):
375 """Creates a BucketListingRef of type OBJECT from the arguments.
376
377 Args:
378 bucket_url_string: Wildcardless string describing the containing bucket.
379 gcs_object: gsutil_api root Object for populating the BucketListingRef.
380 with_version: If true, return a reference with a versioned string.
381
382 Returns:
383 BucketListingRef of type OBJECT.
335 """ 384 """
336 Convenience iterator that runs underlying iterator and returns StorageUri 385 # Generation can be None in test mocks, so just return the
337 for each iteration. 386 # live object for simplicity.
387 if with_version and gcs_object.generation is not None:
388 generation_str = GenerationFromUrlAndString(self.wildcard_url,
389 gcs_object.generation)
390 object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name,
391 generation_str)
392 else:
393 object_string = '%s%s' % (bucket_url_string, gcs_object.name)
394 object_url = StorageUrlFromString(object_string)
395 return BucketListingObject(object_url, root_object=gcs_object)
396
397 def _GetPrefixRef(self, bucket_url_string, prefix):
398 """Creates a BucketListingRef of type PREFIX from the arguments.
399
400 Args:
401 bucket_url_string: Wildcardless string describing the containing bucket.
402 prefix: gsutil_api Prefix for populating the BucketListingRef
403
404 Returns:
405 BucketListingRef of type PREFIX.
406 """
407 prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix))
408 return BucketListingPrefix(prefix_url, root_object=prefix)
409
410 def IterBuckets(self, bucket_fields=None):
411 """Iterates over the wildcard, returning refs for each expanded bucket.
412
413 This ignores the object part of the URL entirely and expands only the
414 the bucket portion. It will yield BucketListingRefs of type BUCKET only.
415
416 Args:
417 bucket_fields: Iterable fields to include in bucket listings.
418 Ex. ['defaultObjectAcl', 'logging']. This function is
419 responsible for converting these to listing-style
420 format ['items/defaultObjectAcl', 'items/logging'], as
421 well as adding any fields necessary for listing such as
422 'items/id'. API implemenation is responsible for
423 adding pagination fields. If this is None, all fields are
424 returned.
338 425
339 Yields: 426 Yields:
340 StorageUri, or empty iterator if no matches. 427 BucketListingRef of type BUCKET, or empty iterator if no matches.
341 """ 428 """
342 for bucket_listing_ref in self. __iter__(): 429 for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields):
343 yield bucket_listing_ref.GetUri() 430 yield blr
344 431
345 def IterUrisForKeys(self): 432 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
346 """ 433 """Iterates over the wildcard, yielding bucket, prefix or object refs.
347 Convenience iterator that runs underlying iterator and returns the 434
348 StorageUri for each iterated BucketListingRef that has a Key. 435 Args:
436 bucket_listing_fields: If present, populate only these metadata
437 fields for listed objects.
438 expand_top_level_buckets: If true and the wildcard expands only to
439 Bucket(s), yields the expansion of each bucket
440 into a top-level listing of prefixes and objects
441 in that bucket instead of a BucketListingRef
442 to that bucket.
349 443
350 Yields: 444 Yields:
351 StorageUri, or empty iterator if no matches. 445 BucketListingRef, or empty iterator if no matches.
352 """ 446 """
353 for bucket_listing_ref in self. __iter__(): 447 for blr in self. __iter__(
354 if bucket_listing_ref.HasKey(): 448 bucket_listing_fields=bucket_listing_fields,
355 yield bucket_listing_ref.GetUri() 449 expand_top_level_buckets=expand_top_level_buckets):
450 yield blr
451
452 def IterObjects(self, bucket_listing_fields=None):
453 """Iterates over the wildcard, yielding only object BucketListingRefs.
454
455 Args:
456 bucket_listing_fields: If present, populate only these metadata
457 fields for listed objects.
458
459 Yields:
460 BucketListingRefs of type OBJECT or empty iterator if no matches.
461 """
462 for blr in self. __iter__(bucket_listing_fields=bucket_listing_fields,
463 expand_top_level_buckets=True):
464 if blr.IsObject():
465 yield blr
356 466
357 467
358 class FileWildcardIterator(WildcardIterator): 468 class FileWildcardIterator(WildcardIterator):
359 """WildcardIterator subclass for files and directories. 469 """WildcardIterator subclass for files and directories.
360 470
361 If you use recursive wildcards ('**') only a single such wildcard is 471 If you use recursive wildcards ('**') only a single such wildcard is
362 supported. For example you could use the wildcard '**/*.txt' to list all .txt 472 supported. For example you could use the wildcard '**/*.txt' to list all .txt
363 files in any subdirectory of the current directory, but you couldn't use a 473 files in any subdirectory of the current directory, but you couldn't use a
364 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt 474 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt
365 files in any subdirectory named 'abc'). 475 files in any subdirectory named 'abc').
366 """ 476 """
367 477
368 def __init__(self, wildcard_uri, headers=None, debug=0): 478 def __init__(self, wildcard_url, debug=0):
369 """ 479 """Instantiates an iterator over BucketListingRefs matching wildcard URL.
370 Instantiate an iterator over BucketListingRefs matching given wildcard URI.
371 480
372 Args: 481 Args:
373 wildcard_uri: StorageUri that contains the wildcard to iterate. 482 wildcard_url: FileUrl that contains the wildcard to iterate.
374 headers: Dictionary containing optional HTTP headers to pass to boto. 483 debug: Debug level (range 0..3).
375 debug: Debug level to pass in to boto connection (range 0..3).
376 """ 484 """
377 self.wildcard_uri = wildcard_uri 485 self.wildcard_url = wildcard_url
378 self.headers = headers
379 self.debug = debug 486 self.debug = debug
380 487
381 def __iter__(self): 488 def __iter__(self):
382 wildcard = self.wildcard_uri.object_name 489 """Iterator that gets called when iterating over the file wildcard.
383 match = re.search('\*\*', wildcard) 490
491 In the case where no wildcard is present, returns a single matching file
492 or directory.
493
494 Raises:
495 WildcardException: if invalid wildcard found.
496
497 Yields:
498 BucketListingRef of type OBJECT (for files) or PREFIX (for directories)
499 """
500 wildcard = self.wildcard_url.object_name
501 match = FLAT_LIST_REGEX.match(wildcard)
384 if match: 502 if match:
385 # Recursive wildcarding request ('.../**/...'). 503 # Recursive wildcarding request ('.../**/...').
386 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*' 504 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*'
387 base_dir = wildcard[:match.start()-1] 505 base_dir = match.group('before')[:-1]
388 remaining_wildcard = wildcard[match.start()+2:] 506 remaining_wildcard = match.group('after')
389 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and 507 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and
390 # remaining_wildcard = '/*' 508 # remaining_wildcard = '/*'
391 if remaining_wildcard.startswith('*'): 509 if remaining_wildcard.startswith('*'):
392 raise WildcardException('Invalid wildcard with more than 2 consecutive ' 510 raise WildcardException('Invalid wildcard with more than 2 consecutive '
393 '*s (%s)' % wildcard) 511 '*s (%s)' % wildcard)
394 # If there was no remaining wildcard past the recursive wildcard, 512 # If there was no remaining wildcard past the recursive wildcard,
395 # treat it as if it were a '*'. For example, file://tmp/** is equivalent 513 # treat it as if it were a '*'. For example, file://tmp/** is equivalent
396 # to file://tmp/**/* 514 # to file://tmp/**/*
397 if not remaining_wildcard: 515 if not remaining_wildcard:
398 remaining_wildcard = '*' 516 remaining_wildcard = '*'
399 # Skip slash(es). 517 # Skip slash(es).
400 remaining_wildcard = remaining_wildcard.lstrip(os.sep) 518 remaining_wildcard = remaining_wildcard.lstrip(os.sep)
401 filepaths = self._iter_dir(base_dir, remaining_wildcard) 519 filepaths = self._IterDir(base_dir, remaining_wildcard)
402 else: 520 else:
403 # Not a recursive wildcarding request. 521 # Not a recursive wildcarding request.
404 filepaths = glob.iglob(wildcard) 522 filepaths = glob.iglob(wildcard)
405 for filepath in filepaths: 523 for filepath in filepaths:
406 expanded_uri = self.wildcard_uri.clone_replace_name(filepath) 524 expanded_url = StorageUrlFromString(filepath)
407 yield BucketListingRef(expanded_uri) 525 if os.path.isdir(filepath):
526 yield BucketListingPrefix(expanded_url)
527 else:
528 yield BucketListingObject(expanded_url)
408 529
409 def _iter_dir(self, dir, wildcard): 530 def _IterDir(self, directory, wildcard):
410 """An iterator over the specified dir and wildcard.""" 531 """An iterator over the specified dir and wildcard."""
411 for dirpath, unused_dirnames, filenames in os.walk(dir): 532 # UTF8-encode directory before passing it to os.walk() so if there are
533 # non-valid UTF8 chars in the file name (e.g., that can happen if the file
534 # originated on Windows) os.walk() will not attempt to decode and then die
535 # with a "codec can't decode byte" error, and instead we can catch the error
536 # at yield time and print a more informative error message.
537 for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)):
412 for f in fnmatch.filter(filenames, wildcard): 538 for f in fnmatch.filter(filenames, wildcard):
413 yield os.path.join(dirpath, f) 539 try:
540 yield os.path.join(dirpath, f).decode(UTF8)
541 except UnicodeDecodeError:
542 # Note: We considered several ways to deal with this, but each had
543 # problems:
544 # 1. Raise an exception and try to catch in a higher layer (the
545 # gsutil cp command), so we can properly support the gsutil cp -c
546 # option. That doesn't work because raising an exception during
547 # iteration terminates the generator.
548 # 2. Accumulate a list of bad filenames and skip processing each
549 # during iteration, then raise at the end, with exception text
550 # printing the bad paths. That doesn't work because iteration is
551 # wrapped in PluralityCheckableIterator, so it's possible there
552 # are not-yet-performed copy operations at the time we reach the
553 # end of the iteration and raise the exception - which would cause
554 # us to skip copying validly named files. Moreover, the gsutil
555 # cp command loops over argv, so if you run the command gsutil cp
556 # -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1
557 # would cause dir2 never to be visited.
558 # 3. Print the invalid pathname and skip it during iteration. That
559 # would work but would mean gsutil cp could exit with status 0
560 # even though some files weren't copied.
561 # 4. Change the WildcardIterator to include an error status along with
562 # the result. That would solve the problem but would be a
563 # substantial change (WildcardIterator is used in many parts of
564 # gsutil), and we didn't feel that magnitude of change was
565 # warranted by this relatively uncommon corner case.
566 # Instead we chose to abort when one such file is encountered, and
567 # require the user to remove or rename the files and try again.
568 raise CommandException('\n'.join(textwrap.wrap(
569 'Invalid Unicode path encountered (%s). gsutil cannot proceed '
570 'with such files present. Please remove or rename this file and '
571 'try again.' % repr(os.path.join(dirpath, f)))))
414 572
415 def IterKeys(self): 573 # pylint: disable=unused-argument
574 def IterObjects(self, bucket_listing_fields=None):
575 """Iterates over the wildcard, yielding only object (file) refs.
576
577 Args:
578 bucket_listing_fields: Ignored as filesystems don't have buckets.
579
580 Yields:
581 BucketListingRefs of type OBJECT or empty iterator if no matches.
416 """ 582 """
417 Placeholder to allow polymorphic use of WildcardIterator. 583 for bucket_listing_ref in self.IterAll():
584 if bucket_listing_ref.IsObject():
585 yield bucket_listing_ref
586
587 # pylint: disable=unused-argument
588 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False):
589 """Iterates over the wildcard, yielding BucketListingRefs.
590
591 Args:
592 bucket_listing_fields: Ignored; filesystems don't have buckets.
593 expand_top_level_buckets: Ignored; filesystems don't have buckets.
594
595 Yields:
596 BucketListingRefs of type OBJECT (file) or PREFIX (directory),
597 or empty iterator if no matches.
598 """
599 for bucket_listing_ref in self.__iter__():
600 yield bucket_listing_ref
601
602 def IterBuckets(self, unused_bucket_fields=None):
603 """Placeholder to allow polymorphic use of WildcardIterator.
604
605 Args:
606 unused_bucket_fields: Ignored; filesystems don't have buckets.
418 607
419 Raises: 608 Raises:
420 WildcardException: in all cases. 609 WildcardException: in all cases.
421 """ 610 """
422 raise WildcardException( 611 raise WildcardException(
423 'Iterating over Keys not possible for file wildcards') 612 'Iterating over Buckets not possible for file wildcards')
424
425 def IterUris(self):
426 """
427 Convenience iterator that runs underlying iterator and returns StorageUri
428 for each iteration.
429
430 Yields:
431 StorageUri, or empty iterator if no matches.
432 """
433 for bucket_listing_ref in self. __iter__():
434 yield bucket_listing_ref.GetUri()
435 613
436 614
437 class WildcardException(StandardError): 615 class WildcardException(StandardError):
438 """Exception thrown for invalid wildcard URIs.""" 616 """Exception raised for invalid wildcard URLs."""
439 617
440 def __init__(self, reason): 618 def __init__(self, reason):
441 StandardError.__init__(self) 619 StandardError.__init__(self)
442 self.reason = reason 620 self.reason = reason
443 621
444 def __repr__(self): 622 def __repr__(self):
445 return 'WildcardException: %s' % self.reason 623 return 'WildcardException: %s' % self.reason
446 624
447 def __str__(self): 625 def __str__(self):
448 return 'WildcardException: %s' % self.reason 626 return 'WildcardException: %s' % self.reason
449 627
450 628
451 def wildcard_iterator(uri_or_str, proj_id_handler, 629 def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0,
452 bucket_storage_uri_class=BucketStorageUri, 630 project_id=None):
453 all_versions=False, 631 """Instantiate a WildcardIterator for the given URL string.
454 headers=None, debug=0):
455 """Instantiate a WildCardIterator for the given StorageUri.
456 632
457 Args: 633 Args:
458 uri_or_str: StorageUri or URI string naming wildcard objects to iterate. 634 url_str: URL string naming wildcard object(s) to iterate.
459 proj_id_handler: ProjectIdHandler to use for current command. 635 gsutil_api: Cloud storage interface. Passed in for thread safety, also
460 bucket_storage_uri_class: BucketStorageUri interface. 636 settable for testing/mocking.
461 Settable for testing/mocking. 637 all_versions: If true, the iterator yields all versions of objects
462 headers: Dictionary containing optional HTTP headers to pass to boto. 638 matching the wildcard. If false, yields just the live
463 debug: Debug level to pass in to boto connection (range 0..3). 639 object version.
640 debug: Debug level to control debug output for iterator.
641 project_id: Project id to use for bucket listings.
464 642
465 Returns: 643 Returns:
466 A WildcardIterator that handles the requested iteration. 644 A WildcardIterator that handles the requested iteration.
467 """ 645 """
468 646
469 if isinstance(uri_or_str, basestring): 647 url = StorageUrlFromString(url_str)
470 # Disable enforce_bucket_naming, to allow bucket names containing wildcard 648 if url.IsFileUrl():
471 # chars. 649 return FileWildcardIterator(url, debug=debug)
472 uri = boto.storage_uri( 650 else: # Cloud URL
473 uri_or_str, debug=debug, validate=False,
474 bucket_storage_uri_class=bucket_storage_uri_class,
475 suppress_consec_slashes=False)
476 else:
477 uri = uri_or_str
478
479 if uri.is_cloud_uri():
480 return CloudWildcardIterator( 651 return CloudWildcardIterator(
481 uri, proj_id_handler, 652 url, gsutil_api, all_versions=all_versions, debug=debug,
482 bucket_storage_uri_class=bucket_storage_uri_class, 653 project_id=project_id)
483 all_versions=all_versions,
484 headers=headers,
485 debug=debug)
486 elif uri.is_file_uri():
487 return FileWildcardIterator(uri, headers=headers, debug=debug)
488 else:
489 raise WildcardException('Unexpected type of StorageUri (%s)' % uri)
490
491
492 def ContainsWildcard(uri_or_str):
493 """Checks whether uri_or_str contains a wildcard.
494
495 Args:
496 uri_or_str: StorageUri or URI string to check.
497
498 Returns:
499 bool indicator.
500 """
501 if isinstance(uri_or_str, basestring):
502 return bool(WILDCARD_REGEX.search(uri_or_str))
503 else:
504 return bool(WILDCARD_REGEX.search(uri_or_str.uri))
OLDNEW
« no previous file with comments | « gslib/util.py ('k') | gsutil » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698