Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(189)

Side by Side Diff: third_party/gsutil/gslib/name_expansion.py

Issue 2280023003: depot_tools: Remove third_party/gsutil (Closed)
Patch Set: Created 4 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 # Copyright 2012 Google Inc. All Rights Reserved.
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 # http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 import copy
16 import threading
17 import wildcard_iterator
18
19 from bucket_listing_ref import BucketListingRef
20 from gslib.exception import CommandException
21 from gslib.plurality_checkable_iterator import PluralityCheckableIterator
22 from gslib.storage_uri_builder import StorageUriBuilder
23 from wildcard_iterator import ContainsWildcard
24
25 """
26 Name expansion support for the various ways gsutil lets users refer to
27 collections of data (via explicit wildcarding as well as directory,
28 bucket, and bucket subdir implicit wildcarding). This class encapsulates
29 the various rules for determining how these expansions are done.
30 """
31
32
class NameExpansionResult(object):
  """Holds one fully expanded result from iterating over NameExpansionIterator.

  The member data in this class need to be pickleable because
  NameExpansionResult instances are passed through Multiprocessing.Queue. In
  particular, don't include any boto state like StorageUri, since that pulls
  in a big tree of objects, some of which aren't pickleable (and even if
  they were, pickling/unpickling such a large object tree would result in
  significant overhead).

  The state held in this object is needed for handling the various naming cases
  (e.g., copying from a single source URI to a directory generates different
  dest URI names than copying multiple URIs to a directory, to be consistent
  with naming rules used by the Unix cp command). For more details see comments
  in _NameExpansionIterator.
  """

  def __init__(self, src_uri_str, is_multi_src_request,
               src_uri_expands_to_multi, names_container, expanded_uri_str,
               have_existing_dst_container=None, is_latest=False):
    """Stores the expansion state; every argument is kept as given.

    Args:
      src_uri_str: string representation of StorageUri that was expanded.
      is_multi_src_request: bool indicator whether the request as a whole has
          more than one source, i.e. more than one URI string was supplied or
          the current one expanded to more than 1 BucketListingRef.
      src_uri_expands_to_multi: bool indicator whether the current src_uri
          expanded to more than 1 BucketListingRef.
      names_container: Bool indicator whether src_uri names a container.
      expanded_uri_str: string representation of StorageUri to which
          src_uri_str expands.
      have_existing_dst_container: bool indicator whether this is a copy
          request to an existing bucket, bucket subdir, or directory. Default
          None value should be used in cases where this is not needed
          (commands other than cp).
      is_latest: Bool indicating that the result represents the object's
          current version.
    """
    self.src_uri_str = src_uri_str
    self.is_multi_src_request = is_multi_src_request
    self.src_uri_expands_to_multi = src_uri_expands_to_multi
    self.names_container = names_container
    self.expanded_uri_str = expanded_uri_str
    self.have_existing_dst_container = have_existing_dst_container
    self.is_latest = is_latest

  def __repr__(self):
    return '%s' % self.expanded_uri_str

  def IsEmpty(self):
    """Returns True if name expansion yielded no matches."""
    # This used to test self.expanded_blr, an attribute that is never assigned
    # on this class, so every call raised AttributeError. The expanded URI
    # string is the only expansion output stored here, so test that instead.
    return self.expanded_uri_str is None

  def GetSrcUriStr(self):
    """Returns the string representation of the StorageUri that was expanded."""
    return self.src_uri_str

  def IsMultiSrcRequest(self):
    """Returns bool indicator whether the request has more than one source.

    That is, whether more than one URI string was supplied or the current one
    expanded to more than 1 BucketListingRef.
    """
    return self.is_multi_src_request

  def SrcUriExpandsToMulti(self):
    """Returns bool indicator whether the current src_uri expanded to more than
    1 BucketListingRef.
    """
    return self.src_uri_expands_to_multi

  def NamesContainer(self):
    """Returns bool indicator of whether src_uri names a directory, bucket, or
    bucket subdir.
    """
    return self.names_container

  def GetExpandedUriStr(self):
    """Returns the string representation of StorageUri to which src_uri_str
    expands.
    """
    return self.expanded_uri_str

  def HaveExistingDstContainer(self):
    """Returns bool indicator whether this is a copy request to an
    existing bucket, bucket subdir, or directory, or None if not
    relevant."""
    return self.have_existing_dst_container
123
124
class _NameExpansionIterator(object):
  """Iterates over all src_uris, expanding wildcards, object-less bucket names,
  subdir bucket names, and directory names, generating a flat listing of all
  the matching objects/files.

  You should instantiate this object using the static factory function
  NameExpansionIterator, because consumers of this iterator need the
  PluralityCheckableIterator wrapper built by that function.

  Yields:
    gslib.name_expansion.NameExpansionResult.

  Raises:
    CommandException: if errors encountered (in particular when one of the
        URI strings matches nothing).
  """

  def __init__(self, command_name, proj_id_handler, headers, debug,
               bucket_storage_uri_class, uri_strs, recursion_requested,
               have_existing_dst_container=None, flat=True,
               all_versions=False, for_all_version_delete=False):
    """
    Args:
      command_name: name of command being run.
      proj_id_handler: ProjectIdHandler to use for current command.
      headers: Dictionary containing optional HTTP headers to pass to boto.
      debug: Debug level to pass in to boto connection (range 0..3).
      bucket_storage_uri_class: Class to instantiate for cloud StorageUris.
          Settable for testing/mocking.
      uri_strs: PluralityCheckableIterator of URI strings needing expansion.
      recursion_requested: True if -R specified on command-line.
      have_existing_dst_container: Bool indicator whether this is a copy
          request to an existing bucket, bucket subdir, or directory. Default
          None value should be used in cases where this is not needed
          (commands other than cp).
      flat: Bool indicating whether bucket listings should be flattened, i.e.,
          so the mapped-to results contain objects spanning subdirectories.
      all_versions: Bool indicating whether to iterate over all object
          versions.
      for_all_version_delete: Bool indicating whether this is for an
          all-version delete. Accepted for interface compatibility; nothing
          in this class reads it.

    Examples of _NameExpansionIterator with flat=True:
      - Calling with one of the uri_strs being 'gs://bucket' will enumerate all
        top-level objects, as will 'gs://bucket/' and 'gs://bucket/*'.
      - 'gs://bucket/**' will enumerate all objects in the bucket.
      - 'gs://bucket/abc' will enumerate all next-level objects under directory
        abc (i.e., not including subdirectories of abc) if gs://bucket/abc/*
        matches any objects; otherwise it will enumerate the single name
        gs://bucket/abc
      - 'gs://bucket/abc/**' will enumerate all objects under abc or any of its
        subdirectories.
      - 'file:///tmp' will enumerate all files under /tmp, as will
        'file:///tmp/*'
      - 'file:///tmp/**' will enumerate all files under /tmp or any of its
        subdirectories.

    NOTE(review): with flat=True the implicit container wildcard used in step 3
    of __iter__ is '**', so the "top-level objects" / "next-level objects"
    wording above looks stale -- confirm against wildcard_iterator.

    Example if flat=False: calling with gs://bucket/abc/* lists matching objects
    or subdirs, but not sub-subdirs or objects beneath subdirs.

    Note: In step-by-step comments below we give examples assuming there's a
    gs://bucket with object paths:
      abcd/o1.txt
      abcd/o2.txt
      xyz/o1.txt
      xyz/o2.txt
    and a directory file://dir with file paths:
      dir/a.txt
      dir/b.txt
      dir/c/
    """
    self.command_name = command_name
    self.proj_id_handler = proj_id_handler
    self.headers = headers
    self.debug = debug
    self.bucket_storage_uri_class = bucket_storage_uri_class
    self.suri_builder = StorageUriBuilder(debug, bucket_storage_uri_class)
    self.uri_strs = uri_strs
    self.recursion_requested = recursion_requested
    self.have_existing_dst_container = have_existing_dst_container
    self.flat = flat
    self.all_versions = all_versions

    # Map holding wildcard strings to use for flat vs subdir-by-subdir listings.
    # (A flat listing means show all objects expanded all the way down.)
    self._flatness_wildcard = {True: '**', False: '*'}

  def __iter__(self):
    for uri_str in self.uri_strs:
      # Step 1: Expand any explicitly specified wildcards. The output from this
      # step is an iterator of BucketListingRef.
      # Starting with gs://buck*/abc* this step would expand to gs://bucket/abcd
      if ContainsWildcard(uri_str):
        post_step1_iter = self._WildcardIterator(uri_str)
      else:
        suri = self.suri_builder.StorageUri(uri_str)
        post_step1_iter = iter([BucketListingRef(suri)])
      post_step1_iter = PluralityCheckableIterator(post_step1_iter)

      # Step 2: Expand bucket subdirs and versions. The output from this
      # step is an iterator of (names_container, BucketListingRef).
      # Starting with gs://bucket/abcd this step would expand to:
      #   iter([(True, abcd/o1.txt), (True, abcd/o2.txt)]).
      if self.flat and self.recursion_requested:
        post_step2_iter = _ImplicitBucketSubdirIterator(
            self, post_step1_iter, self.flat)
      elif self.all_versions:
        post_step2_iter = _AllVersionIterator(
            self, post_step1_iter, headers=self.headers)
      else:
        post_step2_iter = _NonContainerTuplifyIterator(post_step1_iter)
      post_step2_iter = PluralityCheckableIterator(post_step2_iter)

      # Step 3. Expand directories and buckets. This step yields the iterated
      # values. Starting with gs://bucket this step would expand to:
      #   [abcd/o1.txt, abcd/o2.txt, xyz/o1.txt, xyz/o2.txt]
      # Starting with file://dir this step would expand to:
      #   [dir/a.txt, dir/b.txt, dir/c/]
      wc = self._flatness_wildcard[self.flat]
      src_uri_expands_to_multi = (post_step1_iter.has_plurality()
                                  or post_step2_iter.has_plurality())
      is_multi_src_request = (self.uri_strs.has_plurality()
                              or src_uri_expands_to_multi)

      if post_step2_iter.is_empty():
        raise CommandException('No URIs matched: %s' % uri_str)
      for (names_container, blr) in post_step2_iter:
        # Plain object/file (or, for non-flat listings, anything that is not a
        # prefix): emit it directly.
        if (not blr.GetUri().names_container()
            and (self.flat or not blr.HasPrefix())):
          yield NameExpansionResult(uri_str, is_multi_src_request,
                                    src_uri_expands_to_multi, names_container,
                                    blr.GetUriString(),
                                    self.have_existing_dst_container,
                                    is_latest=blr.IsLatest())
          continue
        # Containers are only descended into when -R was requested.
        if not self.recursion_requested:
          if blr.GetUri().is_file_uri():
            desc = 'directory'
          else:
            desc = 'bucket'
          # Parenthesised single-argument form: prints the same text as the
          # print statement on Python 2 and also parses on Python 3.
          print('Omitting %s "%s". (Did you mean to do %s -R?)' % (
              desc, blr.GetUri(), self.command_name))
          continue
        if blr.GetUri().is_file_uri():
          # Convert dir to implicit recursive wildcard.
          uri_to_iterate = '%s/%s' % (blr.GetUriString(), wc)
        else:
          # Convert bucket to implicit recursive wildcard.
          uri_to_iterate = blr.GetUri().clone_replace_name(wc)
        wc_iter = PluralityCheckableIterator(
            self._WildcardIterator(uri_to_iterate))
        # The container expansion may itself turn a single source into many,
        # so refresh both plurality flags before emitting its results.
        src_uri_expands_to_multi = (src_uri_expands_to_multi
                                    or wc_iter.has_plurality())
        is_multi_src_request = (self.uri_strs.has_plurality()
                                or src_uri_expands_to_multi)
        for exp_blr in wc_iter:
          yield NameExpansionResult(uri_str, is_multi_src_request,
                                    src_uri_expands_to_multi, True,
                                    exp_blr.GetUriString(),
                                    self.have_existing_dst_container,
                                    is_latest=exp_blr.IsLatest())

  def _WildcardIterator(self, uri_or_str):
    """Helper to instantiate gslib.WildcardIterator.

    Args are same as gslib.WildcardIterator interface, but this method fills
    in most of the values from instance state.

    Args:
      uri_or_str: StorageUri or URI string naming wildcard objects to iterate.

    Returns:
      Whatever wildcard_iterator.wildcard_iterator() returns for the URI.
    """
    return wildcard_iterator.wildcard_iterator(
        uri_or_str, self.proj_id_handler,
        bucket_storage_uri_class=self.bucket_storage_uri_class,
        headers=self.headers, debug=self.debug,
        all_versions=self.all_versions)
301
302
def NameExpansionIterator(command_name, proj_id_handler, headers, debug,
                          bucket_storage_uri_class, uri_strs,
                          recursion_requested,
                          have_existing_dst_container=None, flat=True,
                          all_versions=False,
                          for_all_version_delete=False):
  """Static factory function for instantiating _NameExpansionIterator.

  Wraps uri_strs in a PluralityCheckableIterator (so callers may pass either
  an array or an iterator), builds the _NameExpansionIterator from it, wraps
  that in a second PluralityCheckableIterator and checks that the result is
  non-empty.

  Args:
    command_name: name of command being run.
    proj_id_handler: ProjectIdHandler to use for current command.
    headers: Dictionary containing optional HTTP headers to pass to boto.
    debug: Debug level to pass in to boto connection (range 0..3).
    bucket_storage_uri_class: Class to instantiate for cloud StorageUris.
        Settable for testing/mocking.
    uri_strs: array or iterator of URI strings needing expansion.
    recursion_requested: True if -R specified on command-line.
    have_existing_dst_container: Bool indicator whether this is a copy
        request to an existing bucket, bucket subdir, or directory. Default
        None value should be used in cases where this is not needed (commands
        other than cp).
    flat: Bool indicating whether bucket listings should be flattened, i.e.,
        so the mapped-to results contain objects spanning subdirectories.
    all_versions: Bool indicating whether to iterate over all object versions.
    for_all_version_delete: Bool indicating whether this is for an all-version
        delete.

  Returns:
    PluralityCheckableIterator over the NameExpansionResult objects produced
    by _NameExpansionIterator.

  Raises:
    CommandException: if the expansion yields nothing.

  All arguments are forwarded unchanged to _NameExpansionIterator; see its
  __init__ docstring for worked examples of how flat=True / flat=False
  expansions behave.
  """
  checkable_uri_strs = PluralityCheckableIterator(uri_strs)
  expansion = PluralityCheckableIterator(
      _NameExpansionIterator(
          command_name, proj_id_handler, headers, debug,
          bucket_storage_uri_class, checkable_uri_strs, recursion_requested,
          have_existing_dst_container=have_existing_dst_container,
          flat=flat,
          all_versions=all_versions,
          for_all_version_delete=for_all_version_delete))
  if expansion.is_empty():
    raise CommandException('No URIs matched')
  return expansion
372
373
class NameExpansionIteratorQueue(object):
  """Wrapper around NameExpansionIterator that provides a
  Multiprocessing.Queue facade.

  Only a blocking get() function can be called, and the block and timeout
  params on that function are ignored. All other class functions raise
  NotImplementedError.

  This class is thread safe: get() serialises access to the wrapped iterator
  with a lock.
  """

  def __init__(self, name_expansion_iterator, final_value):
    """
    Args:
      name_expansion_iterator: iterator to drain; must provide is_empty() in
          addition to the iterator protocol.
      final_value: value returned by get() once the iterator is exhausted.
    """
    self.name_expansion_iterator = name_expansion_iterator
    self.final_value = final_value
    self.lock = threading.Lock()

  def qsize(self):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.qsize() not implemented")

  def empty(self):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.empty() not implemented")

  def full(self):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.full() not implemented")

  def put(self, obj=None, block=None, timeout=None):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.put() not implemented")

  def put_nowait(self, obj):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.put_nowait() not implemented")

  def get(self, block=None, timeout=None):
    """Returns the next expansion result, or final_value when exhausted.

    Args:
      block: ignored.
      timeout: ignored.
    """
    # The emptiness check and the fetch must happen under the same lock so
    # that two callers cannot both see "not empty" and race for the last
    # item. The with-statement releases the lock on every exit path.
    with self.lock:
      if self.name_expansion_iterator.is_empty():
        return self.final_value
      return next(self.name_expansion_iterator)

  def get_nowait(self):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.get_nowait() not implemented")

  def get_no_wait(self):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.get_no_wait() not implemented")

  def close(self):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.close() not implemented")

  def join_thread(self):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.join_thread() not implemented")

  def cancel_join_thread(self):
    raise NotImplementedError(
        "NameExpansionIteratorQueue.cancel_join_thread() not implemented")
439
440
441 class _NonContainerTuplifyIterator(object):
442 """
443 Iterator that produces the tuple (False, blr) for each iteration
444 of blr_iter. Used for cases where blr_iter iterates over a set of
445 BucketListingRefs known not to name containers.
446 """
447
448 def __init__(self, blr_iter):
449 """
450 Args:
451 blr_iter: iterator of BucketListingRef.
452 """
453 self.blr_iter = blr_iter
454
455 def __iter__(self):
456 for blr in self.blr_iter:
457 yield (False, blr)
458
459
460 class _ImplicitBucketSubdirIterator(object):
461
462 """
463 Iterator wrapper that iterates over blr_iter, performing implicit bucket
464 subdir expansion.
465
466 Each iteration yields tuple (names_container, expanded BucketListingRefs)
467 where names_container is true if URI names a directory, bucket,
468 or bucket subdir (vs how StorageUri.names_container() doesn't
469 handle latter case).
470
471 For example, iterating over [BucketListingRef("gs://abc")] would expand to:
472 [BucketListingRef("gs://abc/o1"), BucketListingRef("gs://abc/o2")]
473 if those subdir objects exist, and [BucketListingRef("gs://abc") otherwise.
474 """
475
476 def __init__(self, name_expansion_instance, blr_iter, flat):
477 """
478 Args:
479 name_expansion_instance: calling instance of NameExpansion class.
480 blr_iter: iterator of BucketListingRef.
481 flat: bool indicating whether bucket listings should be flattened, i.e.,
482 so the mapped-to results contain objects spanning subdirectories.
483 """
484 self.blr_iter = blr_iter
485 self.name_expansion_instance = name_expansion_instance
486 self.flat = flat
487
488 def __iter__(self):
489 for blr in self.blr_iter:
490 uri = blr.GetUri()
491 if uri.names_object():
492 # URI could be a bucket subdir.
493 implicit_subdir_iterator = PluralityCheckableIterator(
494 self.name_expansion_instance._WildcardIterator(
495 self.name_expansion_instance.suri_builder.StorageUri(
496 '%s/%s' % (uri.uri.rstrip('/'),
497 self.name_expansion_instance._flatness_wildcard[
498 self.flat]))))
499 if not implicit_subdir_iterator.is_empty():
500 for exp_blr in implicit_subdir_iterator:
501 yield (True, exp_blr)
502 else:
503 yield (False, blr)
504 else:
505 yield (False, blr)
506
class _AllVersionIterator(object):
  """
  Iterator wrapper that iterates over blr_iter, performing implicit version
  expansion.

  Output behavior is identical to that in _ImplicitBucketSubdirIterator above:
  each iteration yields a (names_container, BucketListingRef) tuple.

  For example, iterating over [BucketListingRef("gs://abc/o1")] would expand to:
    [BucketListingRef("gs://abc/o1#1234"), BucketListingRef("gs://abc/o1#1235")]
  """

  def __init__(self, name_expansion_instance, blr_iter, headers=None):
    """
    Args:
      name_expansion_instance: calling instance of NameExpansion class.
      blr_iter: iterator of BucketListingRef.
      headers: passed as the headers argument of uri.list_bucket() when
          listing object versions.
    """
    self.blr_iter = blr_iter
    self.name_expansion_instance = name_expansion_instance
    self.headers = headers

  def __iter__(self):
    # NOTE(review): `empty` is initialised once here and never reset to True,
    # so it tracks "nothing yielded so far by this iterator", not "nothing
    # yielded for the current blr" -- confirm that is intended.
    empty = True
    for blr in self.blr_iter:
      uri = blr.GetUri()
      if not uri.names_object():
        # URI does not name an object: yield it flagged as a container and
        # stop iterating blr_iter altogether.
        empty = False
        yield (True, blr)
        break
      # List with the object name as prefix, asking for all versions.
      for key in uri.list_bucket(
          prefix=uri.object_name, headers=self.headers, all_versions=True):
        if key.name != uri.object_name:
          # The desired entries will be alphabetically first in this listing.
          break
        version_blr = BucketListingRef(uri.clone_replace_key(key), key=key)
        empty = False
        yield (False, version_blr)
      # If no version exists, yield the unversioned blr, and let the consuming
      # operation fail. This mirrors behavior in _ImplicitBucketSubdirIterator.
      if empty:
        yield (False, blr)
550
OLDNEW
« no previous file with comments | « third_party/gsutil/gslib/help_provider.py ('k') | third_party/gsutil/gslib/no_op_auth_plugin.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698