| OLD | NEW |
| 1 # -*- coding: utf-8 -*- |
| 1 # Copyright 2010 Google Inc. All Rights Reserved. | 2 # Copyright 2010 Google Inc. All Rights Reserved. |
| 2 # | 3 # |
| 3 # Permission is hereby granted, free of charge, to any person obtaining a | 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 # copy of this software and associated documentation files (the | 5 # you may not use this file except in compliance with the License. |
| 5 # "Software"), to deal in the Software without restriction, including | 6 # You may obtain a copy of the License at |
| 6 # without limitation the rights to use, copy, modify, merge, publish, dis- | |
| 7 # tribute, sublicense, and/or sell copies of the Software, and to permit | |
| 8 # persons to whom the Software is furnished to do so, subject to the fol- | |
| 9 # lowing conditions: | |
| 10 # | 7 # |
| 11 # The above copyright notice and this permission notice shall be included | 8 # http://www.apache.org/licenses/LICENSE-2.0 |
| 12 # in all copies or substantial portions of the Software. | |
| 13 # | 9 # |
| 14 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS | 10 # Unless required by applicable law or agreed to in writing, software |
| 15 # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABIL- | 11 # distributed under the License is distributed on an "AS IS" BASIS, |
| 16 # ITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT | 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 17 # SHALL THE AUTHOR BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | 13 # See the License for the specific language governing permissions and |
| 18 # WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | 14 # limitations under the License. |
| 19 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS | 15 """Wildcard iterator class and supporting functions.""" |
| 20 # IN THE SOFTWARE. | 16 |
| 21 | 17 from __future__ import absolute_import |
| 22 """Implementation of wildcarding over StorageUris. | 18 |
| 23 | |
| 24 StorageUri is an abstraction that Google introduced in the boto library, | |
| 25 for representing storage provider-independent bucket and object names with | |
| 26 a shorthand URI-like syntax (see boto/boto/storage_uri.py) The current | |
| 27 class provides wildcarding support for StorageUri objects (including both | |
| 28 bucket and file system objects), allowing one to express collections of | |
| 29 objects with syntax like the following: | |
| 30 gs://mybucket/images/*.png | |
| 31 file:///tmp/???abc??? | |
| 32 | |
| 33 We provide wildcarding support as part of gsutil rather than as part | |
| 34 of boto because wildcarding is really part of shell command-like | |
| 35 functionality. | |
| 36 | |
| 37 A comment about wildcard semantics: We support both single path component | |
| 38 wildcards (e.g., using '*') and recursive wildcards (using '**'), for both | |
| 39 file and cloud URIs. For example, | |
| 40 gs://bucket/doc/*/*.html | |
| 41 would enumerate HTML files one directory down from gs://bucket/doc, while | |
| 42 gs://bucket/**/*.html | |
| 43 would enumerate HTML files in all objects contained in the bucket. | |
| 44 | |
| 45 Note also that if you use file system wildcards it's likely your shell | |
| 46 interprets the wildcarding before passing the command to gsutil. For example: | |
| 47 % gsutil cp /opt/eclipse/*/*.html gs://bucket/eclipse | |
| 48 would likely be expanded by the shell into the following before running gsutil: | |
| 49 % gsutil cp /opt/eclipse/RUNNING.html gs://bucket/eclipse | |
| 50 | |
| 51 Note also that most shells don't support '**' wildcarding (I think only | |
| 52 zsh does). If you want to use '**' wildcarding with such a shell you can | |
| 53 single quote each wildcarded string, so it gets passed uninterpreted by the | |
| 54 shell to gsutil (at which point gsutil will perform the wildcarding expansion): | |
| 55 % gsutil cp '/opt/eclipse/**/*.html' gs://bucket/eclipse | |
| 56 """ | |
| 57 | |
| 58 import boto | |
| 59 import fnmatch | 19 import fnmatch |
| 60 import glob | 20 import glob |
| 61 import os | 21 import os |
| 62 import re | 22 import re |
| 63 import sys | 23 import sys |
| 64 import urllib | 24 import textwrap |
| 65 | 25 |
| 66 from boto.s3.prefix import Prefix | 26 from gslib.bucket_listing_ref import BucketListingBucket |
| 67 from boto.storage_uri import BucketStorageUri | 27 from gslib.bucket_listing_ref import BucketListingObject |
| 68 from bucket_listing_ref import BucketListingRef | 28 from gslib.bucket_listing_ref import BucketListingPrefix |
| 69 | 29 from gslib.cloud_api import AccessDeniedException |
| 70 # Regex to determine if a string contains any wildcards. | 30 from gslib.cloud_api import CloudApi |
| 71 WILDCARD_REGEX = re.compile('[*?\[\]]') | 31 from gslib.cloud_api import NotFoundException |
| 72 | 32 from gslib.exception import CommandException |
| 73 WILDCARD_OBJECT_ITERATOR = 'wildcard_object_iterator' | 33 from gslib.storage_url import ContainsWildcard |
| 74 WILDCARD_BUCKET_ITERATOR = 'wildcard_bucket_iterator' | 34 from gslib.storage_url import StorageUrlFromString |
| 35 from gslib.storage_url import StripOneSlash |
| 36 from gslib.storage_url import WILDCARD_REGEX |
| 37 from gslib.translation_helper import GenerationFromUrlAndString |
| 38 from gslib.util import UTF8 |
| 39 |
| 40 |
| 41 FLAT_LIST_REGEX = re.compile(r'(?P<before>.*?)\*\*(?P<after>.*)') |
| 75 | 42 |
| 76 | 43 |
| 77 class WildcardIterator(object): | 44 class WildcardIterator(object): |
| 78 """Base class for wildcarding over StorageUris. | 45 """Class for iterating over Google Cloud Storage strings containing wildcards. |
| 79 | |
| 80 This class implements support for iterating over StorageUris that | |
| 81 contain wildcards. | |
| 82 | 46 |
| 83 The base class is abstract; you should instantiate using the | 47 The base class is abstract; you should instantiate using the |
| 84 wildcard_iterator() static factory method, which chooses the right | 48 wildcard_iterator() static factory method, which chooses the right |
| 85 implementation depending on the StorageUri. | 49 implementation depending on the base string. |
| 86 """ | 50 """ |
| 87 | 51 |
| 52 # TODO: Standardize on __str__ and __repr__ here and elsewhere. Define both |
| 53 # and make one return the other. |
| 88 def __repr__(self): | 54 def __repr__(self): |
| 89 """Returns string representation of WildcardIterator.""" | 55 """Returns string representation of WildcardIterator.""" |
| 90 return 'WildcardIterator(%s)' % self.wildcard_uri | 56 return 'WildcardIterator(%s)' % self.wildcard_url.url_string |
| 91 | 57 |
| 92 | 58 |
| 93 class CloudWildcardIterator(WildcardIterator): | 59 class CloudWildcardIterator(WildcardIterator): |
| 94 """WildcardIterator subclass for buckets and objects. | 60 """WildcardIterator subclass for buckets, bucket subdirs and objects. |
| 95 | 61 |
| 96 Iterates over BucketListingRef matching the StorageUri wildcard. It's | 62 Iterates over BucketListingRef matching the Url string wildcard. It's |
| 97 much more efficient to request the Key from the BucketListingRef (via | 63 much more efficient to first get metadata that's available in the Bucket |
| 98 GetKey()) than to request the StorageUri and then call uri.get_key() | 64 (for example to get the name and size of each object), because that |
| 99 to retrieve the key, for cases where you want to get metadata that's | 65 information is available in the object list results. |
| 100 available in the Bucket (for example to get the name and size of | |
| 101 each object), because that information is available in the bucket GET | |
| 102 results. If you were to iterate over URIs for such cases and then get | |
| 103 the name and size info from each resulting StorageUri, it would cause | |
| 104 an additional object GET request for each of the result URIs. | |
| 105 """ | 66 """ |
| 106 | 67 |
| 107 def __init__(self, wildcard_uri, proj_id_handler, | 68 def __init__(self, wildcard_url, gsutil_api, all_versions=False, |
| 108 bucket_storage_uri_class=BucketStorageUri, all_versions=False, | 69 debug=0, project_id=None): |
| 109 headers=None, debug=0): | 70 """Instantiates an iterator that matches the wildcard URL. |
| 71 |
| 72 Args: |
| 73 wildcard_url: CloudUrl that contains the wildcard to iterate. |
| 74 gsutil_api: Cloud storage interface. Passed in for thread safety, also |
| 75 settable for testing/mocking. |
| 76 all_versions: If true, the iterator yields all versions of objects |
| 77 matching the wildcard. If false, yields just the live |
| 78 object version. |
| 79 debug: Debug level to control debug output for iterator. |
| 80 project_id: Project ID to use for bucket listings. |
| 110 """ | 81 """ |
| 111 Instantiates an iterator over BucketListingRef matching given wildcard URI. | 82 self.wildcard_url = wildcard_url |
| 112 | |
| 113 Args: | |
| 114 wildcard_uri: StorageUri that contains the wildcard to iterate. | |
| 115 proj_id_handler: ProjectIdHandler to use for current command. | |
| 116 bucket_storage_uri_class: BucketStorageUri interface. | |
| 117 Settable for testing/mocking. | |
| 118 headers: Dictionary containing optional HTTP headers to pass to boto. | |
| 119 debug: Debug level to pass in to boto connection (range 0..3). | |
| 120 """ | |
| 121 self.wildcard_uri = wildcard_uri | |
| 122 # Make a copy of the headers so any updates we make during wildcard | |
| 123 # expansion aren't left in the input params (specifically, so we don't | |
| 124 # include the x-goog-project-id header needed by a subset of cases, in | |
| 125 # the data returned to caller, which could then be used in other cases | |
| 126 # where that header must not be passed). | |
| 127 if headers is None: | |
| 128 self.headers = {} | |
| 129 else: | |
| 130 self.headers = headers.copy() | |
| 131 self.proj_id_handler = proj_id_handler | |
| 132 self.bucket_storage_uri_class = bucket_storage_uri_class | |
| 133 self.all_versions = all_versions | 83 self.all_versions = all_versions |
| 134 self.debug = debug | 84 self.debug = debug |
| 135 | 85 self.gsutil_api = gsutil_api |
| 136 def __iter__(self): | 86 self.project_id = project_id |
| 137 """Python iterator that gets called when iterating over cloud wildcard. | 87 |
| 88 def __iter__(self, bucket_listing_fields=None, |
| 89 expand_top_level_buckets=False): |
| 90 """Iterator that gets called when iterating over the cloud wildcard. |
| 91 |
| 92 In the case where no wildcard is present, returns a single matching object, |
| 93 single matching prefix, or one of each if both exist. |
| 94 |
| 95 Args: |
| 96 bucket_listing_fields: Iterable fields to include in bucket listings. |
| 97 Ex. ['name', 'acl']. Iterator is |
| 98 responsible for converting these to list-style |
| 99 format ['items/name', 'items/acl'] as well as |
| 100 adding any fields necessary for listing such as |
| 101 prefixes. API implementation is responsible for |
| 102 adding pagination fields. If this is None, |
| 103 all fields are returned. |
| 104 expand_top_level_buckets: If true, yield no BUCKET references. Instead, |
| 105 expand buckets into top-level objects and |
| 106 prefixes. |
| 138 | 107 |
| 139 Yields: | 108 Yields: |
| 140 BucketListingRef, or empty iterator if no matches. | 109 BucketListingRef of type BUCKET, OBJECT or PREFIX. |
| 141 """ | 110 """ |
| 142 # First handle bucket wildcarding, if any. | 111 single_version_request = self.wildcard_url.HasGeneration() |
| 143 if ContainsWildcard(self.wildcard_uri.bucket_name): | 112 |
| 144 regex = fnmatch.translate(self.wildcard_uri.bucket_name) | 113 # For wildcard expansion purposes, we need at a minimum the name of |
| 145 bucket_uris = [] | 114 # each object and prefix. If we're not using the default of requesting |
| 146 prog = re.compile(regex) | 115 # all fields, make sure at least these are requested. The Cloud API |
| 147 self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_BUCKET_ITERATOR, | 116 # tolerates specifying the same field twice. |
| 148 self.wildcard_uri, | 117 get_fields = None |
| 149 self.headers) | 118 if bucket_listing_fields: |
| 150 for b in self.wildcard_uri.get_all_buckets(headers=self.headers): | 119 get_fields = set() |
| 151 if prog.match(b.name): | 120 for field in bucket_listing_fields: |
| 152 # Use str(b.name) because get_all_buckets() returns Unicode | 121 get_fields.add(field) |
| 153 # string, which when used to construct x-goog-copy-src metadata | 122 bucket_listing_fields = self._GetToListFields( |
| 154 # requests for object-to-object copies causes pathname '/' chars | 123 get_fields=bucket_listing_fields) |
| 155 # to be entity-encoded (bucket%2Fdir instead of bucket/dir), | 124 bucket_listing_fields.update(['items/name', 'prefixes']) |
| 156 # which causes the request to fail. | 125 get_fields.update(['name']) |
| 157 uri_str = '%s://%s' % (self.wildcard_uri.scheme, | 126 # If we're making versioned requests, ensure generation and |
| 158 urllib.quote_plus(str(b.name))) | 127 # metageneration are also included. |
| 159 # TODO: Move bucket_uris to a separate generator function that yields | 128 if single_version_request or self.all_versions: |
| 160 # values instead of pre-computing the list. | 129 bucket_listing_fields.update(['items/generation', |
| 161 bucket_uris.append( | 130 'items/metageneration']) |
| 162 boto.storage_uri( | 131 get_fields.update(['generation', 'metageneration']) |
| 163 uri_str, debug=self.debug, | 132 |
| 164 bucket_storage_uri_class=self.bucket_storage_uri_class, | 133 # Handle bucket wildcarding, if any, in _ExpandBucketWildcards. Then |
| 165 suppress_consec_slashes=False)) | 134 # iterate over the expanded bucket strings and handle any object |
| 166 else: | 135 # wildcarding. |
| 167 bucket_uris = [self.wildcard_uri.clone_replace_name('')] | 136 for bucket_listing_ref in self._ExpandBucketWildcards(bucket_fields=['id']): |
| 168 | 137 bucket_url_string = bucket_listing_ref.url_string |
| 169 # Now iterate over bucket(s), and handle object wildcarding, if any. | 138 if self.wildcard_url.IsBucket(): |
| 170 self.proj_id_handler.FillInProjectHeaderIfNeeded(WILDCARD_OBJECT_ITERATOR, | 139 # IsBucket() guarantees there are no prefix or object wildcards, and |
| 171 self.wildcard_uri, | 140 # thus this is a top-level listing of buckets. |
| 172 self.headers) | 141 if expand_top_level_buckets: |
| 173 for bucket_uri in bucket_uris: | 142 url = StorageUrlFromString(bucket_url_string) |
| 174 if self.wildcard_uri.names_bucket(): | 143 for obj_or_prefix in self.gsutil_api.ListObjects( |
| 175 # Bucket-only URI. | 144 url.bucket_name, delimiter='/', all_versions=self.all_versions, |
| 176 yield BucketListingRef(bucket_uri, key=None, prefix=None, | 145 provider=self.wildcard_url.scheme, |
| 177 headers=self.headers) | 146 fields=bucket_listing_fields): |
| 147 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: |
| 148 yield self._GetObjectRef(bucket_url_string, obj_or_prefix.data, |
| 149 with_version=self.all_versions) |
| 150 else: # CloudApi.CsObjectOrPrefixType.PREFIX: |
| 151 yield self._GetPrefixRef(bucket_url_string, obj_or_prefix.data) |
| 152 else: |
| 153 yield bucket_listing_ref |
| 178 else: | 154 else: |
| 179 # URI contains an object name. If there's no wildcard just yield | 155 # By default, assume a non-wildcarded URL is an object, not a prefix. |
| 180 # the needed URI. | 156 # This prevents unnecessary listings (which are slower, more expensive, |
| 181 if not ContainsWildcard(self.wildcard_uri.object_name): | 157 # and also subject to eventual consistency). |
| 182 uri_to_yield = bucket_uri.clone_replace_name( | 158 if (not ContainsWildcard(self.wildcard_url.url_string) and |
| 183 self.wildcard_uri.object_name) | 159 self.wildcard_url.IsObject() and not self.all_versions): |
| 184 yield BucketListingRef(uri_to_yield, key=None, prefix=None, | 160 try: |
| 185 headers=self.headers) | 161 get_object = self.gsutil_api.GetObjectMetadata( |
| 162 self.wildcard_url.bucket_name, |
| 163 self.wildcard_url.object_name, |
| 164 generation=self.wildcard_url.generation, |
| 165 provider=self.wildcard_url.scheme, |
| 166 fields=get_fields) |
| 167 yield self._GetObjectRef( |
| 168 self.wildcard_url.bucket_url_string, get_object, |
| 169 with_version=(self.all_versions or single_version_request)) |
| 170 return |
| 171 except (NotFoundException, AccessDeniedException): |
| 172 # It's possible this is a prefix - try to list instead. |
| 173 pass |
| 174 |
| 175 # Expand iteratively by building prefix/delimiter bucket listing |
| 176 # request, filtering the results per the current level's wildcard |
| 177 # (if present), and continuing with the next component of the |
| 178 # wildcard. See _BuildBucketFilterStrings() documentation for details. |
| 179 if single_version_request: |
| 180 url_string = '%s%s#%s' % (bucket_url_string, |
| 181 self.wildcard_url.object_name, |
| 182 self.wildcard_url.generation) |
| 186 else: | 183 else: |
| 187 # URI contains a wildcard. Expand iteratively by building | 184 # Rstrip any prefixes to correspond with rstripped prefix wildcard |
| 188 # prefix/delimiter bucket listing request, filtering the results per | 185 # from _BuildBucketFilterStrings(). |
| 189 # the current level's wildcard, and continuing with the next component | 186 url_string = '%s%s' % (bucket_url_string, |
| 190 # of the wildcard. See _BuildBucketFilterStrings() documentation | 187 StripOneSlash(self.wildcard_url.object_name) |
| 191 # for details. | 188 or '/') # Cover root object named '/' case. |
| 192 # | 189 urls_needing_expansion = [url_string] |
| 193 # Initialize the iteration with bucket name from bucket_uri but | 190 while urls_needing_expansion: |
| 194 # object name from self.wildcard_uri. This is needed to handle cases | 191 url = StorageUrlFromString(urls_needing_expansion.pop(0)) |
| 195 # where both the bucket and object names contain wildcards. | 192 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = ( |
| 196 uris_needing_expansion = [ | 193 self._BuildBucketFilterStrings(url.object_name)) |
| 197 bucket_uri.clone_replace_name(self.wildcard_uri.object_name)] | 194 prog = re.compile(fnmatch.translate(prefix_wildcard)) |
| 198 while len(uris_needing_expansion) > 0: | 195 |
| 199 uri = uris_needing_expansion.pop(0) | 196 # List bucket for objects matching prefix up to delimiter. |
| 200 (prefix, delimiter, prefix_wildcard, suffix_wildcard) = ( | 197 for obj_or_prefix in self.gsutil_api.ListObjects( |
| 201 self._BuildBucketFilterStrings(uri.object_name)) | 198 url.bucket_name, prefix=prefix, delimiter=delimiter, |
| 202 prog = re.compile(fnmatch.translate(prefix_wildcard)) | 199 all_versions=self.all_versions or single_version_request, |
| 203 # List bucket for objects matching prefix up to delimiter. | 200 provider=self.wildcard_url.scheme, |
| 204 for key in bucket_uri.list_bucket(prefix=prefix, | 201 fields=bucket_listing_fields): |
| 205 delimiter=delimiter, | 202 if obj_or_prefix.datatype == CloudApi.CsObjectOrPrefixType.OBJECT: |
| 206 headers=self.headers, | 203 gcs_object = obj_or_prefix.data |
| 207 all_versions=self.all_versions): | 204 if prog.match(gcs_object.name): |
| 208 # Check that the prefix regex matches rstripped key.name (to | 205 if not suffix_wildcard or ( |
| 209 # correspond with the rstripped prefix_wildcard from | 206 StripOneSlash(gcs_object.name) == suffix_wildcard): |
| 210 # _BuildBucketFilterStrings()). | 207 if not single_version_request or ( |
| 211 keyname = key.name | 208 self._SingleVersionMatches(gcs_object.generation)): |
| 212 if isinstance(key, Prefix): | 209 yield self._GetObjectRef( |
| 213 keyname = keyname.rstrip('/') | 210 bucket_url_string, gcs_object, with_version=( |
| 214 if prog.match(keyname): | 211 self.all_versions or single_version_request)) |
| 215 if suffix_wildcard and keyname != suffix_wildcard: | 212 else: # CloudApi.CsObjectOrPrefixType.PREFIX |
| 216 if isinstance(key, Prefix): | 213 prefix = obj_or_prefix.data |
| 217 # There's more wildcard left to expand. | 214 # If the prefix ends with a slash, remove it. Note that we only |
| 218 uris_needing_expansion.append( | 215 # remove one slash so that we can successfully enumerate dirs |
| 219 uri.clone_replace_name(key.name.rstrip('/') + '/' | 216 # containing multiple slashes. |
| 220 + suffix_wildcard)) | 217 rstripped_prefix = StripOneSlash(prefix) |
| 218 if prog.match(rstripped_prefix): |
| 219 if suffix_wildcard and rstripped_prefix != suffix_wildcard: |
| 220 # There's more wildcard left to expand. |
| 221 url_append_string = '%s%s' % ( |
| 222 bucket_url_string, rstripped_prefix + '/' + |
| 223 suffix_wildcard) |
| 224 urls_needing_expansion.append(url_append_string) |
| 221 else: | 225 else: |
| 222 # Done expanding. | 226 # No wildcard to expand, just yield the prefix |
| 223 expanded_uri = uri.clone_replace_key(key) | 227 yield self._GetPrefixRef(bucket_url_string, prefix) |
| 224 | |
| 225 if isinstance(key, Prefix): | |
| 226 yield BucketListingRef(expanded_uri, key=None, prefix=key, | |
| 227 headers=self.headers) | |
| 228 else: | |
| 229 if self.all_versions: | |
| 230 yield BucketListingRef(expanded_uri, key=key, prefix=None, | |
| 231 headers=self.headers) | |
| 232 else: | |
| 233 # Yield BLR wrapping version-less URI. | |
| 234 yield BucketListingRef(expanded_uri.clone_replace_name( | |
| 235 expanded_uri.object_name), key=key, prefix=None, | |
| 236 headers=self.headers) | |
| 237 | 228 |
| 238 def _BuildBucketFilterStrings(self, wildcard): | 229 def _BuildBucketFilterStrings(self, wildcard): |
| 239 """ | 230 """Builds strings needed for querying a bucket and filtering results. |
| 240 Builds strings needed for querying a bucket and filtering results to | 231 |
| 241 implement wildcard object name matching. | 232 This implements wildcard object name matching. |
| 242 | 233 |
| 243 Args: | 234 Args: |
| 244 wildcard: The wildcard string to match to objects. | 235 wildcard: The wildcard string to match to objects. |
| 245 | 236 |
| 246 Returns: | 237 Returns: |
| 247 (prefix, delimiter, prefix_wildcard, suffix_wildcard) | 238 (prefix, delimiter, prefix_wildcard, suffix_wildcard) |
| 248 where: | 239 where: |
| 249 prefix is the prefix to be sent in bucket GET request. | 240 prefix is the prefix to be sent in bucket GET request. |
| 250 delimiter is the delimiter to be sent in bucket GET request. | 241 delimiter is the delimiter to be sent in bucket GET request. |
| 251 prefix_wildcard is the wildcard to be used to filter bucket GET results. | 242 prefix_wildcard is the wildcard to be used to filter bucket GET results. |
| (...skipping 31 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 283 prefix = wildcard[:match.start()] | 274 prefix = wildcard[:match.start()] |
| 284 wildcard_part = wildcard[match.start():] | 275 wildcard_part = wildcard[match.start():] |
| 285 else: | 276 else: |
| 286 prefix = None | 277 prefix = None |
| 287 wildcard_part = wildcard | 278 wildcard_part = wildcard |
| 288 end = wildcard_part.find('/') | 279 end = wildcard_part.find('/') |
| 289 if end != -1: | 280 if end != -1: |
| 290 wildcard_part = wildcard_part[:end+1] | 281 wildcard_part = wildcard_part[:end+1] |
| 291 # Remove trailing '/' so we will match gs://bucket/abc* as well as | 282 # Remove trailing '/' so we will match gs://bucket/abc* as well as |
| 292 # gs://bucket/abc*/ with the same wildcard regex. | 283 # gs://bucket/abc*/ with the same wildcard regex. |
| 293 prefix_wildcard = ((prefix or '') + wildcard_part).rstrip('/') | 284 prefix_wildcard = StripOneSlash((prefix or '') + wildcard_part) |
| 294 suffix_wildcard = wildcard[match.end():] | 285 suffix_wildcard = wildcard[match.end():] |
| 295 end = suffix_wildcard.find('/') | 286 end = suffix_wildcard.find('/') |
| 296 if end == -1: | 287 if end == -1: |
| 297 suffix_wildcard = '' | 288 suffix_wildcard = '' |
| 298 else: | 289 else: |
| 299 suffix_wildcard = suffix_wildcard[end+1:] | 290 suffix_wildcard = suffix_wildcard[end+1:] |
| 300 # To implement recursive (**) wildcarding, if prefix_wildcard | 291 # To implement recursive (**) wildcarding, if prefix_wildcard |
| 301 # suffix_wildcard starts with '**' don't send a delimiter, and combine | 292 # contains '**' don't send a delimiter, and combine |
| 302 # suffix_wildcard at end of prefix_wildcard. | 293 # suffix_wildcard at end of prefix_wildcard. |
| 303 if prefix_wildcard.find('**') != -1: | 294 if prefix_wildcard.find('**') != -1: |
| 304 delimiter = None | 295 delimiter = None |
| 305 prefix_wildcard = prefix_wildcard + suffix_wildcard | 296 prefix_wildcard += suffix_wildcard |
| 306 suffix_wildcard = '' | 297 suffix_wildcard = '' |
| 307 else: | 298 else: |
| 308 delimiter = '/' | 299 delimiter = '/' |
| 309 delim_pos = suffix_wildcard.find(delimiter) | |
| 310 # The following debug output is useful for tracing how the algorithm | 300 # The following debug output is useful for tracing how the algorithm |
| 311 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt | 301 # walks through a multi-part wildcard like gs://bucket/abc/d*e/f*.txt |
| 312 if self.debug > 1: | 302 if self.debug > 1: |
| 313 sys.stderr.write( | 303 sys.stderr.write( |
| 314 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, ' | 304 'DEBUG: wildcard=%s, prefix=%s, delimiter=%s, ' |
| 315 'prefix_wildcard=%s, suffix_wildcard=%s\n' % | 305 'prefix_wildcard=%s, suffix_wildcard=%s\n' % |
| 316 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard)) | 306 (wildcard, prefix, delimiter, prefix_wildcard, suffix_wildcard)) |
| 317 return (prefix, delimiter, prefix_wildcard, suffix_wildcard) | 307 return (prefix, delimiter, prefix_wildcard, suffix_wildcard) |
| 318 | 308 |
| 319 def IterKeys(self): | 309 def _SingleVersionMatches(self, listed_generation): |
| 320 """ | 310 decoded_generation = GenerationFromUrlAndString(self.wildcard_url, |
| 321 Convenience iterator that runs underlying iterator and returns Key for each | 311 listed_generation) |
| 322 iteration. | 312 return str(self.wildcard_url.generation) == str(decoded_generation) |
| 313 |
| 314 def _ExpandBucketWildcards(self, bucket_fields=None): |
| 315 """Expands bucket and provider wildcards. |
| 316 |
| 317 Builds a list of bucket url strings that can be iterated on. |
| 318 |
| 319 Args: |
| 320 bucket_fields: If present, populate only these metadata fields for |
| 321 buckets. Example value: ['acl', 'defaultObjectAcl'] |
| 323 | 322 |
| 324 Yields: | 323 Yields: |
| 325 Subclass of boto.s3.key.Key, or empty iterator if no matches. | 324 BucketListingRefs of type BUCKET. |
| 325 """ |
| 326 bucket_url = StorageUrlFromString(self.wildcard_url.bucket_url_string) |
| 327 if (bucket_fields and set(bucket_fields) == set(['id']) and |
| 328 not ContainsWildcard(self.wildcard_url.bucket_name)): |
| 329 # If we just want the name of a non-wildcarded bucket URL, |
| 330 # don't make an RPC. |
| 331 yield BucketListingBucket(bucket_url) |
| 332 elif(self.wildcard_url.IsBucket() and |
| 333 not ContainsWildcard(self.wildcard_url.bucket_name)): |
| 334 # If we have a non-wildcarded bucket URL, get just that bucket. |
| 335 yield BucketListingBucket( |
| 336 bucket_url, root_object=self.gsutil_api.GetBucket( |
| 337 self.wildcard_url.bucket_name, provider=self.wildcard_url.scheme, |
| 338 fields=bucket_fields)) |
| 339 else: |
| 340 regex = fnmatch.translate(self.wildcard_url.bucket_name) |
| 341 prog = re.compile(regex) |
| 326 | 342 |
| 327 Raises: | 343 fields = self._GetToListFields(bucket_fields) |
| 328 WildcardException: for bucket-only uri. | 344 if fields: |
| 345 fields.add('items/id') |
| 346 for bucket in self.gsutil_api.ListBuckets( |
| 347 fields=fields, project_id=self.project_id, |
| 348 provider=self.wildcard_url.scheme): |
| 349 if prog.match(bucket.id): |
| 350 url = StorageUrlFromString( |
| 351 '%s://%s/' % (self.wildcard_url.scheme, bucket.id)) |
| 352 yield BucketListingBucket(url, root_object=bucket) |
| 353 |
| 354 def _GetToListFields(self, get_fields=None): |
| 355 """Prepends 'items/' to the input fields and converts it to a set. |
| 356 |
| 357 This way field sets requested for GetBucket can be used in ListBucket calls. |
| 358 Note that the input set must contain only bucket or object fields; listing |
| 359 fields such as prefixes or nextPageToken should be added after calling |
| 360 this function. |
| 361 |
| 362 Args: |
| 363 get_fields: Iterable fields usable in GetBucket/GetObject calls. |
| 364 |
| 365 Returns: |
| 366 Set of fields usable in ListBuckets/ListObjects calls. |
| 329 """ | 367 """ |
| 330 for bucket_listing_ref in self. __iter__(): | 368 if get_fields: |
| 331 if bucket_listing_ref.HasKey(): | 369 list_fields = set() |
| 332 yield bucket_listing_ref.GetKey() | 370 for field in get_fields: |
| 371 list_fields.add('items/' + field) |
| 372 return list_fields |
| 333 | 373 |
| 334 def IterUris(self): | 374 def _GetObjectRef(self, bucket_url_string, gcs_object, with_version=False): |
| 375 """Creates a BucketListingRef of type OBJECT from the arguments. |
| 376 |
| 377 Args: |
| 378 bucket_url_string: Wildcardless string describing the containing bucket. |
| 379 gcs_object: gsutil_api root Object for populating the BucketListingRef. |
| 380 with_version: If true, return a reference with a versioned string. |
| 381 |
| 382 Returns: |
| 383 BucketListingRef of type OBJECT. |
| 335 """ | 384 """ |
| 336 Convenience iterator that runs underlying iterator and returns StorageUri | 385 # Generation can be None in test mocks, so just return the |
| 337 for each iteration. | 386 # live object for simplicity. |
| 387 if with_version and gcs_object.generation is not None: |
| 388 generation_str = GenerationFromUrlAndString(self.wildcard_url, |
| 389 gcs_object.generation) |
| 390 object_string = '%s%s#%s' % (bucket_url_string, gcs_object.name, |
| 391 generation_str) |
| 392 else: |
| 393 object_string = '%s%s' % (bucket_url_string, gcs_object.name) |
| 394 object_url = StorageUrlFromString(object_string) |
| 395 return BucketListingObject(object_url, root_object=gcs_object) |
| 396 |
| 397 def _GetPrefixRef(self, bucket_url_string, prefix): |
| 398 """Creates a BucketListingRef of type PREFIX from the arguments. |
| 399 |
| 400 Args: |
| 401 bucket_url_string: Wildcardless string describing the containing bucket. |
| 402 prefix: gsutil_api Prefix for populating the BucketListingRef |
| 403 |
| 404 Returns: |
| 405 BucketListingRef of type PREFIX. |
| 406 """ |
| 407 prefix_url = StorageUrlFromString('%s%s' % (bucket_url_string, prefix)) |
| 408 return BucketListingPrefix(prefix_url, root_object=prefix) |
| 409 |
| 410 def IterBuckets(self, bucket_fields=None): |
| 411 """Iterates over the wildcard, returning refs for each expanded bucket. |
| 412 |
| 413 This ignores the object part of the URL entirely and expands only the |
| 414 bucket portion. It will yield BucketListingRefs of type BUCKET only. |
| 415 |
| 416 Args: |
| 417 bucket_fields: Iterable fields to include in bucket listings. |
| 418 Ex. ['defaultObjectAcl', 'logging']. This function is |
| 419 responsible for converting these to listing-style |
| 420 format ['items/defaultObjectAcl', 'items/logging'], as |
| 421 well as adding any fields necessary for listing such as |
| 422 'items/id'. API implementation is responsible for |
| 423 adding pagination fields. If this is None, all fields are |
| 424 returned. |
| 338 | 425 |
| 339 Yields: | 426 Yields: |
| 340 StorageUri, or empty iterator if no matches. | 427 BucketListingRef of type BUCKET, or empty iterator if no matches. |
| 341 """ | 428 """ |
| 342 for bucket_listing_ref in self. __iter__(): | 429 for blr in self._ExpandBucketWildcards(bucket_fields=bucket_fields): |
| 343 yield bucket_listing_ref.GetUri() | 430 yield blr |
| 344 | 431 |
| 345 def IterUrisForKeys(self): | 432 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): |
| 346 """ | 433 """Iterates over the wildcard, yielding bucket, prefix or object refs. |
| 347 Convenience iterator that runs underlying iterator and returns the | 434 |
| 348 StorageUri for each iterated BucketListingRef that has a Key. | 435 Args: |
| 436 bucket_listing_fields: If present, populate only these metadata |
| 437 fields for listed objects. |
| 438 expand_top_level_buckets: If true and the wildcard expands only to |
| 439 Bucket(s), yields the expansion of each bucket |
| 440 into a top-level listing of prefixes and objects |
| 441 in that bucket instead of a BucketListingRef |
| 442 to that bucket. |
| 349 | 443 |
| 350 Yields: | 444 Yields: |
| 351 StorageUri, or empty iterator if no matches. | 445 BucketListingRef, or empty iterator if no matches. |
| 352 """ | 446 """ |
| 353 for bucket_listing_ref in self. __iter__(): | 447 for blr in self. __iter__( |
| 354 if bucket_listing_ref.HasKey(): | 448 bucket_listing_fields=bucket_listing_fields, |
| 355 yield bucket_listing_ref.GetUri() | 449 expand_top_level_buckets=expand_top_level_buckets): |
| 450 yield blr |
| 451 |
| 452 def IterObjects(self, bucket_listing_fields=None): |
| 453 """Iterates over the wildcard, yielding only object BucketListingRefs. |
| 454 |
| 455 Args: |
| 456 bucket_listing_fields: If present, populate only these metadata |
| 457 fields for listed objects. |
| 458 |
| 459 Yields: |
| 460 BucketListingRefs of type OBJECT or empty iterator if no matches. |
| 461 """ |
| 462 for blr in self. __iter__(bucket_listing_fields=bucket_listing_fields, |
| 463 expand_top_level_buckets=True): |
| 464 if blr.IsObject(): |
| 465 yield blr |
| 356 | 466 |
| 357 | 467 |
| 358 class FileWildcardIterator(WildcardIterator): | 468 class FileWildcardIterator(WildcardIterator): |
| 359 """WildcardIterator subclass for files and directories. | 469 """WildcardIterator subclass for files and directories. |
| 360 | 470 |
| 361 If you use recursive wildcards ('**') only a single such wildcard is | 471 If you use recursive wildcards ('**') only a single such wildcard is |
| 362 supported. For example you could use the wildcard '**/*.txt' to list all .txt | 472 supported. For example you could use the wildcard '**/*.txt' to list all .txt |
| 363 files in any subdirectory of the current directory, but you couldn't use a | 473 files in any subdirectory of the current directory, but you couldn't use a |
| 364 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt | 474 wildcard like '**/abc/**/*.txt' (which would, if supported, let you find .txt |
| 365 files in any subdirectory named 'abc'). | 475 files in any subdirectory named 'abc'). |
| 366 """ | 476 """ |
| 367 | 477 |
| 368 def __init__(self, wildcard_uri, headers=None, debug=0): | 478 def __init__(self, wildcard_url, debug=0): |
| 369 """ | 479 """Instantiates an iterator over BucketListingRefs matching wildcard URL. |
| 370 Instantiate an iterator over BucketListingRefs matching given wildcard URI. | |
| 371 | 480 |
| 372 Args: | 481 Args: |
| 373 wildcard_uri: StorageUri that contains the wildcard to iterate. | 482 wildcard_url: FileUrl that contains the wildcard to iterate. |
| 374 headers: Dictionary containing optional HTTP headers to pass to boto. | 483 debug: Debug level (range 0..3). |
| 375 debug: Debug level to pass in to boto connection (range 0..3). | |
| 376 """ | 484 """ |
| 377 self.wildcard_uri = wildcard_uri | 485 self.wildcard_url = wildcard_url |
| 378 self.headers = headers | |
| 379 self.debug = debug | 486 self.debug = debug |
| 380 | 487 |
| 381 def __iter__(self): | 488 def __iter__(self): |
| 382 wildcard = self.wildcard_uri.object_name | 489 """Iterator that gets called when iterating over the file wildcard. |
| 383 match = re.search('\*\*', wildcard) | 490 |
| 491 In the case where no wildcard is present, returns a single matching file |
| 492 or directory. |
| 493 |
| 494 Raises: |
| 495 WildcardException: if invalid wildcard found. |
| 496 |
| 497 Yields: |
| 498 BucketListingRef of type OBJECT (for files) or PREFIX (for directories) |
| 499 """ |
| 500 wildcard = self.wildcard_url.object_name |
| 501 match = FLAT_LIST_REGEX.match(wildcard) |
| 384 if match: | 502 if match: |
| 385 # Recursive wildcarding request ('.../**/...'). | 503 # Recursive wildcarding request ('.../**/...'). |
| 386 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*' | 504 # Example input: wildcard = '/tmp/tmp2pQJAX/**/*' |
| 387 base_dir = wildcard[:match.start()-1] | 505 base_dir = match.group('before')[:-1] |
| 388 remaining_wildcard = wildcard[match.start()+2:] | 506 remaining_wildcard = match.group('after') |
| 389 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and | 507 # At this point for the above example base_dir = '/tmp/tmp2pQJAX' and |
| 390 # remaining_wildcard = '/*' | 508 # remaining_wildcard = '/*' |
| 391 if remaining_wildcard.startswith('*'): | 509 if remaining_wildcard.startswith('*'): |
| 392 raise WildcardException('Invalid wildcard with more than 2 consecutive ' | 510 raise WildcardException('Invalid wildcard with more than 2 consecutive ' |
| 393 '*s (%s)' % wildcard) | 511 '*s (%s)' % wildcard) |
| 394 # If there was no remaining wildcard past the recursive wildcard, | 512 # If there was no remaining wildcard past the recursive wildcard, |
| 395 # treat it as if it were a '*'. For example, file://tmp/** is equivalent | 513 # treat it as if it were a '*'. For example, file://tmp/** is equivalent |
| 396 # to file://tmp/**/* | 514 # to file://tmp/**/* |
| 397 if not remaining_wildcard: | 515 if not remaining_wildcard: |
| 398 remaining_wildcard = '*' | 516 remaining_wildcard = '*' |
| 399 # Skip slash(es). | 517 # Skip slash(es). |
| 400 remaining_wildcard = remaining_wildcard.lstrip(os.sep) | 518 remaining_wildcard = remaining_wildcard.lstrip(os.sep) |
| 401 filepaths = self._iter_dir(base_dir, remaining_wildcard) | 519 filepaths = self._IterDir(base_dir, remaining_wildcard) |
| 402 else: | 520 else: |
| 403 # Not a recursive wildcarding request. | 521 # Not a recursive wildcarding request. |
| 404 filepaths = glob.iglob(wildcard) | 522 filepaths = glob.iglob(wildcard) |
| 405 for filepath in filepaths: | 523 for filepath in filepaths: |
| 406 expanded_uri = self.wildcard_uri.clone_replace_name(filepath) | 524 expanded_url = StorageUrlFromString(filepath) |
| 407 yield BucketListingRef(expanded_uri) | 525 if os.path.isdir(filepath): |
| 526 yield BucketListingPrefix(expanded_url) |
| 527 else: |
| 528 yield BucketListingObject(expanded_url) |
| 408 | 529 |
| 409 def _iter_dir(self, dir, wildcard): | 530 def _IterDir(self, directory, wildcard): |
| 410 """An iterator over the specified dir and wildcard.""" | 531 """An iterator over the specified dir and wildcard.""" |
| 411 for dirpath, unused_dirnames, filenames in os.walk(dir): | 532 # UTF8-encode directory before passing it to os.walk() so if there are |
| 533 # non-valid UTF8 chars in the file name (e.g., that can happen if the file |
| 534 # originated on Windows) os.walk() will not attempt to decode and then die |
| 535 # with a "codec can't decode byte" error, and instead we can catch the error |
| 536 # at yield time and print a more informative error message. |
| 537 for dirpath, unused_dirnames, filenames in os.walk(directory.encode(UTF8)): |
| 412 for f in fnmatch.filter(filenames, wildcard): | 538 for f in fnmatch.filter(filenames, wildcard): |
| 413 yield os.path.join(dirpath, f) | 539 try: |
| 540 yield os.path.join(dirpath, f).decode(UTF8) |
| 541 except UnicodeDecodeError: |
| 542 # Note: We considered several ways to deal with this, but each had |
| 543 # problems: |
| 544 # 1. Raise an exception and try to catch in a higher layer (the |
| 545 # gsutil cp command), so we can properly support the gsutil cp -c |
| 546 # option. That doesn't work because raising an exception during |
| 547 # iteration terminates the generator. |
| 548 # 2. Accumulate a list of bad filenames and skip processing each |
| 549 # during iteration, then raise at the end, with exception text |
| 550 # printing the bad paths. That doesn't work because iteration is |
| 551 # wrapped in PluralityCheckableIterator, so it's possible there |
| 552 # are not-yet-performed copy operations at the time we reach the |
| 553 # end of the iteration and raise the exception - which would cause |
| 554 # us to skip copying validly named files. Moreover, the gsutil |
| 555 # cp command loops over argv, so if you run the command gsutil cp |
| 556 # -rc dir1 dir2 gs://bucket, an invalid unicode name inside dir1 |
| 557 # would cause dir2 never to be visited. |
| 558 # 3. Print the invalid pathname and skip it during iteration. That |
| 559 # would work but would mean gsutil cp could exit with status 0 |
| 560 # even though some files weren't copied. |
| 561 # 4. Change the WildcardIterator to include an error status along with |
| 562 # the result. That would solve the problem but would be a |
| 563 # substantial change (WildcardIterator is used in many parts of |
| 564 # gsutil), and we didn't feel that magnitude of change was |
| 565 # warranted by this relatively uncommon corner case. |
| 566 # Instead we chose to abort when one such file is encountered, and |
| 567 # require the user to remove or rename the files and try again. |
| 568 raise CommandException('\n'.join(textwrap.wrap( |
| 569 'Invalid Unicode path encountered (%s). gsutil cannot proceed ' |
| 570 'with such files present. Please remove or rename this file and ' |
| 571 'try again.' % repr(os.path.join(dirpath, f))))) |
| 414 | 572 |
| 415 def IterKeys(self): | 573 # pylint: disable=unused-argument |
| 574 def IterObjects(self, bucket_listing_fields=None): |
| 575 """Iterates over the wildcard, yielding only object (file) refs. |
| 576 |
| 577 Args: |
| 578 bucket_listing_fields: Ignored as filesystems don't have buckets. |
| 579 |
| 580 Yields: |
| 581 BucketListingRefs of type OBJECT or empty iterator if no matches. |
| 416 """ | 582 """ |
| 417 Placeholder to allow polymorphic use of WildcardIterator. | 583 for bucket_listing_ref in self.IterAll(): |
| 584 if bucket_listing_ref.IsObject(): |
| 585 yield bucket_listing_ref |
| 586 |
| 587 # pylint: disable=unused-argument |
| 588 def IterAll(self, bucket_listing_fields=None, expand_top_level_buckets=False): |
| 589 """Iterates over the wildcard, yielding BucketListingRefs. |
| 590 |
| 591 Args: |
| 592 bucket_listing_fields: Ignored; filesystems don't have buckets. |
| 593 expand_top_level_buckets: Ignored; filesystems don't have buckets. |
| 594 |
| 595 Yields: |
| 596 BucketListingRefs of type OBJECT (file) or PREFIX (directory), |
| 597 or empty iterator if no matches. |
| 598 """ |
| 599 for bucket_listing_ref in self.__iter__(): |
| 600 yield bucket_listing_ref |
| 601 |
| 602 def IterBuckets(self, unused_bucket_fields=None): |
| 603 """Placeholder to allow polymorphic use of WildcardIterator. |
| 604 |
| 605 Args: |
| 606 unused_bucket_fields: Ignored; filesystems don't have buckets. |
| 418 | 607 |
| 419 Raises: | 608 Raises: |
| 420 WildcardException: in all cases. | 609 WildcardException: in all cases. |
| 421 """ | 610 """ |
| 422 raise WildcardException( | 611 raise WildcardException( |
| 423 'Iterating over Keys not possible for file wildcards') | 612 'Iterating over Buckets not possible for file wildcards') |
| 424 | |
| 425 def IterUris(self): | |
| 426 """ | |
| 427 Convenience iterator that runs underlying iterator and returns StorageUri | |
| 428 for each iteration. | |
| 429 | |
| 430 Yields: | |
| 431 StorageUri, or empty iterator if no matches. | |
| 432 """ | |
| 433 for bucket_listing_ref in self. __iter__(): | |
| 434 yield bucket_listing_ref.GetUri() | |
| 435 | 613 |
| 436 | 614 |
| 437 class WildcardException(StandardError): | 615 class WildcardException(StandardError): |
| 438 """Exception thrown for invalid wildcard URIs.""" | 616 """Exception raised for invalid wildcard URLs.""" |
| 439 | 617 |
| 440 def __init__(self, reason): | 618 def __init__(self, reason): |
| 441 StandardError.__init__(self) | 619 StandardError.__init__(self) |
| 442 self.reason = reason | 620 self.reason = reason |
| 443 | 621 |
| 444 def __repr__(self): | 622 def __repr__(self): |
| 445 return 'WildcardException: %s' % self.reason | 623 return 'WildcardException: %s' % self.reason |
| 446 | 624 |
| 447 def __str__(self): | 625 def __str__(self): |
| 448 return 'WildcardException: %s' % self.reason | 626 return 'WildcardException: %s' % self.reason |
| 449 | 627 |
| 450 | 628 |
| 451 def wildcard_iterator(uri_or_str, proj_id_handler, | 629 def CreateWildcardIterator(url_str, gsutil_api, all_versions=False, debug=0, |
| 452 bucket_storage_uri_class=BucketStorageUri, | 630 project_id=None): |
| 453 all_versions=False, | 631 """Instantiate a WildcardIterator for the given URL string. |
| 454 headers=None, debug=0): | |
| 455 """Instantiate a WildCardIterator for the given StorageUri. | |
| 456 | 632 |
| 457 Args: | 633 Args: |
| 458 uri_or_str: StorageUri or URI string naming wildcard objects to iterate. | 634 url_str: URL string naming wildcard object(s) to iterate. |
| 459 proj_id_handler: ProjectIdHandler to use for current command. | 635 gsutil_api: Cloud storage interface. Passed in for thread safety, also |
| 460 bucket_storage_uri_class: BucketStorageUri interface. | 636 settable for testing/mocking. |
| 461 Settable for testing/mocking. | 637 all_versions: If true, the iterator yields all versions of objects |
| 462 headers: Dictionary containing optional HTTP headers to pass to boto. | 638 matching the wildcard. If false, yields just the live |
| 463 debug: Debug level to pass in to boto connection (range 0..3). | 639 object version. |
| 640 debug: Debug level to control debug output for iterator. |
| 641 project_id: Project id to use for bucket listings. |
| 464 | 642 |
| 465 Returns: | 643 Returns: |
| 466 A WildcardIterator that handles the requested iteration. | 644 A WildcardIterator that handles the requested iteration. |
| 467 """ | 645 """ |
| 468 | 646 |
| 469 if isinstance(uri_or_str, basestring): | 647 url = StorageUrlFromString(url_str) |
| 470 # Disable enforce_bucket_naming, to allow bucket names containing wildcard | 648 if url.IsFileUrl(): |
| 471 # chars. | 649 return FileWildcardIterator(url, debug=debug) |
| 472 uri = boto.storage_uri( | 650 else: # Cloud URL |
| 473 uri_or_str, debug=debug, validate=False, | |
| 474 bucket_storage_uri_class=bucket_storage_uri_class, | |
| 475 suppress_consec_slashes=False) | |
| 476 else: | |
| 477 uri = uri_or_str | |
| 478 | |
| 479 if uri.is_cloud_uri(): | |
| 480 return CloudWildcardIterator( | 651 return CloudWildcardIterator( |
| 481 uri, proj_id_handler, | 652 url, gsutil_api, all_versions=all_versions, debug=debug, |
| 482 bucket_storage_uri_class=bucket_storage_uri_class, | 653 project_id=project_id) |
| 483 all_versions=all_versions, | |
| 484 headers=headers, | |
| 485 debug=debug) | |
| 486 elif uri.is_file_uri(): | |
| 487 return FileWildcardIterator(uri, headers=headers, debug=debug) | |
| 488 else: | |
| 489 raise WildcardException('Unexpected type of StorageUri (%s)' % uri) | |
| 490 | |
| 491 | |
| 492 def ContainsWildcard(uri_or_str): | |
| 493 """Checks whether uri_or_str contains a wildcard. | |
| 494 | |
| 495 Args: | |
| 496 uri_or_str: StorageUri or URI string to check. | |
| 497 | |
| 498 Returns: | |
| 499 bool indicator. | |
| 500 """ | |
| 501 if isinstance(uri_or_str, basestring): | |
| 502 return bool(WILDCARD_REGEX.search(uri_or_str)) | |
| 503 else: | |
| 504 return bool(WILDCARD_REGEX.search(uri_or_str.uri)) | |
| OLD | NEW |