| OLD | NEW |
| (Empty) |
| 1 # -*- coding: utf-8 -*- | |
| 2 # Copyright 2013 Google Inc. All Rights Reserved. | |
| 3 # | |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 # you may not use this file except in compliance with the License. | |
| 6 # You may obtain a copy of the License at | |
| 7 # | |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 # | |
| 10 # Unless required by applicable law or agreed to in writing, software | |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 # See the License for the specific language governing permissions and | |
| 14 # limitations under the License. | |
| 15 """File and Cloud URL representation classes.""" | |
| 16 | |
| 17 from __future__ import absolute_import | |
| 18 | |
| 19 import os | |
| 20 import re | |
| 21 | |
| 22 from gslib.exception import InvalidUrlError | |
| 23 | |
# Matches provider-only strings of the form 'gs://' (nothing after '://').
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket', with at most one
# trailing slash.
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'; everything after the
# first slash following the bucket is captured as the object name.
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')
# Matches versioned object names of the form 'obj#1234' (numeric generation).
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Matches versioned object names of the form 'obj#NULL' (any non-empty
# version id after the '#').
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')
# Matches file strings of the form 'file://dir/filename'; the text after
# '://' is captured as 'filepath'.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')
# Regex to disallow buckets violating charset or not [3..255] chars total.
BUCKET_NAME_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9\._-]{1,253}[a-zA-Z0-9]$')
# Regex to disallow buckets with individual DNS labels longer than 63: it
# finds any run of 64 consecutive label characters (i.e. no '.' in between).
TOO_LONG_DNS_NAME_COMP = re.compile(r'[-_a-z0-9]{64}')
# Regex to determine if a string contains any wildcard character: * ? [ ]
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
| 43 | |
| 44 | |
class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs.

  Every method and property that raises NotImplementedError here must be
  overridden by subclasses. Equality and hashing are both derived from
  url_string, so two URLs that render to the same string compare equal and
  hash identically.
  """

  def Clone(self):
    """Returns a copy of this URL object; must be overridden."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns whether this is a file URL; must be overridden."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns whether this is a cloud URL; must be overridden."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns whether this URL denotes a stream; must be overridden."""
    raise NotImplementedError('IsStream not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following contents:
      gs://bucket// <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the following
    URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
         'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
         as a BucketListingObject, so we will not recurse on it as a subdir
         during listing.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be appended to the
                       prefix with a trailing slash before being returned.

    Returns:
      A prefix of this URL that can be used for iterating.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """Full string form of this URL; must be overridden."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """String form of this URL without version info; must be overridden."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    # Comparing against an unrelated type must not blow up with an
    # AttributeError on other.url_string; let Python try the reflected
    # comparison and fall back to "not equal".
    if not isinstance(other, StorageUrl):
      return NotImplemented
    return self.url_string == other.url_string

  def __ne__(self, other):
    # Defined explicitly because Python 2 does not derive != from __eq__;
    # without it, != would disagree with == for equal-valued URLs.
    result = self.__eq__(other)
    if result is NotImplemented:
      return result
    return not result

  def __hash__(self):
    return hash(self.url_string)
| 112 | |
| 113 | |
class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

  This class assists with usage and manipulation of an
  (optionally wildcarded) file URL string. Depending on the string
  contents, this class represents one or more directories or files.

  For File URLs, scheme is always file, bucket_name is always blank,
  and object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False):
    """Parses a file URL string.

    Args:
      url_string: File path, optionally prefixed with a scheme such as
                  'file://'.
      is_stream: Whether this URL represents a stream rather than a file or
                 directory on disk.
    """
    self.scheme = 'file'
    self.bucket_name = ''
    match = FILE_OBJECT_REGEX.match(url_string)
    if match and match.lastindex == 2:
      # Drop the '<scheme>://' prefix; keep only the path portion.
      self.object_name = match.group(2)
    else:
      # No scheme prefix: the whole string is the path.
      self.object_name = url_string
    self.generation = None
    self.is_stream = is_stream
    self.delim = os.sep

  def Clone(self):
    """Returns a new _FileUrl equivalent to this one."""
    # Pass is_stream through explicitly; it cannot be recovered from the URL
    # string alone, so omitting it would turn a stream clone into a non-stream.
    return _FileUrl(self.url_string, is_stream=self.is_stream)

  def IsFileUrl(self):
    """Always True for file URLs."""
    return True

  def IsCloudUrl(self):
    """Always False for file URLs."""
    return False

  def IsStream(self):
    """Returns the is_stream flag supplied at construction."""
    return self.is_stream

  def IsDirectory(self):
    """Returns True if this is not a stream and the path is a directory."""
    return not self.IsStream() and os.path.isdir(self.object_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns url_string unchanged; wildcard_suffix is ignored here."""
    return self.url_string

  @property
  def url_string(self):
    """The URL rendered as '<scheme>://<path>'."""
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    """Same as url_string; file URLs carry no version information."""
    return self.url_string

  def __str__(self):
    return self.url_string
| 165 | |
| 166 | |
class _CloudUrl(StorageUrl):
  """Cloud URL class providing parsing and convenience methods.

  This class assists with usage and manipulation of an
  (optionally wildcarded) cloud URL string. Depending on the string
  contents, this class represents a provider, bucket(s), or object(s).

  This class operates only on strings. No cloud storage API calls are
  made from this class.
  """

  def __init__(self, url_string):
    """Parses a cloud URL string into scheme, bucket, object and generation.

    The scheme is stored lower-cased so that the scheme comparisons below
    (and the rendered url_string) agree with the case-insensitive scheme
    detection performed by _GetSchemeFromUrlString.

    Args:
      url_string: Provider ('gs://'), bucket ('gs://bucket') or object
                  ('gs://bucket/obj', optionally with a '#<version>' suffix)
                  URL string.

    Raises:
      InvalidUrlError: If a bucket-only URL has a non-wildcard bucket name
                       that fails validation, or if the string matches none
                       of the provider, bucket or object patterns.
    """
    self.scheme = None
    self.bucket_name = None
    self.object_name = None
    self.generation = None
    self.delim = '/'
    provider_match = PROVIDER_REGEX.match(url_string)
    bucket_match = BUCKET_REGEX.match(url_string)
    if provider_match:
      self.scheme = provider_match.group('provider').lower()
    elif bucket_match:
      self.scheme = bucket_match.group('provider').lower()
      self.bucket_name = bucket_match.group('bucket')
      # Wildcarded bucket names are exempt from validation; literal names
      # must satisfy the charset/length rule and have no over-long DNS label.
      if (not ContainsWildcard(self.bucket_name) and
          (not BUCKET_NAME_RE.match(self.bucket_name) or
           TOO_LONG_DNS_NAME_COMP.search(self.bucket_name))):
        raise InvalidUrlError('Invalid bucket name in URL "%s"' % url_string)
    else:
      object_match = OBJECT_REGEX.match(url_string)
      if object_match:
        self.scheme = object_match.group('provider').lower()
        self.bucket_name = object_match.group('bucket')
        self.object_name = object_match.group('object')
        # Split a trailing '#<version>' off the object name; the accepted
        # version syntax depends on the provider.
        if self.scheme == 'gs':
          generation_match = GS_GENERATION_REGEX.match(self.object_name)
          if generation_match:
            self.object_name = generation_match.group('object')
            self.generation = generation_match.group('generation')
        elif self.scheme == 's3':
          version_match = S3_VERSION_REGEX.match(self.object_name)
          if version_match:
            self.object_name = version_match.group('object')
            self.generation = version_match.group('version_id')
      else:
        raise InvalidUrlError(
            'CloudUrl: URL string %s did not match URL regex' % url_string)

  def Clone(self):
    """Returns a new _CloudUrl built by re-parsing this URL's string form."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    """Always False for cloud URLs."""
    return False

  def IsCloudUrl(self):
    """Always True for cloud URLs."""
    return True

  def IsStream(self):
    """Not supported for cloud URLs; always raises NotImplementedError."""
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsBucket(self):
    """Returns True if a bucket name is set and no object name is set."""
    return bool(self.bucket_name and not self.object_name)

  def IsObject(self):
    """Returns True if both a bucket name and an object name are set."""
    return bool(self.bucket_name and self.object_name)

  def HasGeneration(self):
    """Returns True if a generation / version id was parsed."""
    return bool(self.generation)

  def IsProvider(self):
    """Returns True if only the scheme is set (no bucket name)."""
    return bool(self.scheme and not self.bucket_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL minus one trailing slash, plus a suffix.

    Args:
      wildcard_suffix: If supplied, appended to the prefix after a '/'.

    Returns:
      Prefix string suitable for iterating.
    """
    prefix = StripOneSlash(self.versionless_url_string)
    if wildcard_suffix:
      prefix = '%s/%s' % (prefix, wildcard_suffix)
    return prefix

  @property
  def bucket_url_string(self):
    """The bucket portion rendered as '<scheme>://<bucket>/'."""
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    """The full URL, with '#<generation>' appended when one is present."""
    url_str = self.versionless_url_string
    if self.HasGeneration():
      url_str += '#%s' % self.generation
    return url_str

  @property
  def versionless_url_string(self):
    """The URL without any generation / version suffix."""
    if self.IsProvider():
      return '%s://' % self.scheme
    elif self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string
| 266 | |
| 267 | |
def _GetSchemeFromUrlString(url_str):
  """Returns scheme component of a URL string.

  Args:
    url_str: URL string, with or without a '<scheme>://' prefix.

  Returns:
    The lower-cased text before the first '://', or 'file' when the string
    contains no '://'.
  """
  scheme, separator, _ = url_str.partition('://')
  if not separator:
    # File is the default scheme.
    return 'file'
  return scheme.lower()
| 277 | |
| 278 | |
def _GetPathFromUrlString(url_str):
  """Returns path component of a URL string.

  Args:
    url_str: URL string, with or without a '<scheme>://' prefix.

  Returns:
    Everything after the first '://', or the whole string when it contains
    no '://'.
  """
  _, separator, path = url_str.partition('://')
  if not separator:
    return url_str
  return path
| 287 | |
| 288 | |
def IsFileUrlString(url_str):
  """Returns whether a string is a file URL.

  Args:
    url_str: URL string to classify.

  Returns:
    True if the scheme reported by _GetSchemeFromUrlString is 'file'.
  """
  return _GetSchemeFromUrlString(url_str) == 'file'
| 293 | |
| 294 | |
def StorageUrlFromString(url_str):
  """Static factory function for creating a StorageUrl from a string.

  Args:
    url_str: URL string; the scheme must resolve to 'file', 's3' or 'gs'.

  Returns:
    A _FileUrl for file URLs (marked as a stream when the path is exactly
    '-'), otherwise a _CloudUrl.

  Raises:
    InvalidUrlError: If the scheme is not one of 'file', 's3' or 'gs'.
  """
  scheme = _GetSchemeFromUrlString(url_str)

  if scheme not in ('file', 's3', 'gs'):
    raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
  if scheme == 'file':
    path = _GetPathFromUrlString(url_str)
    # A path of '-' marks the URL as a stream.
    is_stream = (path == '-')
    return _FileUrl(url_str, is_stream=is_stream)
  return _CloudUrl(url_str)
| 307 | |
| 308 | |
def StripOneSlash(url_str):
  """Removes at most one trailing slash from a URL string.

  Args:
    url_str: URL string; may be empty or None.

  Returns:
    url_str without its final '/' if it ended with one; otherwise url_str
    unchanged (including when it is empty or None).
  """
  if url_str and url_str.endswith('/'):
    return url_str[:-1]
  return url_str
| 313 | |
| 314 | |
def ContainsWildcard(url_string):
  """Checks whether url_string contains a wildcard.

  Args:
    url_string: URL string to check.

  Returns:
    True if WILDCARD_REGEX finds a match anywhere in url_string, else False.
  """
  return bool(WILDCARD_REGEX.search(url_string))
| OLD | NEW |