OLD | NEW |
| (Empty) |
# -*- coding: utf-8 -*-
# Copyright 2013 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""File and Cloud URL representation classes."""
16 | |
17 from __future__ import absolute_import | |
18 | |
19 import os | |
20 import re | |
21 | |
22 from gslib.exception import InvalidUrlError | |
23 | |
# Matches provider strings of the form 'gs://'.
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket' (with an optional single
# trailing slash).
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'.
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')
# Matches versioned object strings of the form 'gs://bucket/obj#1234'. It is
# applied to the object portion only; the generation must be all digits.
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Matches versioned object strings of the form 's3://bucket/obj#NULL'. It is
# applied to the object portion only; the version id may be any non-empty text.
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')
# Matches file strings of the form 'file://dir/filename'.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')
# Regex to disallow buckets violating charset or not [3..255] chars total.
BUCKET_NAME_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9\._-]{1,253}[a-zA-Z0-9]$')
# Regex to disallow buckets with individual DNS labels longer than 63.
# The character class includes upper-case letters because BUCKET_NAME_RE
# accepts them; without A-Z an over-long label containing upper-case
# characters would slip through the length check.
TOO_LONG_DNS_NAME_COMP = re.compile(r'[-_a-zA-Z0-9]{64}')
# Regex to determine if a string contains any wildcards.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
43 | |
44 | |
class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs.

  Subclasses must override every method and property that raises
  NotImplementedError here. Equality and hashing are both derived from
  url_string, so two URL objects with the same url_string compare equal and
  hash identically.
  """

  def Clone(self):
    """Returns a copy of this URL; must be overridden by subclasses."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns whether this is a file URL; must be overridden."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns whether this is a cloud URL; must be overridden."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns whether this URL represents a stream; must be overridden."""
    raise NotImplementedError('IsStream not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be appended to the
                       prefix with a trailing slash before being returned.

    Returns:
      A prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following contents:
      gs://bucket// <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the following
    URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
           'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
           as a BucketListingObject, so we will not recurse on it as a subdir
           during listing.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """Full string form of this URL; must be overridden."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """String form of this URL without version info; must be overridden."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    """Compares by url_string; defers to the other operand for foreign types.

    Returning NotImplemented (instead of touching other.url_string) keeps
    comparisons against None, strings, etc. from raising AttributeError.
    """
    if not isinstance(other, StorageUrl):
      return NotImplemented
    return self.url_string == other.url_string

  def __ne__(self, other):
    """Inverse of __eq__; Python 2 does not derive != from == automatically."""
    equal = self.__eq__(other)
    if equal is NotImplemented:
      return equal
    return not equal

  def __hash__(self):
    """Hashes url_string so that equal URLs share a hash."""
    return hash(self.url_string)
112 | |
113 | |
class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

  This class assists with usage and manipulation of an
  (optionally wildcarded) file URL string. Depending on the string
  contents, this class represents one or more directories or files.

  For File URLs, scheme is always file, bucket_name is always blank,
  and object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False):
    """Parses a file URL string.

    Args:
      url_string: File path, optionally prefixed with a '<scheme>://' marker.
      is_stream: Whether this URL represents a stream rather than a file on
                 disk.
    """
    self.scheme = 'file'
    self.bucket_name = ''
    match = FILE_OBJECT_REGEX.match(url_string)
    if match:
      # Drop the '<scheme>://' prefix; only the path portion is kept.
      self.object_name = match.group('filepath')
    else:
      # No scheme marker present: the whole string is the path.
      self.object_name = url_string
    self.generation = None
    self.is_stream = is_stream
    self.delim = os.sep

  def Clone(self):
    """Returns a new _FileUrl for the same path, preserving is_stream."""
    # is_stream is not encoded in url_string, so it must be passed explicitly
    # or the clone would silently stop being a stream.
    return _FileUrl(self.url_string, is_stream=self.is_stream)

  def IsFileUrl(self):
    """Always True for file URLs."""
    return True

  def IsCloudUrl(self):
    """Always False for file URLs."""
    return False

  def IsStream(self):
    """Returns the is_stream flag supplied at construction."""
    return self.is_stream

  def IsDirectory(self):
    """Returns True if this is not a stream and the path is a directory."""
    return not self.IsStream() and os.path.isdir(self.object_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns url_string unchanged; wildcard_suffix is not used here."""
    return self.url_string

  @property
  def url_string(self):
    """The URL in 'file://<path>' form."""
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    """Same as url_string; this class never adds version info."""
    return self.url_string

  def __str__(self):
    return self.url_string
165 | |
166 | |
class _CloudUrl(StorageUrl):
  """Parsed representation of an (optionally wildcarded) cloud URL string.

  Depending on the string contents, an instance stands for a provider
  ('gs://'), one or more buckets ('gs://bucket'), or one or more objects
  ('gs://bucket/obj').

  This class operates only on strings. No cloud storage API calls are
  made from this class.
  """

  def __init__(self, url_string):
    """Parses url_string into scheme, bucket, object and generation parts.

    Args:
      url_string: Cloud URL string to parse.

    Raises:
      InvalidUrlError: If a non-wildcard bucket-only URL has an invalid bucket
                       name, or if the string matches none of the URL regexes.
    """
    self.scheme = None
    self.bucket_name = None
    self.object_name = None
    self.generation = None
    self.delim = '/'

    # Provider-only URL, e.g. 'gs://'.
    parsed = PROVIDER_REGEX.match(url_string)
    if parsed:
      self.scheme = parsed.group('provider')
      return

    # Bucket URL, e.g. 'gs://bucket' or 'gs://bucket/'.
    parsed = BUCKET_REGEX.match(url_string)
    if parsed:
      self.scheme = parsed.group('provider')
      self.bucket_name = parsed.group('bucket')
      # Wildcarded bucket names are exempt from name validation.
      if not ContainsWildcard(self.bucket_name):
        charset_ok = BUCKET_NAME_RE.match(self.bucket_name)
        if not charset_ok or TOO_LONG_DNS_NAME_COMP.search(self.bucket_name):
          raise InvalidUrlError('Invalid bucket name in URL "%s"' % url_string)
      return

    # Object URL, e.g. 'gs://bucket/obj' (possibly with a '#version' suffix).
    parsed = OBJECT_REGEX.match(url_string)
    if not parsed:
      raise InvalidUrlError(
          'CloudUrl: URL string %s did not match URL regex' % url_string)
    self.scheme = parsed.group('provider')
    self.bucket_name = parsed.group('bucket')
    self.object_name = parsed.group('object')

    # Pick the provider-specific version syntax; other schemes have none.
    if self.scheme == 'gs':
      version_regex, version_group = GS_GENERATION_REGEX, 'generation'
    elif self.scheme == 's3':
      version_regex, version_group = S3_VERSION_REGEX, 'version_id'
    else:
      return
    versioned = version_regex.match(self.object_name)
    if versioned:
      self.object_name = versioned.group('object')
      self.generation = versioned.group(version_group)

  def Clone(self):
    """Returns a new _CloudUrl built by re-parsing this URL's string form."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    """Always False for cloud URLs."""
    return False

  def IsCloudUrl(self):
    """Always True for cloud URLs."""
    return True

  def IsStream(self):
    """Not supported for cloud URLs; always raises NotImplementedError."""
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsBucket(self):
    """Returns True if a bucket name is set and no object name is set."""
    return bool(self.bucket_name) and not self.object_name

  def IsObject(self):
    """Returns True if both a bucket name and an object name are set."""
    return bool(self.bucket_name) and bool(self.object_name)

  def HasGeneration(self):
    """Returns True if a generation/version id was parsed from the URL."""
    return bool(self.generation)

  def IsProvider(self):
    """Returns True if a scheme is set and no bucket name is set."""
    return bool(self.scheme) and not self.bucket_name

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL minus one trailing slash, plus a suffix.

    Args:
      wildcard_suffix: If supplied (and non-empty), appended to the prefix
                       after a '/' separator.

    Returns:
      The prefix string.
    """
    stripped = StripOneSlash(self.versionless_url_string)
    if not wildcard_suffix:
      return stripped
    return '%s/%s' % (stripped, wildcard_suffix)

  @property
  def bucket_url_string(self):
    """The '<scheme>://<bucket>/' form of this URL."""
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    """The full URL, with '#<generation>' appended when one is present."""
    base = self.versionless_url_string
    if not self.HasGeneration():
      return base
    return base + '#%s' % self.generation

  @property
  def versionless_url_string(self):
    """The URL without any generation/version suffix."""
    if self.IsProvider():
      return '%s://' % self.scheme
    if self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string
266 | |
267 | |
268 def _GetSchemeFromUrlString(url_str): | |
269 """Returns scheme component of a URL string.""" | |
270 | |
271 end_scheme_idx = url_str.find('://') | |
272 if end_scheme_idx == -1: | |
273 # File is the default scheme. | |
274 return 'file' | |
275 else: | |
276 return url_str[0:end_scheme_idx].lower() | |
277 | |
278 | |
279 def _GetPathFromUrlString(url_str): | |
280 """Returns path component of a URL string.""" | |
281 | |
282 end_scheme_idx = url_str.find('://') | |
283 if end_scheme_idx == -1: | |
284 return url_str | |
285 else: | |
286 return url_str[end_scheme_idx + 3:] | |
287 | |
288 | |
def IsFileUrlString(url_str):
  """Returns True if url_str has the 'file' scheme (explicitly or by default).

  Args:
    url_str: URL string to inspect.

  Returns:
    bool indicator.
  """
  scheme = _GetSchemeFromUrlString(url_str)
  return scheme == 'file'
293 | |
294 | |
def StorageUrlFromString(url_str):
  """Static factory function for creating a StorageUrl from a string.

  Args:
    url_str: URL string; strings without a '://' are treated as file paths.

  Returns:
    A _FileUrl for the 'file' scheme (flagged as a stream when the path is
    '-'), otherwise a _CloudUrl.

  Raises:
    InvalidUrlError: If the scheme is not one of 'file', 's3' or 'gs'.
  """
  scheme = _GetSchemeFromUrlString(url_str)

  if scheme == 'file':
    reads_from_stream = _GetPathFromUrlString(url_str) == '-'
    return _FileUrl(url_str, is_stream=reads_from_stream)
  if scheme in ('s3', 'gs'):
    return _CloudUrl(url_str)
  raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
307 | |
308 | |
def StripOneSlash(url_str):
  """Returns url_str with at most one trailing '/' removed.

  Args:
    url_str: String to strip; empty or None values are returned unchanged.

  Returns:
    url_str without its final character if that character is '/', otherwise
    url_str as given.
  """
  ends_with_slash = bool(url_str) and url_str.endswith('/')
  return url_str[:-1] if ends_with_slash else url_str
313 | |
314 | |
def ContainsWildcard(url_string):
  """Reports whether url_string has any character matched by WILDCARD_REGEX.

  Args:
    url_string: URL string to check.

  Returns:
    True if at least one wildcard character is found, False otherwise.
  """
  first_wildcard = WILDCARD_REGEX.search(url_string)
  return first_wildcard is not None
OLD | NEW |