third_party/gsutil/gslib/storage_url.py - Issue 1377933002: [catapult] - Copy Telemetry's gsutilz over to third_party.

Side by Side Diff: third_party/gsutil/gslib/storage_url.py

Issue 1377933002: [catapult] - Copy Telemetry's gsutilz over to third_party. (Closed) Base URL: https://github.com/catapult-project/catapult.git@master

Patch Set: Rename to gsutil. Created 5 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 # -*- coding: utf-8 -*-

	2 # Copyright 2013 Google Inc. All Rights Reserved.

	3 #

	4 # Licensed under the Apache License, Version 2.0 (the "License");

	5 # you may not use this file except in compliance with the License.

	6 # You may obtain a copy of the License at

	7 #

	8 # http://www.apache.org/licenses/LICENSE-2.0

	9 #

	10 # Unless required by applicable law or agreed to in writing, software

	11 # distributed under the License is distributed on an "AS IS" BASIS,

	12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

	13 # See the License for the specific language governing permissions and

	14 # limitations under the License.

	15 """File and Cloud URL representation classes."""

	16

	17 from __future__ import absolute_import

	18

	19 import os

	20 import re

	21

	22 from gslib.exception import InvalidUrlError

	23

	24 # Matches provider strings of the form 'gs://'

	25 PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')

	26 # Matches bucket strings of the form 'gs://bucket'

	27 BUCKET_REGEX = re.compile(r'(?P<provider>[^:])://(?P<bucket>[^/])/{0,1}$')

	28 # Matches object strings of the form 'gs://bucket/obj'

	29 OBJECT_REGEX = re.compile(

	30 r'(?P<provider>[^:])://(?P<bucket>[^/])/(?P<object>.*)')

	31 # Matches versioned object strings of the form 'gs://bucket/obj#1234'

	32 GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')

	33 # Matches versioned object strings of the form 's3://bucket/obj#NULL'

	34 S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')

	35 # Matches file strings of the form 'file://dir/filename'

	36 FILE_OBJECT_REGEX = re.compile(r'([^:]://)(?P<filepath>.)')

	37 # Regex to disallow buckets violating charset or not [3..255] chars total.

	38 BUCKET_NAME_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9\._-]{1,253}[a-zA-Z0-9]$')

	39 # Regex to disallow buckets with individual DNS labels longer than 63.

	40 TOO_LONG_DNS_NAME_COMP = re.compile(r'[-_a-z0-9]{64}')

	41 # Regex to determine if a string contains any wildcards.

	42 WILDCARD_REGEX = re.compile(r'[*?\[\]]')

	43

	44

	45 class StorageUrl(object):

	46 """Abstract base class for file and Cloud Storage URLs."""

	47

	48 def Clone(self):

	49 raise NotImplementedError('Clone not overridden')

	50

	51 def IsFileUrl(self):

	52 raise NotImplementedError('IsFileUrl not overridden')

	53

	54 def IsCloudUrl(self):

	55 raise NotImplementedError('IsCloudUrl not overridden')

	56

	57 def IsStream(self):

	58 raise NotImplementedError('IsStream not overridden')

	59

	60 def CreatePrefixUrl(self, wildcard_suffix=None):

	61 """Returns a prefix of this URL that can be used for iterating.

	62

	63 Args:

	64 wildcard_suffix: If supplied, this wildcard suffix will be appended to the

	65 prefix with a trailing slash before being returned.

	66

	67 Returns:

	68 A prefix of this URL that can be used for iterating.

	69

	70 If this URL contains a trailing slash, it will be stripped to create the

	71 prefix. This helps avoid infinite looping when prefixes are iterated, but

	72 preserves other slashes so that objects with '/' in the name are handled

	73 properly.

	74

	75 For example, when recursively listing a bucket with the following contents:

	76 gs://bucket// <-- object named slash

	77 gs://bucket//one-dir-deep

	78 a top-level expansion with '/' as a delimiter will result in the following

	79 URL strings:

	80 'gs://bucket//' : OBJECT

	81 'gs://bucket//' : PREFIX

	82 If we right-strip all slashes from the prefix entry and add a wildcard

	83 suffix, we will get 'gs://bucket/*' which will produce identical results

	84 (and infinitely recurse).

	85

	86 Example return values:

	87 ('gs://bucket/subdir/', '') becomes 'gs://bucket/subdir/'

	88 ('gs://bucket/', '') becomes 'gs://bucket/'

	89 ('gs://bucket/', None) becomes 'gs://bucket'

	90 ('gs://bucket/subdir//', '') becomes 'gs://bucket/subdir//'

	91 ('gs://bucket/subdir///', '') becomes 'gs://bucket/subdir///'

	92 ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes

	93 'gs://bucket/subdir/*', but iterating on this will return 'subdir/'

	94 as a BucketListingObject, so we will not recurse on it as a subdir

	95 during listing.

	96 """

	97 raise NotImplementedError('CreatePrefixUrl not overridden')

	98

	99 @property

	100 def url_string(self):

	101 raise NotImplementedError('url_string not overridden')

	102

	103 @property

	104 def versionless_url_string(self):

	105 raise NotImplementedError('versionless_url_string not overridden')

	106

	107 def __eq__(self, other):

	108 return self.url_string == other.url_string

	109

	110 def __hash__(self):

	111 return hash(self.url_string)

	112

	113

	114 class _FileUrl(StorageUrl):

	115 """File URL class providing parsing and convenience methods.

	116

	117 This class assists with usage and manipulation of an

	118 (optionally wildcarded) file URL string. Depending on the string

	119 contents, this class represents one or more directories or files.

	120

	121 For File URLs, scheme is always file, bucket_name is always blank,

	122 and object_name contains the file/directory path.

	123 """

	124

	125 def __init__(self, url_string, is_stream=False):

	126 self.scheme = 'file'

	127 self.bucket_name = ''

	128 match = FILE_OBJECT_REGEX.match(url_string)

	129 if match and match.lastindex == 2:

	130 self.object_name = match.group(2)

	131 else:

	132 self.object_name = url_string

	133 self.generation = None

	134 self.is_stream = is_stream

	135 self.delim = os.sep

	136

	137 def Clone(self):

	138 return _FileUrl(self.url_string)

	139

	140 def IsFileUrl(self):

	141 return True

	142

	143 def IsCloudUrl(self):

	144 return False

	145

	146 def IsStream(self):

	147 return self.is_stream

	148

	149 def IsDirectory(self):

	150 return not self.IsStream() and os.path.isdir(self.object_name)

	151

	152 def CreatePrefixUrl(self, wildcard_suffix=None):

	153 return self.url_string

	154

	155 @property

	156 def url_string(self):

	157 return '%s://%s' % (self.scheme, self.object_name)

	158

	159 @property

	160 def versionless_url_string(self):

	161 return self.url_string

	162

	163 def __str__(self):

	164 return self.url_string

	165

	166

	167 class _CloudUrl(StorageUrl):

	168 """Cloud URL class providing parsing and convenience methods.

	169

	170 This class assists with usage and manipulation of an

	171 (optionally wildcarded) cloud URL string. Depending on the string

	172 contents, this class represents a provider, bucket(s), or object(s).

	173

	174 This class operates only on strings. No cloud storage API calls are

	175 made from this class.

	176 """

	177

	178 def __init__(self, url_string):

	179 self.scheme = None

	180 self.bucket_name = None

	181 self.object_name = None

	182 self.generation = None

	183 self.delim = '/'

	184 provider_match = PROVIDER_REGEX.match(url_string)

	185 bucket_match = BUCKET_REGEX.match(url_string)

	186 if provider_match:

	187 self.scheme = provider_match.group('provider')

	188 elif bucket_match:

	189 self.scheme = bucket_match.group('provider')

	190 self.bucket_name = bucket_match.group('bucket')

	191 if (not ContainsWildcard(self.bucket_name) and

	192 (not BUCKET_NAME_RE.match(self.bucket_name) or

	193 TOO_LONG_DNS_NAME_COMP.search(self.bucket_name))):

	194 raise InvalidUrlError('Invalid bucket name in URL "%s"' % url_string)

	195 else:

	196 object_match = OBJECT_REGEX.match(url_string)

	197 if object_match:

	198 self.scheme = object_match.group('provider')

	199 self.bucket_name = object_match.group('bucket')

	200 self.object_name = object_match.group('object')

	201 if self.scheme == 'gs':

	202 generation_match = GS_GENERATION_REGEX.match(self.object_name)

	203 if generation_match:

	204 self.object_name = generation_match.group('object')

	205 self.generation = generation_match.group('generation')

	206 elif self.scheme == 's3':

	207 version_match = S3_VERSION_REGEX.match(self.object_name)

	208 if version_match:

	209 self.object_name = version_match.group('object')

	210 self.generation = version_match.group('version_id')

	211 else:

	212 raise InvalidUrlError(

	213 'CloudUrl: URL string %s did not match URL regex' % url_string)

	214

	215 def Clone(self):

	216 return _CloudUrl(self.url_string)

	217

	218 def IsFileUrl(self):

	219 return False

	220

	221 def IsCloudUrl(self):

	222 return True

	223

	224 def IsStream(self):

	225 raise NotImplementedError('IsStream not supported on CloudUrl')

	226

	227 def IsBucket(self):

	228 return bool(self.bucket_name and not self.object_name)

	229

	230 def IsObject(self):

	231 return bool(self.bucket_name and self.object_name)

	232

	233 def HasGeneration(self):

	234 return bool(self.generation)

	235

	236 def IsProvider(self):

	237 return bool(self.scheme and not self.bucket_name)

	238

	239 def CreatePrefixUrl(self, wildcard_suffix=None):

	240 prefix = StripOneSlash(self.versionless_url_string)

	241 if wildcard_suffix:

	242 prefix = '%s/%s' % (prefix, wildcard_suffix)

	243 return prefix

	244

	245 @property

	246 def bucket_url_string(self):

	247 return '%s://%s/' % (self.scheme, self.bucket_name)

	248

	249 @property

	250 def url_string(self):

	251 url_str = self.versionless_url_string

	252 if self.HasGeneration():

	253 url_str += '#%s' % self.generation

	254 return url_str

	255

	256 @property

	257 def versionless_url_string(self):

	258 if self.IsProvider():

	259 return '%s://' % self.scheme

	260 elif self.IsBucket():

	261 return self.bucket_url_string

	262 return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

	263

	264 def __str__(self):

	265 return self.url_string

	266

	267

	268 def _GetSchemeFromUrlString(url_str):

	269 """Returns scheme component of a URL string."""

	270

	271 end_scheme_idx = url_str.find('://')

	272 if end_scheme_idx == -1:

	273 # File is the default scheme.

	274 return 'file'

	275 else:

	276 return url_str[0:end_scheme_idx].lower()

	277

	278

	279 def _GetPathFromUrlString(url_str):

	280 """Returns path component of a URL string."""

	281

	282 end_scheme_idx = url_str.find('://')

	283 if end_scheme_idx == -1:

	284 return url_str

	285 else:

	286 return url_str[end_scheme_idx + 3:]

	287

	288

	289 def IsFileUrlString(url_str):

	290 """Returns whether a string is a file URL."""

	291

	292 return _GetSchemeFromUrlString(url_str) == 'file'

	293

	294

	295 def StorageUrlFromString(url_str):

	296 """Static factory function for creating a StorageUrl from a string."""

	297

	298 scheme = _GetSchemeFromUrlString(url_str)

	299

	300 if scheme not in ('file', 's3', 'gs'):

	301 raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)

	302 if scheme == 'file':

	303 path = _GetPathFromUrlString(url_str)

	304 is_stream = (path == '-')

	305 return _FileUrl(url_str, is_stream=is_stream)

	306 return _CloudUrl(url_str)

	307

	308

	309 def StripOneSlash(url_str):

	310 if url_str and url_str.endswith('/'):

	311 return url_str[:-1]

	312 return url_str

	313

	314

	315 def ContainsWildcard(url_string):

	316 """Checks whether url_string contains a wildcard.

	317

	318 Args:

	319 url_string: URL string to check.

	320

	321 Returns:

	322 bool indicator.

	323 """

	324 return bool(WILDCARD_REGEX.search(url_string))

OLD	NEW

« no previous file with comments | « third_party/gsutil/gslib/storage_uri_builder.py ('k') | third_party/gsutil/gslib/tab_complete.py » ('j') | no next file with comments »