| Index: third_party/gsutil/gslib/storage_url.py
|
| diff --git a/third_party/gsutil/gslib/storage_url.py b/third_party/gsutil/gslib/storage_url.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..657883cd71cadacde9c713bc628cfc217a89991a
|
| --- /dev/null
|
| +++ b/third_party/gsutil/gslib/storage_url.py
|
| @@ -0,0 +1,324 @@
|
| +# -*- coding: utf-8 -*-
|
| +# Copyright 2013 Google Inc. All Rights Reserved.
|
| +#
|
| +# Licensed under the Apache License, Version 2.0 (the "License");
|
| +# you may not use this file except in compliance with the License.
|
| +# You may obtain a copy of the License at
|
| +#
|
| +# http://www.apache.org/licenses/LICENSE-2.0
|
| +#
|
| +# Unless required by applicable law or agreed to in writing, software
|
| +# distributed under the License is distributed on an "AS IS" BASIS,
|
| +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| +# See the License for the specific language governing permissions and
|
| +# limitations under the License.
|
| +"""File and Cloud URL representation classes."""
|
| +
|
| +from __future__ import absolute_import
|
| +
|
| +import os
|
| +import re
|
| +
|
| +from gslib.exception import InvalidUrlError
|
| +
|
# Matches provider-only strings of the form 'gs://' (a scheme followed by
# '://' and no bucket).
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Matches bucket strings of the form 'gs://bucket', optionally followed by a
# single trailing slash.
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Matches object strings of the form 'gs://bucket/obj'. Everything after the
# first slash that follows the bucket is captured as the object name.
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')
# Matches versioned object names of the form 'obj#1234', where the generation
# is the run of decimal digits after the last '#'. Applied to the object name
# portion of a 'gs://bucket/obj#1234' URL.
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Matches versioned object names of the form 'obj#NULL', where the version id
# is whatever follows the last '#'. Applied to the object name portion of an
# 's3://bucket/obj#NULL' URL.
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')
# Matches file strings of the form 'file://dir/filename'. Group 1 is the
# '<scheme>://' prefix and 'filepath' is everything after it.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')
# Regex to reject bucket names that use characters outside the allowed set,
# do not start and end with an alphanumeric, or are not 3 to 255 characters
# long in total.
BUCKET_NAME_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9\._-]{1,253}[a-zA-Z0-9]$')
# Regex to reject bucket names containing an individual DNS label (a run of
# characters with no '.') longer than 63 characters.
# NOTE(review): only lowercase letters are in this character class, while
# BUCKET_NAME_RE also accepts uppercase, so an over-long uppercase label is
# not detected here -- confirm whether that is intended.
TOO_LONG_DNS_NAME_COMP = re.compile(r'[-_a-z0-9]{64}')
# Regex to determine if a string contains any wildcard character: '*', '?',
# '[' or ']'.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
|
| +
|
| +
|
class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs.

  Subclasses must override every method and property below that raises
  NotImplementedError. Equality and hashing are both derived from
  url_string, so two URLs that render to the same string compare equal and
  hash alike.
  """

  def Clone(self):
    """Returns a new URL object built from this one (subclass hook)."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns True if this is a file URL (subclass hook)."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns True if this is a cloud URL (subclass hook)."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns True if this URL denotes a stream (subclass hook)."""
    raise NotImplementedError('IsStream not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be appended to
                       the prefix with a trailing slash before being
                       returned.

    Returns:
      A prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following
    contents:
      gs://bucket//             <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the
    following URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
         'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
         as a BucketListingObject, so we will not recurse on it as a subdir
         during listing.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """Full string form of this URL (subclass hook)."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """String form of this URL without version info (subclass hook)."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    """Compares by url_string; objects that are not StorageUrls never match."""
    # Returning NotImplemented (instead of touching other.url_string) keeps
    # comparisons against None, strings, etc. from raising AttributeError.
    if not isinstance(other, StorageUrl):
      return NotImplemented
    return self.url_string == other.url_string

  def __ne__(self, other):
    """Inverse of __eq__; needed explicitly because Python 2 does not derive
    != from ==."""
    equal = self.__eq__(other)
    if equal is NotImplemented:
      return equal
    return not equal

  def __hash__(self):
    """Hashes by url_string so that equal URLs hash alike."""
    return hash(self.url_string)
|
| +
|
| +
|
class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

  This class assists with usage and manipulation of an
  (optionally wildcarded) file URL string.  Depending on the string
  contents, this class represents one or more directories or files.

  For File URLs, scheme is always file, bucket_name is always blank,
  and object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False):
    """Parses url_string into a file URL.

    Args:
      url_string: File path, optionally preceded by a '<scheme>://' prefix
                  (for example 'file://dir/filename'). The prefix, if
                  present, is stripped to produce object_name.
      is_stream: True if this URL denotes a stream rather than a path on
                 disk.
    """
    self.scheme = 'file'
    self.bucket_name = ''
    match = FILE_OBJECT_REGEX.match(url_string)
    if match and match.lastindex == 2:
      self.object_name = match.group('filepath')
    else:
      # No '<scheme>://' prefix: the whole string is the path.
      self.object_name = url_string
    self.generation = None
    self.is_stream = is_stream
    self.delim = os.sep

  def Clone(self):
    """Returns a new _FileUrl equivalent to this one."""
    # is_stream is not encoded in url_string, so it has to be passed along
    # explicitly or the clone of a stream URL would stop being a stream.
    return _FileUrl(self.url_string, is_stream=self.is_stream)

  def IsFileUrl(self):
    """Returns True; this is always a file URL."""
    return True

  def IsCloudUrl(self):
    """Returns False; a file URL is never a cloud URL."""
    return False

  def IsStream(self):
    """Returns whether this URL was constructed as a stream."""
    return self.is_stream

  def IsDirectory(self):
    """Returns True if this is not a stream and object_name is an existing
    directory on the local file system."""
    return not self.IsStream() and os.path.isdir(self.object_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns url_string unchanged; wildcard_suffix is ignored for files."""
    return self.url_string

  @property
  def url_string(self):
    """'file://' followed by object_name."""
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    """Same as url_string."""
    return self.url_string

  def __str__(self):
    return self.url_string
|
| +
|
| +
|
class _CloudUrl(StorageUrl):
  """Cloud URL class providing parsing and convenience methods.

  This class assists with usage and manipulation of an
  (optionally wildcarded) cloud URL string.  Depending on the string
  contents, this class represents a provider, bucket(s), or object(s).

  This class operates only on strings.  No cloud storage API calls are
  made from this class.
  """

  def __init__(self, url_string):
    """Parses url_string into scheme, bucket, object and generation.

    Args:
      url_string: Cloud URL of the form '<scheme>://',
                  '<scheme>://bucket[/]' or '<scheme>://bucket/object',
                  where a 'gs' object may end in '#<digits>' and an 's3'
                  object may end in '#<version id>'.

    Raises:
      InvalidUrlError: If a bucket-only URL has a non-wildcarded bucket name
                       that fails validation, or if url_string matches none
                       of the provider, bucket or object patterns.
    """
    self.scheme = None
    self.bucket_name = None
    self.object_name = None
    self.generation = None
    self.delim = '/'
    provider_match = PROVIDER_REGEX.match(url_string)
    bucket_match = BUCKET_REGEX.match(url_string)
    # The scheme is normalized to lowercase so that it agrees with
    # _GetSchemeFromUrlString (which lowercases before validating). Without
    # this, 'GS://bucket/obj#5' is accepted but never matches the 'gs'/'s3'
    # comparisons below, and its generation is silently left unparsed.
    if provider_match:
      self.scheme = provider_match.group('provider').lower()
    elif bucket_match:
      self.scheme = bucket_match.group('provider').lower()
      self.bucket_name = bucket_match.group('bucket')
      # Wildcarded bucket names are exempt from validation.
      if (not ContainsWildcard(self.bucket_name) and
          (not BUCKET_NAME_RE.match(self.bucket_name) or
           TOO_LONG_DNS_NAME_COMP.search(self.bucket_name))):
        raise InvalidUrlError('Invalid bucket name in URL "%s"' % url_string)
    else:
      object_match = OBJECT_REGEX.match(url_string)
      if not object_match:
        raise InvalidUrlError(
            'CloudUrl: URL string %s did not match URL regex' % url_string)
      self.scheme = object_match.group('provider').lower()
      self.bucket_name = object_match.group('bucket')
      self.object_name = object_match.group('object')
      # Split a trailing '#<version>' off the object name; the accepted
      # version syntax depends on the provider.
      if self.scheme == 'gs':
        generation_match = GS_GENERATION_REGEX.match(self.object_name)
        if generation_match:
          self.object_name = generation_match.group('object')
          self.generation = generation_match.group('generation')
      elif self.scheme == 's3':
        version_match = S3_VERSION_REGEX.match(self.object_name)
        if version_match:
          self.object_name = version_match.group('object')
          self.generation = version_match.group('version_id')

  def Clone(self):
    """Returns a new _CloudUrl parsed from this URL's url_string."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    """Returns False; a cloud URL is never a file URL."""
    return False

  def IsCloudUrl(self):
    """Returns True; this is always a cloud URL."""
    return True

  def IsStream(self):
    """Not supported for cloud URLs; always raises NotImplementedError."""
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsBucket(self):
    """Returns True if a bucket name is set and no object name is."""
    return bool(self.bucket_name and not self.object_name)

  def IsObject(self):
    """Returns True if both a bucket name and an object name are set."""
    return bool(self.bucket_name and self.object_name)

  def HasGeneration(self):
    """Returns True if a generation / version id was parsed."""
    return bool(self.generation)

  def IsProvider(self):
    """Returns True if only the scheme is set (no bucket name)."""
    return bool(self.scheme and not self.bucket_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL with one trailing slash removed.

    Args:
      wildcard_suffix: If supplied, '/' plus this suffix is appended to the
                       stripped prefix.

    Returns:
      The prefix string, suitable for iterating.
    """
    # Only a single slash is stripped so that object names which themselves
    # end in '/' keep their remaining slashes.
    prefix = StripOneSlash(self.versionless_url_string)
    if wildcard_suffix:
      prefix = '%s/%s' % (prefix, wildcard_suffix)
    return prefix

  @property
  def bucket_url_string(self):
    """'<scheme>://<bucket>/' for this URL's bucket."""
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    """Versionless URL string, plus '#<generation>' when one is set."""
    url_str = self.versionless_url_string
    if self.HasGeneration():
      url_str += '#%s' % self.generation
    return url_str

  @property
  def versionless_url_string(self):
    """URL string for the provider, bucket or object, without version."""
    if self.IsProvider():
      return '%s://' % self.scheme
    elif self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string
|
| +
|
| +
|
def _GetSchemeFromUrlString(url_str):
  """Extracts the lowercased scheme from a URL string.

  Args:
    url_str: URL string, with or without a leading '<scheme>://'.

  Returns:
    The text before the first '://' converted to lowercase, or 'file' when
    url_str contains no '://'.
  """
  scheme, separator, _ = url_str.partition('://')
  # A string with no scheme separator defaults to the file scheme.
  return scheme.lower() if separator else 'file'
|
| +
|
| +
|
def _GetPathFromUrlString(url_str):
  """Extracts the path portion of a URL string.

  Args:
    url_str: URL string, with or without a leading '<scheme>://'.

  Returns:
    Everything after the first '://', or url_str unchanged when it contains
    no '://'.
  """
  _, separator, path = url_str.partition('://')
  if separator:
    return path
  return url_str
|
| +
|
| +
|
def IsFileUrlString(url_str):
  """Reports whether url_str refers to a file.

  Args:
    url_str: URL string to classify.

  Returns:
    True if the scheme of url_str is 'file', which includes strings with no
    '<scheme>://' prefix at all; False otherwise.
  """
  scheme = _GetSchemeFromUrlString(url_str)
  return scheme == 'file'
|
| +
|
| +
|
def StorageUrlFromString(url_str):
  """Builds the StorageUrl subclass instance matching url_str's scheme.

  Args:
    url_str: URL string; a string with no '<scheme>://' prefix is treated
             as a file path.

  Returns:
    A _FileUrl for the 'file' scheme, otherwise a _CloudUrl for 's3' or
    'gs'.

  Raises:
    InvalidUrlError: If the scheme is not one of 'file', 's3' or 'gs'.
  """
  scheme = _GetSchemeFromUrlString(url_str)

  if scheme == 'file':
    # A path consisting solely of '-' marks the URL as a stream.
    reads_stream = (_GetPathFromUrlString(url_str) == '-')
    return _FileUrl(url_str, is_stream=reads_stream)
  if scheme in ('s3', 'gs'):
    return _CloudUrl(url_str)
  raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
|
| +
|
| +
|
def StripOneSlash(url_str):
  """Removes at most one trailing slash from url_str.

  Args:
    url_str: String to strip; may be empty or None.

  Returns:
    url_str without its final character if that character is '/',
    otherwise url_str unchanged (including when it is empty or None).
  """
  if not url_str or not url_str.endswith('/'):
    return url_str
  return url_str[:-1]
|
| +
|
| +
|
def ContainsWildcard(url_string):
  """Reports whether url_string has at least one wildcard character.

  Args:
    url_string: URL string to check.

  Returns:
    True if WILDCARD_REGEX matches anywhere in url_string, False otherwise.
  """
  return WILDCARD_REGEX.search(url_string) is not None
|
|
|