Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(493)

Unified Diff: third_party/gsutil/gslib/storage_url.py

Issue 1377933002: [catapult] - Copy Telemetry's gsutilz over to third_party. (Closed) Base URL: https://github.com/catapult-project/catapult.git@master
Patch Set: Rename to gsutil. Created 5 years, 3 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/gsutil/gslib/storage_uri_builder.py ('k') | third_party/gsutil/gslib/tab_complete.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/gsutil/gslib/storage_url.py
diff --git a/third_party/gsutil/gslib/storage_url.py b/third_party/gsutil/gslib/storage_url.py
new file mode 100644
index 0000000000000000000000000000000000000000..657883cd71cadacde9c713bc628cfc217a89991a
--- /dev/null
+++ b/third_party/gsutil/gslib/storage_url.py
@@ -0,0 +1,324 @@
+# -*- coding: utf-8 -*-
+# Copyright 2013 Google Inc. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""File and Cloud URL representation classes."""
+
+from __future__ import absolute_import
+
+import os
+import re
+
+from gslib.exception import InvalidUrlError
+
# Cloud URL shapes, from least to most specific.
# Provider only, e.g. 'gs://'.
PROVIDER_REGEX = re.compile(r'(?P<provider>[^:]*)://$')
# Provider plus bucket with at most one trailing slash, e.g. 'gs://bucket'.
BUCKET_REGEX = re.compile(r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/{0,1}$')
# Provider, bucket and object, e.g. 'gs://bucket/obj'.
OBJECT_REGEX = re.compile(
    r'(?P<provider>[^:]*)://(?P<bucket>[^/]*)/(?P<object>.*)')

# Version suffixes that may trail an object name.
# Numeric generation, e.g. the '#1234' in 'gs://bucket/obj#1234'.
GS_GENERATION_REGEX = re.compile(r'(?P<object>.+)#(?P<generation>[0-9]+)$')
# Free-form version id, e.g. the '#NULL' in 's3://bucket/obj#NULL'.
S3_VERSION_REGEX = re.compile(r'(?P<object>.+)#(?P<version_id>.+)$')

# File string of the form 'file://dir/filename'; group 2 ('filepath') is the
# text after the scheme prefix.
FILE_OBJECT_REGEX = re.compile(r'([^:]*://)(?P<filepath>.*)')

# Bucket name checks.
# Accepts only names made of the allowed characters and 3 to 255 chars long.
BUCKET_NAME_RE = re.compile(r'^[a-zA-Z0-9][a-zA-Z0-9\._-]{1,253}[a-zA-Z0-9]$')
# Finds a run of 64 label characters, i.e. a DNS label longer than 63.
TOO_LONG_DNS_NAME_COMP = re.compile(r'[-_a-z0-9]{64}')

# Finds any wildcard character: '*', '?', '[' or ']'.
WILDCARD_REGEX = re.compile(r'[*?\[\]]')
+
+
class StorageUrl(object):
  """Abstract base class for file and Cloud Storage URLs.

  Every method and property except the comparison and hashing operators
  raises NotImplementedError here and is expected to be overridden.
  Equality, inequality and hashing are all derived from url_string.
  """

  def Clone(self):
    """Returns a copy of this URL; must be overridden."""
    raise NotImplementedError('Clone not overridden')

  def IsFileUrl(self):
    """Returns whether this is a file URL; must be overridden."""
    raise NotImplementedError('IsFileUrl not overridden')

  def IsCloudUrl(self):
    """Returns whether this is a cloud URL; must be overridden."""
    raise NotImplementedError('IsCloudUrl not overridden')

  def IsStream(self):
    """Returns whether this URL denotes a stream; must be overridden."""
    raise NotImplementedError('IsStream not overridden')

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns a prefix of this URL that can be used for iterating.

    If this URL contains a trailing slash, it will be stripped to create the
    prefix. This helps avoid infinite looping when prefixes are iterated, but
    preserves other slashes so that objects with '/' in the name are handled
    properly.

    For example, when recursively listing a bucket with the following contents:
      gs://bucket//             <-- object named slash
      gs://bucket//one-dir-deep
    a top-level expansion with '/' as a delimiter will result in the following
    URL strings:
      'gs://bucket//' : OBJECT
      'gs://bucket//' : PREFIX
    If we right-strip all slashes from the prefix entry and add a wildcard
    suffix, we will get 'gs://bucket/*' which will produce identical results
    (and infinitely recurse).

    Example return values:
      ('gs://bucket/subdir/', '*') becomes 'gs://bucket/subdir/*'
      ('gs://bucket/', '*') becomes 'gs://bucket/*'
      ('gs://bucket/', None) becomes 'gs://bucket'
      ('gs://bucket/subdir//', '*') becomes 'gs://bucket/subdir//*'
      ('gs://bucket/subdir///', '**') becomes 'gs://bucket/subdir///**'
      ('gs://bucket/subdir/', '*') where 'subdir/' is an object becomes
           'gs://bucket/subdir/*', but iterating on this will return 'subdir/'
           as a BucketListingObject, so we will not recurse on it as a subdir
           during listing.

    Args:
      wildcard_suffix: If supplied, this wildcard suffix will be appended to the
                       prefix with a trailing slash before being returned.

    Returns:
      A prefix of this URL that can be used for iterating.
    """
    raise NotImplementedError('CreatePrefixUrl not overridden')

  @property
  def url_string(self):
    """The full URL string; must be overridden."""
    raise NotImplementedError('url_string not overridden')

  @property
  def versionless_url_string(self):
    """The URL string without version information; must be overridden."""
    raise NotImplementedError('versionless_url_string not overridden')

  def __eq__(self, other):
    """Returns True when other is a StorageUrl with the same url_string.

    Comparison with any other type returns NotImplemented so that Python
    falls back to its default handling (unequal) instead of raising
    AttributeError on the missing url_string attribute.
    """
    if not isinstance(other, StorageUrl):
      return NotImplemented
    return self.url_string == other.url_string

  def __ne__(self, other):
    """Returns the negation of __eq__.

    Python 2 does not derive != from __eq__, so without this method two URLs
    with identical strings would compare both == and !=.
    """
    equal = self.__eq__(other)
    if equal is NotImplemented:
      return equal
    return not equal

  def __hash__(self):
    """Hashes on url_string so that equal URLs hash alike."""
    return hash(self.url_string)
+
+
class _FileUrl(StorageUrl):
  """File URL class providing parsing and convenience methods.

  This class assists with usage and manipulation of an
  (optionally wildcarded) file URL string.  Depending on the string
  contents, this class represents one or more directories or files.

  For File URLs, scheme is always file, bucket_name is always blank,
  and object_name contains the file/directory path.
  """

  def __init__(self, url_string, is_stream=False):
    """Parses a file URL string.

    Args:
      url_string: File path, with or without a '<scheme>://' prefix.
      is_stream: Whether this URL denotes a stream rather than a named file.
    """
    self.scheme = 'file'
    self.bucket_name = ''
    match = FILE_OBJECT_REGEX.match(url_string)
    # lastindex == 2 means the 'filepath' group took part in the match, so
    # the scheme prefix can be dropped; otherwise keep the string verbatim.
    if match and match.lastindex == 2:
      self.object_name = match.group(2)
    else:
      self.object_name = url_string
    self.generation = None
    self.is_stream = is_stream
    self.delim = os.sep

  def Clone(self):
    """Returns a new _FileUrl with the same path and stream flag."""
    # is_stream is not encoded in url_string, so it must be passed along
    # explicitly or the clone of a stream URL would stop being a stream.
    return _FileUrl(self.url_string, is_stream=self.is_stream)

  def IsFileUrl(self):
    """Always True for file URLs."""
    return True

  def IsCloudUrl(self):
    """Always False for file URLs."""
    return False

  def IsStream(self):
    """Returns the is_stream flag supplied at construction."""
    return self.is_stream

  def IsDirectory(self):
    """Returns True for a non-stream URL whose path is a directory."""
    return not self.IsStream() and os.path.isdir(self.object_name)

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns url_string unchanged; wildcard_suffix is not used."""
    return self.url_string

  @property
  def url_string(self):
    """The URL in 'file://<path>' form."""
    return '%s://%s' % (self.scheme, self.object_name)

  @property
  def versionless_url_string(self):
    """Same as url_string."""
    return self.url_string

  def __str__(self):
    return self.url_string
+
+
class _CloudUrl(StorageUrl):
  """Parsed form of an (optionally wildcarded) cloud URL string.

  Depending on the string contents, an instance stands for a provider,
  bucket(s), or object(s).

  Only string handling happens here; no cloud storage API calls are made
  from this class.
  """

  def __init__(self, url_string):
    """Splits url_string into scheme, bucket, object and version parts.

    Args:
      url_string: Cloud URL string such as 'gs://', 'gs://bucket' or
                  'gs://bucket/obj#1234'.

    Raises:
      InvalidUrlError: If a bucket-only URL without wildcards fails the
                       bucket name checks, or if the string matches none of
                       the cloud URL regexes.
    """
    self.scheme = None
    self.bucket_name = None
    self.object_name = None
    self.generation = None
    self.delim = '/'

    provider_only = PROVIDER_REGEX.match(url_string)
    if provider_only:
      self.scheme = provider_only.group('provider')
      return

    bucket_only = BUCKET_REGEX.match(url_string)
    if bucket_only:
      self.scheme = bucket_only.group('provider')
      self.bucket_name = bucket_only.group('bucket')
      # Bucket names containing wildcards are exempt from validation.
      if not ContainsWildcard(self.bucket_name):
        name_rejected = not BUCKET_NAME_RE.match(self.bucket_name)
        if (name_rejected or
            TOO_LONG_DNS_NAME_COMP.search(self.bucket_name)):
          raise InvalidUrlError(
              'Invalid bucket name in URL "%s"' % url_string)
      return

    with_object = OBJECT_REGEX.match(url_string)
    if not with_object:
      raise InvalidUrlError(
          'CloudUrl: URL string %s did not match URL regex' % url_string)
    self.scheme = with_object.group('provider')
    self.bucket_name = with_object.group('bucket')
    self.object_name = with_object.group('object')

    # Each provider has its own '#<version>' suffix syntax; any other scheme
    # keeps the object name untouched.
    if self.scheme == 'gs':
      version_regex, version_group = GS_GENERATION_REGEX, 'generation'
    elif self.scheme == 's3':
      version_regex, version_group = S3_VERSION_REGEX, 'version_id'
    else:
      return
    versioned = version_regex.match(self.object_name)
    if versioned:
      self.object_name = versioned.group('object')
      self.generation = versioned.group(version_group)

  def Clone(self):
    """Returns a new _CloudUrl parsed from this URL's url_string."""
    return _CloudUrl(self.url_string)

  def IsFileUrl(self):
    """Always False for cloud URLs."""
    return False

  def IsCloudUrl(self):
    """Always True for cloud URLs."""
    return True

  def IsStream(self):
    """Not supported for cloud URLs; always raises NotImplementedError."""
    raise NotImplementedError('IsStream not supported on CloudUrl')

  def IsBucket(self):
    """Returns True when a bucket name is set and no object name is."""
    return bool(self.bucket_name) and not self.object_name

  def IsObject(self):
    """Returns True when both bucket and object names are set."""
    return bool(self.bucket_name) and bool(self.object_name)

  def HasGeneration(self):
    """Returns True when a generation or version id was parsed."""
    return bool(self.generation)

  def IsProvider(self):
    """Returns True when a scheme is set and no bucket name is."""
    return bool(self.scheme) and not self.bucket_name

  def CreatePrefixUrl(self, wildcard_suffix=None):
    """Returns the versionless URL minus one trailing slash, plus a suffix.

    Args:
      wildcard_suffix: If truthy, appended to the prefix after a '/'.

    Returns:
      The prefix string.
    """
    base = StripOneSlash(self.versionless_url_string)
    if not wildcard_suffix:
      return base
    return '%s/%s' % (base, wildcard_suffix)

  @property
  def bucket_url_string(self):
    """The URL of the bucket, in '<scheme>://<bucket>/' form."""
    return '%s://%s/' % (self.scheme, self.bucket_name)

  @property
  def url_string(self):
    """The versionless URL, with '#<generation>' appended when one is set."""
    if not self.HasGeneration():
      return self.versionless_url_string
    return '%s#%s' % (self.versionless_url_string, self.generation)

  @property
  def versionless_url_string(self):
    """The provider, bucket or object URL without any version suffix."""
    if self.IsProvider():
      return '%s://' % self.scheme
    if self.IsBucket():
      return self.bucket_url_string
    return '%s://%s/%s' % (self.scheme, self.bucket_name, self.object_name)

  def __str__(self):
    return self.url_string
+
+
def _GetSchemeFromUrlString(url_str):
  """Returns the scheme component of a URL string.

  Args:
    url_str: URL string, with or without a '<scheme>://' prefix.

  Returns:
    The text before the first '://' in lower case, or 'file' when url_str
    contains no '://'.
  """
  scheme, separator, _ = url_str.partition('://')
  # File is the default scheme for strings without a separator.
  return scheme.lower() if separator else 'file'
+
+
def _GetPathFromUrlString(url_str):
  """Returns the path component of a URL string.

  Args:
    url_str: URL string, with or without a '<scheme>://' prefix.

  Returns:
    Everything after the first '://', or url_str itself when it contains
    no '://'.
  """
  _, separator, path = url_str.partition('://')
  return path if separator else url_str
+
+
def IsFileUrlString(url_str):
  """Returns whether a string is a file URL.

  Args:
    url_str: URL string to classify.

  Returns:
    True when the scheme of url_str, as computed by
    _GetSchemeFromUrlString, is 'file'.
  """
  scheme = _GetSchemeFromUrlString(url_str)
  return scheme == 'file'
+
+
def StorageUrlFromString(url_str):
  """Static factory function for creating a StorageUrl from a string.

  Args:
    url_str: URL string with a 'file', 's3' or 'gs' scheme, or no scheme.

  Returns:
    A _FileUrl for the 'file' scheme, otherwise a _CloudUrl.

  Raises:
    InvalidUrlError: If the scheme is not one of 'file', 's3' or 'gs'.
  """
  scheme = _GetSchemeFromUrlString(url_str)

  if scheme == 'file':
    # A path of exactly '-' marks the URL as a stream.
    reads_stream = _GetPathFromUrlString(url_str) == '-'
    return _FileUrl(url_str, is_stream=reads_stream)
  if scheme in ('s3', 'gs'):
    return _CloudUrl(url_str)
  raise InvalidUrlError('Unrecognized scheme "%s"' % scheme)
+
+
def StripOneSlash(url_str):
  """Removes at most one trailing slash from url_str.

  Args:
    url_str: String to trim; falsy values are returned unchanged.

  Returns:
    url_str without its final '/' if it ended with one, else url_str.
  """
  has_trailing_slash = bool(url_str) and url_str.endswith('/')
  return url_str[:-1] if has_trailing_slash else url_str
+
+
def ContainsWildcard(url_string):
  """Reports whether url_string has at least one wildcard character.

  Args:
    url_string: URL string to check.

  Returns:
    True if WILDCARD_REGEX finds a match anywhere in url_string, else False.
  """
  return WILDCARD_REGEX.search(url_string) is not None
« no previous file with comments | « third_party/gsutil/gslib/storage_uri_builder.py ('k') | third_party/gsutil/gslib/tab_complete.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698