| Index: gslib/util.py
|
| ===================================================================
|
| --- gslib/util.py (revision 33376)
|
| +++ gslib/util.py (working copy)
|
| @@ -1,3 +1,4 @@
|
| +# -*- coding: utf-8 -*-
|
| # Copyright 2010 Google Inc. All Rights Reserved.
|
| #
|
| # Licensed under the Apache License, Version 2.0 (the "License");
|
| @@ -11,33 +12,45 @@
|
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| # See the License for the specific language governing permissions and
|
| # limitations under the License.
|
| -
|
| """Static data and helper functions."""
|
|
|
| -import binascii
|
| -import boto
|
| -import boto.auth
|
| +from __future__ import absolute_import
|
| +
|
| import errno
|
| -import gslib
|
| import math
|
| import multiprocessing
|
| import os
|
| +import pkgutil
|
| import re
|
| +import struct
|
| import sys
|
| +import tempfile
|
| import textwrap
|
| import threading
|
| import traceback
|
| import xml.etree.ElementTree as ElementTree
|
|
|
| +import boto
|
| from boto import config
|
| +import boto.auth
|
| from boto.exception import NoAuthHandlerFound
|
| from boto.gs.connection import GSConnection
|
| from boto.provider import Provider
|
| from boto.pyami.config import BotoConfigLocations
|
| -from gslib.exception import CommandException
|
| +import httplib2
|
| +from oauth2client.client import HAS_CRYPTO
|
| from retry_decorator import retry_decorator
|
| -from oauth2client.client import HAS_CRYPTO
|
|
|
| +import gslib
|
| +from gslib.exception import CommandException
|
| +from gslib.storage_url import StorageUrlFromString
|
| +from gslib.translation_helper import AclTranslation
|
| +from gslib.translation_helper import GenerationFromUrlAndString
|
| +from gslib.translation_helper import S3_ACL_MARKER_GUID
|
| +from gslib.translation_helper import S3_DELETE_MARKER_GUID
|
| +from gslib.translation_helper import S3_MARKER_GUIDS
|
| +
|
| +# pylint: disable=g-import-not-at-top
|
| try:
|
| # This module doesn't necessarily exist on Windows.
|
| import resource
|
| @@ -45,37 +58,70 @@
|
| except ImportError, e:
|
| HAS_RESOURCE_MODULE = False
|
|
|
| +ONE_KB = 1024
|
| TWO_MB = 2 * 1024 * 1024
|
| +TEN_MB = 10 * 1024 * 1024
|
| +DEFAULT_FILE_BUFFER_SIZE = 8192
|
| +_DEFAULT_LINES = 25
|
|
|
| +# By default, the timeout for SSL read errors is infinite. This could
|
| +# cause gsutil to hang on network disconnect, so pick a more reasonable
|
| +# timeout.
|
| +SSL_TIMEOUT = 60
|
| +
|
| +# Start with a progress callback every 64KB during uploads/downloads (JSON API).
|
| +# Callback implementation should back off until it hits the maximum size
|
| +# so that callbacks do not create huge amounts of log output.
|
| +START_CALLBACK_PER_BYTES = 1024*64
|
| +MAX_CALLBACK_PER_BYTES = 1024*1024*100
|
| +
|
| +# Upload/download files in 8KB chunks over the HTTP connection.
|
| +TRANSFER_BUFFER_SIZE = 1024*8
|
| +
|
| +# Default number of progress callbacks during transfer (XML API).
|
| +XML_PROGRESS_CALLBACKS = 10
|
| +
|
| +# For files >= this size, output a message indicating that we're running an
|
| +# operation on the file (like hashing or gzipping) so it does not appear to the
|
| +# user that the command is hanging.
|
| +MIN_SIZE_COMPUTE_LOGGING = 100*1024*1024 # 100 MB
|
| +
|
| NO_MAX = sys.maxint
|
|
|
| +UTF8 = 'utf-8'
|
| +
|
| VERSION_MATCHER = re.compile(r'^(?P<maj>\d+)(\.(?P<min>\d+)(?P<suffix>.*))?')
|
|
|
| RELEASE_NOTES_URL = 'https://pub.storage.googleapis.com/gsutil_ReleaseNotes.txt'
|
|
|
| # Binary exponentiation strings.
|
| _EXP_STRINGS = [
|
| - (0, 'B', 'bit'),
|
| - (10, 'KB', 'Kbit', 'K'),
|
| - (20, 'MB', 'Mbit', 'M'),
|
| - (30, 'GB', 'Gbit', 'G'),
|
| - (40, 'TB', 'Tbit', 'T'),
|
| - (50, 'PB', 'Pbit', 'P'),
|
| - (60, 'EB', 'Ebit', 'E'),
|
| + (0, 'B', 'bit'),
|
| + (10, 'KB', 'Kbit', 'K'),
|
| + (20, 'MB', 'Mbit', 'M'),
|
| + (30, 'GB', 'Gbit', 'G'),
|
| + (40, 'TB', 'Tbit', 'T'),
|
| + (50, 'PB', 'Pbit', 'P'),
|
| + (60, 'EB', 'Ebit', 'E'),
|
| ]
|
|
|
| -global manager
|
| +configured_certs_file = None
|
|
|
| +global manager # pylint: disable=global-at-module-level
|
| +
|
| +
|
| def InitializeMultiprocessingVariables():
|
| + """Perform necessary initialization for multiprocessing.
|
| +
|
| + See gslib.command.InitializeMultiprocessingVariables for an explanation
|
| + of why this is necessary.
|
| """
|
| - Perform necessary initialization - see
|
| - gslib.command.InitializeMultiprocessingVariables for an explanation of why
|
| - this is necessary.
|
| - """
|
| - global manager
|
| + global manager # pylint: disable=global-variable-undefined
|
| manager = multiprocessing.Manager()
|
|
|
| +
|
| def _GenerateSuffixRegex():
|
| + """Creates a suffix regex for human-readable byte counts."""
|
| human_bytes_re = r'(?P<num>\d*\.\d+|\d+)\s*(?P<suffix>%s)?'
|
| suffixes = []
|
| suffix_to_si = {}
|
| @@ -84,7 +130,7 @@
|
| for suffix in si_suffixes:
|
| suffix_to_si[suffix] = i
|
| suffixes.extend(si_suffixes)
|
| - human_bytes_re = human_bytes_re % '|'.join(suffixes)
|
| + human_bytes_re %= '|'.join(suffixes)
|
| matcher = re.compile(human_bytes_re)
|
| return suffix_to_si, matcher
|
|
|
| @@ -105,7 +151,7 @@
|
|
|
| GSUTIL_PUB_TARBALL = 'gs://pub/gsutil.tar.gz'
|
|
|
| -Retry = retry_decorator.retry
|
| +Retry = retry_decorator.retry # pylint: disable=invalid-name
|
|
|
| # Cache the values from this check such that they're available to all callers
|
| # without needing to run all the checks again (some of these, such as calling
|
| @@ -114,6 +160,7 @@
|
| cached_multiprocessing_is_available_stack_trace = None
|
| cached_multiprocessing_is_available_message = None
|
|
|
| +
|
| # Enum class for specifying listing style.
|
| class ListingStyle(object):
|
| SHORT = 'SHORT'
|
| @@ -126,48 +173,91 @@
|
| getattr(crcmod.crcmod, '_usingExtension', None))
|
|
|
|
|
| -def CreateTrackerDirIfNeeded():
|
| - """Looks up the configured directory where gsutil keeps its resumable
|
| - transfer tracker files, and creates it if it doesn't already exist.
|
| -
|
| - Returns:
|
| - The pathname to the tracker directory.
|
| - """
|
| - tracker_dir = config.get(
|
| - 'GSUtil', 'resumable_tracker_dir',
|
| - os.path.expanduser('~' + os.sep + '.gsutil'))
|
| - if not os.path.exists(tracker_dir):
|
| +def CreateDirIfNeeded(dir_path):
|
| + """Creates a directory, suppressing already-exists errors."""
|
| + if not os.path.exists(dir_path):
|
| try:
|
| # Unfortunately, even though we catch and ignore EEXIST, this call will
|
| - # will output a (needless) error message (no way to avoid that in Python).
|
| - os.makedirs(tracker_dir)
|
| + # output a (needless) error message (no way to avoid that in Python).
|
| + os.makedirs(dir_path)
|
| # Ignore 'already exists' in case user tried to start up several
|
| # resumable uploads concurrently from a machine where no tracker dir had
|
| # yet been created.
|
| except OSError as e:
|
| if e.errno != errno.EEXIST:
|
| raise
|
| +
|
| +
|
| +def GetGsutilStateDir():
|
| + """Returns the location of the directory for gsutil state files.
|
| +
|
| + Certain operations, such as cross-process credential sharing and
|
| + resumable transfer tracking, need a known location for state files which
|
| + are created by gsutil as-needed.
|
| +
|
| + This location should only be used for storing data that is required to be in
|
| + a static location.
|
| +
|
| + Returns:
|
| + Path to directory for gsutil static state files.
|
| + """
|
| + config_file_dir = config.get(
|
| + 'GSUtil', 'state_dir',
|
| + os.path.expanduser(os.path.join('~', '.gsutil')))
|
| + CreateDirIfNeeded(config_file_dir)
|
| + return config_file_dir
|
| +
|
| +
|
| +def GetCredentialStoreFilename():
|
| + return os.path.join(GetGsutilStateDir(), 'credcache')
|
| +
|
| +
|
| +def CreateTrackerDirIfNeeded():
|
| + """Looks up or creates the gsutil tracker file directory.
|
| +
|
| + This is the configured directory where gsutil keeps its resumable transfer
|
| + tracker files. This function creates it if it doesn't already exist.
|
| +
|
| + Returns:
|
| + The pathname to the tracker directory.
|
| + """
|
| + tracker_dir = config.get(
|
| + 'GSUtil', 'resumable_tracker_dir',
|
| + os.path.join(GetGsutilStateDir(), 'tracker-files'))
|
| + CreateDirIfNeeded(tracker_dir)
|
| return tracker_dir
|
|
|
|
|
| +def PrintTrackerDirDeprecationWarningIfNeeded():
|
| + # TODO: Remove this along with the tracker_dir config value 1 year after
|
| + # 4.6 release date. Use state_dir instead.
|
| + if config.has_option('GSUtil', 'resumable_tracker_dir'):
|
| + sys.stderr.write('Warning: you have set resumable_tracker_dir in your '
|
| + '.boto configuration file. This configuration option is '
|
| + 'deprecated; please use the state_dir configuration '
|
| + 'option instead.\n')
|
| +
|
| +
|
| # Name of file where we keep the timestamp for the last time we checked whether
|
| # a new version of gsutil is available.
|
| +PrintTrackerDirDeprecationWarningIfNeeded()
|
| +CreateDirIfNeeded(GetGsutilStateDir())
|
| LAST_CHECKED_FOR_GSUTIL_UPDATE_TIMESTAMP_FILE = (
|
| - os.path.join(CreateTrackerDirIfNeeded(), '.last_software_update_check'))
|
| + os.path.join(GetGsutilStateDir(), '.last_software_update_check'))
|
|
|
|
|
| def HasConfiguredCredentials():
|
| """Determines if boto credential/config file exists."""
|
| - config = boto.config
|
| has_goog_creds = (config.has_option('Credentials', 'gs_access_key_id') and
|
| config.has_option('Credentials', 'gs_secret_access_key'))
|
| has_amzn_creds = (config.has_option('Credentials', 'aws_access_key_id') and
|
| config.has_option('Credentials', 'aws_secret_access_key'))
|
| has_oauth_creds = (
|
| config.has_option('Credentials', 'gs_oauth2_refresh_token'))
|
| - has_service_account_creds = (HAS_CRYPTO and
|
| - config.has_option('Credentials', 'gs_service_client_id')
|
| - and config.has_option('Credentials', 'gs_service_key_file'))
|
| + has_service_account_creds = (
|
| + HAS_CRYPTO and
|
| + config.has_option('Credentials', 'gs_service_client_id') and
|
| + config.has_option('Credentials', 'gs_service_key_file'))
|
|
|
| valid_auth_handler = None
|
| try:
|
| @@ -188,10 +278,7 @@
|
|
|
|
|
| def ConfigureNoOpAuthIfNeeded():
|
| - """
|
| - Sets up no-op auth handler if no boto credentials are configured.
|
| - """
|
| - config = boto.config
|
| + """Sets up no-op auth handler if no boto credentials are configured."""
|
| if not HasConfiguredCredentials():
|
| if (config.has_option('Credentials', 'gs_service_client_id')
|
| and not HAS_CRYPTO):
|
| @@ -199,12 +286,12 @@
|
| 'Your gsutil is configured with an OAuth2 service account, but you '
|
| 'do not have PyOpenSSL or PyCrypto 2.6 or later installed. Service '
|
| 'account authentication requires one of these libraries; please '
|
| - 'install either of them to proceed, or configure a different type '
|
| + 'install either of them to proceed, or configure a different type '
|
| 'of credentials with "gsutil config".')))
|
| else:
|
| # With no boto config file the user can still access publicly readable
|
| # buckets and objects.
|
| - from gslib import no_op_auth_plugin
|
| + from gslib import no_op_auth_plugin # pylint: disable=unused-variable
|
|
|
|
|
| def GetConfigFilePath():
|
| @@ -234,6 +321,85 @@
|
| return cf_list
|
|
|
|
|
| +def GetCertsFile():
|
| + """Configures and returns the CA Certificates file.
|
| +
|
| + If one is already configured, use it. Otherwise, amend the configuration
|
| + (in boto.config) to use the cert roots distributed with gsutil.
|
| +
|
| + Returns:
|
| + string filename of the certs file to use.
|
| + """
|
| + certs_file = boto.config.get('Boto', 'ca_certificates_file', None)
|
| + if not certs_file:
|
| + if configured_certs_file:
|
| + disk_certs_file = configured_certs_file
|
| + else:
|
| + disk_certs_file = os.path.abspath(
|
| + os.path.join(gslib.GSLIB_DIR, 'data', 'cacerts.txt'))
|
| + if not os.path.exists(disk_certs_file):
|
| + # If the file is not present on disk, this means the gslib module
|
| + # doesn't actually exist on disk anywhere. This can happen if it's
|
| + # being imported from a zip file. Unfortunately, we have to copy the
|
| + # certs file to a local temp file on disk because the underlying SSL
|
| + # socket requires it to be a filesystem path.
|
| + certs_data = pkgutil.get_data('gslib', 'data/cacerts.txt')
|
| + if not certs_data:
|
| + raise CommandException('Certificates file not found. Please '
|
| + 'reinstall gsutil from scratch')
|
| + fd, fname = tempfile.mkstemp(suffix='.txt', prefix='gsutil-cacerts')
|
| + f = os.fdopen(fd, 'w')
|
| + f.write(certs_data)
|
| + f.close()
|
| + disk_certs_file = fname
|
| + certs_file = disk_certs_file
|
| + return certs_file
|
| +
|
| +
|
| +def GetCleanupFiles():
|
| + """Returns a list of temp files to delete (if possible) when program exits."""
|
| + cleanup_files = []
|
| + if configured_certs_file:
|
| + cleanup_files.append(configured_certs_file)
|
| + return cleanup_files
|
| +
|
| +
|
| +def GetNewHttp(http_class=httplib2.Http, **kwargs):
|
| + """Creates and returns a new httplib2.Http instance.
|
| +
|
| + Args:
|
| + http_class: Optional custom Http class to use.
|
| + **kwargs: Arguments to pass to http_class constructor.
|
| +
|
| + Returns:
|
| + An initialized httplib2.Http instance.
|
| + """
|
| + proxy_info = httplib2.ProxyInfo(
|
| + proxy_type=3,
|
| + proxy_host=boto.config.get('Boto', 'proxy', None),
|
| + proxy_port=boto.config.getint('Boto', 'proxy_port', 0),
|
| + proxy_user=boto.config.get('Boto', 'proxy_user', None),
|
| + proxy_pass=boto.config.get('Boto', 'proxy_pass', None),
|
| + proxy_rdns=boto.config.get('Boto', 'proxy_rdns', False))
|
| + # Some installers don't package a certs file with httplib2, so use the
|
| + # one included with gsutil.
|
| + kwargs['ca_certs'] = GetCertsFile()
|
| + # Use a non-infinite SSL timeout to avoid hangs during network flakiness.
|
| + kwargs['timeout'] = SSL_TIMEOUT
|
| + http = http_class(proxy_info=proxy_info, **kwargs)
|
| + http.disable_ssl_certificate_validation = (not config.getbool(
|
| + 'Boto', 'https_validate_certificates'))
|
| + return http
|
| +
|
| +
|
| +def GetNumRetries():
|
| + return config.getint('Boto', 'num_retries', 6)
|
| +
|
| +
|
| +def GetMaxRetryDelay():
|
| + return config.getint('Boto', 'max_retry_delay', 60)
|
| +
|
| +
|
| def _RoundToNearestExponent(num):
|
| i = 0
|
| while i+1 < len(_EXP_STRINGS) and num >= (2 ** _EXP_STRINGS[i+1][0]):
|
| @@ -274,6 +440,8 @@
|
| human_string: A string supplied by user, e.g. '1M', '3 GB'.
|
| Returns:
|
| An integer containing the number of bytes.
|
| + Raises:
|
| + ValueError: on an invalid string.
|
| """
|
| human_string = human_string.lower()
|
| m = MATCH_HUMAN_BYTES.match(human_string)
|
| @@ -314,82 +482,59 @@
|
| return d0 + d1
|
|
|
|
|
| -def ParseErrorDetail(e):
|
| - """Parse <Message> and/or <Details> text from XML content.
|
| +def RemoveCRLFFromString(input_str):
|
| + """Returns the input string with all \\n and \\r removed."""
|
| + return re.sub(r'[\r\n]', '', input_str)
|
|
|
| - Args:
|
| - e: The GSResponseError that includes XML to be parsed.
|
|
|
| - Returns:
|
| - (exception_name, m, d), where m is <Message> text or None,
|
| - and d is <Details> text or None.
|
| - """
|
| - exc_name_parts = re.split("[\.']", str(type(e)))
|
| - if len(exc_name_parts) < 2:
|
| - # Shouldn't happen, but have fallback in case.
|
| - exc_name = str(type(e))
|
| - else:
|
| - exc_name = exc_name_parts[-2]
|
| - if not hasattr(e, 'body') or e.body is None:
|
| - return (exc_name, None)
|
| +def UnaryDictToXml(message):
|
| + """Generates XML representation of a nested dict.
|
|
|
| - match = re.search(r'<Message>(?P<message>.*)</Message>', e.body)
|
| - m = match.group('message') if match else None
|
| - match = re.search(r'<Details>(?P<details>.*)</Details>', e.body)
|
| - d = match.group('details') if match else None
|
| - return (exc_name, m, d)
|
| + This dict contains exactly one top-level entry and an arbitrary number of
|
| + 2nd-level entries, e.g. capturing a WebsiteConfiguration message.
|
|
|
| -def FormatErrorMessage(exc_name, status, code, reason, message, detail):
|
| - """Formats an error message from components parsed by ParseErrorDetail."""
|
| - if message and detail:
|
| - return('%s: status=%d, code=%s, reason="%s", message="%s", detail="%s"' %
|
| - (exc_name, status, code, reason, message, detail))
|
| - if message:
|
| - return('%s: status=%d, code=%s, reason="%s", message="%s"' %
|
| - (exc_name, status, code, reason, message))
|
| - if detail:
|
| - return('%s: status=%d, code=%s, reason="%s", detail="%s"' %
|
| - (exc_name, status, code, reason, detail))
|
| - return('%s: status=%d, code=%s, reason="%s"' %
|
| - (exc_name, status, code, reason))
|
| -
|
| -def UnaryDictToXml(message):
|
| - """Generates XML representation of a nested dict with exactly one
|
| - top-level entry and an arbitrary number of 2nd-level entries, e.g.
|
| - capturing a WebsiteConfiguration message.
|
| -
|
| Args:
|
| message: The dict encoding the message.
|
|
|
| Returns:
|
| XML string representation of the input dict.
|
| +
|
| + Raises:
|
| + Exception: if dict contains more than one top-level entry.
|
| """
|
| if len(message) != 1:
|
| - raise Exception("Expected dict of size 1, got size %d" % len(message))
|
| + raise Exception('Expected dict of size 1, got size %d' % len(message))
|
|
|
| name, content = message.items()[0]
|
| - T = ElementTree.Element(name)
|
| - for property, value in sorted(content.items()):
|
| - node = ElementTree.SubElement(T, property)
|
| + element_type = ElementTree.Element(name)
|
| + for element_property, value in sorted(content.items()):
|
| + node = ElementTree.SubElement(element_type, element_property)
|
| node.text = value
|
| - return ElementTree.tostring(T)
|
| + return ElementTree.tostring(element_type)
|
|
|
|
|
| -def LookUpGsutilVersion(uri):
|
| - """Looks up the gsutil version of the specified gsutil tarball URI, from the
|
| - metadata field set on that object.
|
| +def LookUpGsutilVersion(gsutil_api, url_str):
|
| + """Looks up the gsutil version of the specified gsutil tarball URL.
|
|
|
| + Version is specified in the metadata field set on that object.
|
| +
|
| Args:
|
| - URI: gsutil URI tarball (such as gs://pub/gsutil.tar.gz).
|
| + gsutil_api: gsutil Cloud API to use when retrieving gsutil tarball.
|
| + url_str: tarball URL to retrieve (such as 'gs://pub/gsutil.tar.gz').
|
|
|
| Returns:
|
| - Version string if URI is a cloud URI containing x-goog-meta-gsutil-version
|
| + Version string if URL is a cloud URL containing x-goog-meta-gsutil-version
|
| metadata, else None.
|
| """
|
| - if uri.is_cloud_uri():
|
| - obj = uri.get_key(False)
|
| - if obj.metadata and 'gsutil_version' in obj.metadata:
|
| - return obj.metadata['gsutil_version']
|
| + url = StorageUrlFromString(url_str)
|
| + if url.IsCloudUrl():
|
| + obj = gsutil_api.GetObjectMetadata(url.bucket_name, url.object_name,
|
| + provider=url.scheme,
|
| + fields=['metadata'])
|
| + if obj.metadata and obj.metadata.additionalProperties:
|
| + for prop in obj.metadata.additionalProperties:
|
| + if prop.key == 'gsutil_version':
|
| + return prop.value
|
|
|
|
|
| def GetGsutilVersionModifiedTime():
|
| @@ -404,16 +549,22 @@
|
| return sys.stdout.isatty() and sys.stderr.isatty() and sys.stdin.isatty()
|
|
|
|
|
| +def _HttpsValidateCertificatesEnabled():
|
| + return config.get('Boto', 'https_validate_certificates', True)
|
| +
|
| +CERTIFICATE_VALIDATION_ENABLED = _HttpsValidateCertificatesEnabled()
|
| +
|
| +
|
| def _BotoIsSecure():
|
| - for cfg_var in ('is_secure', 'https_validate_certificates'):
|
| - if (config.has_option('Boto', cfg_var)
|
| - and not config.getboolean('Boto', cfg_var)):
|
| - return False, cfg_var
|
| - return True, ''
|
| + return config.get('Boto', 'is_secure', True)
|
|
|
| BOTO_IS_SECURE = _BotoIsSecure()
|
|
|
|
|
| +def ResumableThreshold():
|
| + return config.getint('GSUtil', 'resumable_threshold', TWO_MB)
|
| +
|
| +
|
| def AddAcceptEncoding(headers):
|
| """Adds accept-encoding:gzip to the dictionary of headers."""
|
| # If Accept-Encoding is not already set, set it to enable gzip.
|
| @@ -421,79 +572,86 @@
|
| headers['accept-encoding'] = 'gzip'
|
|
|
|
|
| -def PrintFullInfoAboutUri(uri, incl_acl, headers):
|
| - """Print full info for given URI (like what displays for gsutil ls -L).
|
| +# pylint: disable=too-many-statements
|
| +def PrintFullInfoAboutObject(bucket_listing_ref, incl_acl=True):
|
| + """Print full info for given object (like what displays for gsutil ls -L).
|
|
|
| Args:
|
| - uri: StorageUri being listed.
|
| + bucket_listing_ref: BucketListingRef being listed.
|
| + Must have ref_type OBJECT and a populated root_object
|
| + with the desired fields.
|
| incl_acl: True if ACL info should be output.
|
| - headers: The headers to pass to boto, if any.
|
|
|
| Returns:
|
| - Tuple (number of objects,
|
| - object length, if listing_style is one of the long listing formats)
|
| + Tuple (number of objects, object_length)
|
|
|
| Raises:
|
| Exception: if calling bug encountered.
|
| """
|
| - # Run in a try/except clause so we can continue listings past
|
| - # access-denied errors (which can happen because user may have READ
|
| - # permission on object and thus see the bucket listing data, but lack
|
| - # FULL_CONTROL over individual objects and thus not be able to read
|
| - # their ACLs).
|
| - # TODO: Switch this code to use string formatting instead of tabs.
|
| - try:
|
| - print '%s:' % uri.uri.encode('utf-8')
|
| - headers = headers.copy()
|
| - # Add accept encoding so that the HEAD request matches what would be
|
| - # sent for a GET request.
|
| - AddAcceptEncoding(headers)
|
| - got_key = False
|
| - obj = uri.get_key(False, headers=headers)
|
| - got_key = True
|
| - print '\tCreation time:\t\t%s' % obj.last_modified
|
| - if obj.cache_control:
|
| - print '\tCache-Control:\t\t%s' % obj.cache_control
|
| - if obj.content_disposition:
|
| - print '\tContent-Disposition:\t\t%s' % obj.content_disposition
|
| - if obj.content_encoding:
|
| - print '\tContent-Encoding:\t%s' % obj.content_encoding
|
| - if obj.content_language:
|
| - print '\tContent-Language:\t%s' % obj.content_language
|
| - print '\tContent-Length:\t\t%s' % obj.size
|
| - print '\tContent-Type:\t\t%s' % obj.content_type
|
| - if hasattr(obj, 'component_count') and obj.component_count:
|
| - print '\tComponent-Count:\t%d' % obj.component_count
|
| - if obj.metadata:
|
| - prefix = uri.get_provider().metadata_prefix
|
| - for name in obj.metadata:
|
| - meta_string = '\t%s%s:\t%s' % (prefix, name, obj.metadata[name])
|
| - print meta_string.encode('utf-8')
|
| - if hasattr(obj, 'cloud_hashes'):
|
| - for alg in obj.cloud_hashes:
|
| - print '\tHash (%s):\t\t%s' % (
|
| - alg, binascii.b2a_hex(obj.cloud_hashes[alg]))
|
| - print '\tETag:\t\t\t%s' % obj.etag.strip('"\'')
|
| - if hasattr(obj, 'generation'):
|
| - print '\tGeneration:\t\t%s' % obj.generation
|
| - if hasattr(obj, 'metageneration'):
|
| - print '\tMetageneration:\t\t%s' % obj.metageneration
|
| - if incl_acl:
|
| - print '\tACL:\t\t%s' % (uri.get_acl(False, headers))
|
| - return (1, obj.size)
|
| - except boto.exception.GSResponseError as e:
|
| - if e.status == 403:
|
| - if got_key:
|
| - print ('\tACL:\t\t\tACCESS DENIED. Note: you need FULL_CONTROL '
|
| - 'permission\n\t\t\ton the object to read its ACL.')
|
| - return (1, obj.size)
|
| + url_str = bucket_listing_ref.url_string
|
| + storage_url = StorageUrlFromString(url_str)
|
| + obj = bucket_listing_ref.root_object
|
| +
|
| + if (obj.metadata and S3_DELETE_MARKER_GUID in
|
| + obj.metadata.additionalProperties):
|
| + num_bytes = 0
|
| + num_objs = 0
|
| + url_str += '<DeleteMarker>'
|
| + else:
|
| + num_bytes = obj.size
|
| + num_objs = 1
|
| +
|
| + print '%s:' % url_str.encode(UTF8)
|
| + if obj.updated:
|
| + print '\tCreation time:\t\t%s' % obj.updated.strftime(
|
| + '%a, %d %b %Y %H:%M:%S GMT')
|
| + if obj.cacheControl:
|
| + print '\tCache-Control:\t\t%s' % obj.cacheControl
|
| + if obj.contentDisposition:
|
| + print '\tContent-Disposition:\t\t%s' % obj.contentDisposition
|
| + if obj.contentEncoding:
|
| + print '\tContent-Encoding:\t\t%s' % obj.contentEncoding
|
| + if obj.contentLanguage:
|
| + print '\tContent-Language:\t%s' % obj.contentLanguage
|
| + print '\tContent-Length:\t\t%s' % obj.size
|
| + print '\tContent-Type:\t\t%s' % obj.contentType
|
| + if obj.componentCount:
|
| + print '\tComponent-Count:\t%d' % obj.componentCount
|
| + marker_props = {}
|
| + if obj.metadata and obj.metadata.additionalProperties:
|
| + non_marker_props = []
|
| + for add_prop in obj.metadata.additionalProperties:
|
| + if add_prop.key not in S3_MARKER_GUIDS:
|
| + non_marker_props.append(add_prop)
|
| else:
|
| - print "You aren't authorized to read %s - skipping" % uri
|
| - return (1, 0)
|
| + marker_props[add_prop.key] = add_prop.value
|
| + if non_marker_props:
|
| + print '\tMetadata:'
|
| + for ap in non_marker_props:
|
| + meta_string = '\t\t%s:\t\t%s' % (ap.key, ap.value)
|
| + print meta_string.encode(UTF8)
|
| + if obj.crc32c: print '\tHash (crc32c):\t\t%s' % obj.crc32c
|
| + if obj.md5Hash: print '\tHash (md5):\t\t%s' % obj.md5Hash
|
| + print '\tETag:\t\t\t%s' % obj.etag.strip('"\'')
|
| + if obj.generation:
|
| + generation_str = GenerationFromUrlAndString(storage_url, obj.generation)
|
| + print '\tGeneration:\t\t%s' % generation_str
|
| + if obj.metageneration:
|
| + print '\tMetageneration:\t\t%s' % obj.metageneration
|
| + if incl_acl:
|
| + # JSON API won't return acls as part of the response unless we have
|
| + # full control scope
|
| + if obj.acl:
|
| + print '\tACL:\t\t%s' % AclTranslation.JsonFromMessage(obj.acl)
|
| + elif S3_ACL_MARKER_GUID in marker_props:
|
| + print '\tACL:\t\t%s' % marker_props[S3_ACL_MARKER_GUID]
|
| else:
|
| - raise e
|
| - return (numobjs, numbytes)
|
| + print ('\tACL:\t\t\tACCESS DENIED. Note: you need OWNER '
|
| + 'permission\n\t\t\t\ton the object to read its ACL.')
|
|
|
| + return (num_objs, num_bytes)
|
| +
|
| +
|
| def CompareVersions(first, second):
|
| """Compares the first and second gsutil version strings.
|
|
|
| @@ -502,6 +660,10 @@
|
| (e.g., 3.3RC4 vs. 3.3RC5). A version string with a suffix is treated as
|
| less than its non-suffix counterpart (e.g. 3.32 > 3.32pre).
|
|
|
| + Args:
|
| + first: First gsutil version string.
|
| + second: Second gsutil version string.
|
| +
|
| Returns:
|
| (g, m):
|
| g is True if first known to be greater than second, else False.
|
| @@ -531,50 +693,182 @@
|
| return (bool(suffix_ver2) and not suffix_ver1, False)
|
| return (False, False)
|
|
|
| -def _IncreaseSoftLimitForResource(resource_name):
|
| - """Sets a new soft limit for the maximum number of open files.
|
| - The soft limit is used for this process (and its children), but the
|
| - hard limit is set by the system and cannot be exceeded.
|
| +
|
| +def _IncreaseSoftLimitForResource(resource_name, fallback_value):
|
| + """Sets a new soft limit for the maximum number of open files.
|
| +
|
| + The soft limit is used for this process (and its children), but the
|
| + hard limit is set by the system and cannot be exceeded.
|
| +
|
| + We will first try to set the soft limit to the hard limit's value; if that
|
| + fails, we will try to set the soft limit to the fallback_value iff this would
|
| + increase the soft limit.
|
| +
|
| + Args:
|
| + resource_name: Name of the resource to increase the soft limit for.
|
| + fallback_value: Fallback value to be used if we couldn't set the
|
| + soft value to the hard value (e.g., if the hard value
|
| + is "unlimited").
|
| +
|
| + Returns:
|
| + Current soft limit for the resource (after any changes we were able to
|
| + make), or -1 if the resource doesn't exist.
|
| """
|
| +
|
| + # Get the value of the resource.
|
| try:
|
| (soft_limit, hard_limit) = resource.getrlimit(resource_name)
|
| - resource.setrlimit(resource_name, (hard_limit, hard_limit))
|
| - return hard_limit
|
| - except (resource.error, ValueError), e:
|
| - return 0
|
| + except (resource.error, ValueError):
|
| + # The resource wasn't present, so we can't do anything here.
|
| + return -1
|
|
|
| + # Try to set the value of the soft limit to the value of the hard limit.
|
| + if hard_limit > soft_limit: # Some OS's report 0 for "unlimited".
|
| + try:
|
| + resource.setrlimit(resource_name, (hard_limit, hard_limit))
|
| + return hard_limit
|
| + except (resource.error, ValueError):
|
| + # We'll ignore this and try the fallback value.
|
| + pass
|
| +
|
| + # Try to set the value of the soft limit to the fallback value.
|
| + if soft_limit < fallback_value:
|
| + try:
|
| + resource.setrlimit(resource_name, (fallback_value, hard_limit))
|
| + return fallback_value
|
| + except (resource.error, ValueError):
|
| + # We couldn't change the soft limit, so just report the current
|
| + # value of the soft limit.
|
| + return soft_limit
|
| + else:
|
| + return soft_limit
|
| +
|
| +
|
| +def GetCloudApiInstance(cls, thread_state=None):
|
| + """Gets a gsutil Cloud API instance.
|
| +
|
| + Since Cloud API implementations are not guaranteed to be thread-safe, each
|
| + thread needs its own instance. These instances are passed to each thread
|
| + via the thread pool logic in command.
|
| +
|
| + Args:
|
| + cls: Command class to be used for single-threaded case.
|
| + thread_state: Per thread state from this thread containing a gsutil
|
| + Cloud API instance.
|
| +
|
| + Returns:
|
| + gsutil Cloud API instance.
|
| + """
|
| + return thread_state or cls.gsutil_api
|
| +
|
| +
|
| +def GetFileSize(fp, position_to_eof=False):
|
| + """Returns size of file, optionally leaving fp positioned at EOF."""
|
| + if not position_to_eof:
|
| + cur_pos = fp.tell()
|
| + fp.seek(0, os.SEEK_END)
|
| + cur_file_size = fp.tell()
|
| + if not position_to_eof:
|
| + fp.seek(cur_pos)
|
| + return cur_file_size
|
| +
|
| +
|
| +def GetStreamFromFileUrl(storage_url):
|
| + if storage_url.IsStream():
|
| + return sys.stdin
|
| + else:
|
| + return open(storage_url.object_name, 'rb')
|
| +
|
| +
|
| +def UrlsAreForSingleProvider(url_args):
|
| + """Tests whether the URLs are all for a single provider.
|
| +
|
| + Args:
|
| + url_args: Strings to check.
|
| +
|
| + Returns:
|
| + True if URLs are for single provider, False otherwise.
|
| + """
|
| + provider = None
|
| + url = None
|
| + for url_str in url_args:
|
| + url = StorageUrlFromString(url_str)
|
| + if not provider:
|
| + provider = url.scheme
|
| + elif url.scheme != provider:
|
| + return False
|
| + return provider is not None
|
| +
|
| +
|
| +def HaveFileUrls(args_to_check):
|
| + """Checks whether args_to_check contain any file URLs.
|
| +
|
| + Args:
|
| + args_to_check: Command-line argument subset to check.
|
| +
|
| + Returns:
|
| + True if args_to_check contains any file URLs.
|
| + """
|
| + for url_str in args_to_check:
|
| + storage_url = StorageUrlFromString(url_str)
|
| + if storage_url.IsFileUrl():
|
| + return True
|
| + return False
|
| +
|
| +
|
| +def HaveProviderUrls(args_to_check):
|
| + """Checks whether args_to_check contains any provider URLs (like 'gs://').
|
| +
|
| + Args:
|
| + args_to_check: Command-line argument subset to check.
|
| +
|
| + Returns:
|
| + True if args_to_check contains any provider URLs.
|
| + """
|
| + for url_str in args_to_check:
|
| + storage_url = StorageUrlFromString(url_str)
|
| + if storage_url.IsCloudUrl() and storage_url.IsProvider():
|
| + return True
|
| + return False
|
| +
|
| +
|
| def MultiprocessingIsAvailable(logger=None):
|
| - """
|
| + """Checks if multiprocessing is available.
|
| +
|
| There are some environments in which there is no way to use multiprocessing
|
| logic that's built into Python (e.g., if /dev/shm is not available, then
|
| we can't create semaphores). This simply tries out a few things that will be
|
| needed to make sure the environment can support the pieces of the
|
| multiprocessing module that we need.
|
| -
|
| +
|
| + Args:
|
| + logger: logging.logger to use for debug output.
|
| +
|
| Returns:
|
| (multiprocessing_is_available, stack_trace):
|
| multiprocessing_is_available: True iff the multiprocessing module is
|
| available for use.
|
| stack_trace: The stack trace generated by the call we tried that failed.
|
| """
|
| + # pylint: disable=global-variable-undefined
|
| global cached_multiprocessing_is_available
|
| global cached_multiprocessing_check_stack_trace
|
| global cached_multiprocessing_is_available_message
|
| if cached_multiprocessing_is_available is not None:
|
| if logger:
|
| logger.debug(cached_multiprocessing_check_stack_trace)
|
| - logger.warn('\n'.join(textwrap.wrap(
|
| - cached_multiprocessing_is_available_message + '\n')))
|
| + logger.warn(cached_multiprocessing_is_available_message)
|
| return (cached_multiprocessing_is_available,
|
| cached_multiprocessing_check_stack_trace)
|
|
|
| stack_trace = None
|
| multiprocessing_is_available = True
|
| - message = (
|
| - 'You have requested multiple threads or processes for an operation,'
|
| - ' but the required functionality of Python\'s multiprocessing '
|
| - 'module is not available. Your operations will be performed '
|
| - 'sequentially, and any requests for parallelism will be ignored.')
|
| + message = """
|
| +You have requested multiple threads or processes for an operation, but the
|
| +required functionality of Python\'s multiprocessing module is not available.
|
| +Your operations will be performed sequentially, and any requests for
|
| +parallelism will be ignored.
|
| +"""
|
| try:
|
| # Fails if /dev/shm (or some equivalent thereof) is not available for use
|
| # (e.g., there's no implementation, or we can't write to it, etc.).
|
| @@ -582,48 +876,61 @@
|
| multiprocessing.Value('i', 0)
|
| except:
|
| if not IS_WINDOWS:
|
| - message += ('\nPlease ensure that you have write access to both '
|
| - '/dev/shm and /run/shm.')
|
| + message += """
|
| +Please ensure that you have write access to both /dev/shm and /run/shm.
|
| +"""
|
| raise # We'll handle this in one place below.
|
| -
|
| +
|
| # Manager objects and Windows are generally a pain to work with, so try it
|
| # out as a sanity check. This definitely works on some versions of Windows,
|
| # but it's certainly possible that there is some unknown configuration for
|
| # which it won't.
|
| multiprocessing.Manager()
|
| -
|
| +
|
| # Check that the max number of open files is reasonable. Always check this
|
| # after we're sure that the basic multiprocessing functionality is
|
| # available, since this won't matter unless that's true.
|
| - limit = 0
|
| + limit = -1
|
| if HAS_RESOURCE_MODULE:
|
| # Try to set this with both resource names - RLIMIT_NOFILE for most Unix
|
| # platforms, and RLIMIT_OFILE for BSD. Ignore AttributeError because the
|
| # "resource" module is not guaranteed to know about these names.
|
| try:
|
| limit = max(limit,
|
| - _IncreaseSoftLimitForResource(resource.RLIMIT_NOFILE))
|
| - except AttributeError, e:
|
| + _IncreaseSoftLimitForResource(
|
| + resource.RLIMIT_NOFILE,
|
| + MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
|
| + except AttributeError:
|
| pass
|
| try:
|
| limit = max(limit,
|
| - _IncreaseSoftLimitForResource(resource.RLIMIT_OFILE))
|
| - except AttributeError, e:
|
| + _IncreaseSoftLimitForResource(
|
| + resource.RLIMIT_OFILE, MIN_ACCEPTABLE_OPEN_FILES_LIMIT))
|
| + except AttributeError:
|
| pass
|
| +
|
| if limit < MIN_ACCEPTABLE_OPEN_FILES_LIMIT and not IS_WINDOWS:
|
| - message += (
|
| - '\nYour max number of open files, %s, is too low to allow safe '
|
| - 'multiprocessing calls. If you are on a Unix-like OS, then you can '
|
| - 'fix this by adding something like "ulimit -n 10000" to your '
|
| - '~/.bashrc (Linux), ~/.bash_profile (OS X), or equivalent file, '
|
| - 'and opening a new terminal.' % limit)
|
| + message += ("""
|
| +Your max number of open files, %s, is too low to allow safe multiprocessing.
|
| +On Linux you can fix this by adding something like "ulimit -n 10000" to your
|
| +~/.bashrc or equivalent file and opening a new terminal.
|
| +
|
| +On MacOS, you may also need to run a command like this once (in addition to the
|
| +above instructions), which might require a restart of your system to take
|
| +effect:
|
| + launchctl limit maxfiles 10000
|
| +
|
| +Alternatively, edit /etc/launchd.conf with something like:
|
| + limit maxfiles 10000 10000
|
| +
|
| +""" % limit)
|
| raise Exception('Max number of open files, %s, is too low.' % limit)
|
| - except:
|
| + except: # pylint: disable=bare-except
|
| stack_trace = traceback.format_exc()
|
| multiprocessing_is_available = False
|
| if logger is not None:
|
| logger.debug(stack_trace)
|
| - logger.warn('\n'.join(textwrap.wrap(message + '\n')))
|
| + logger.warn(message)
|
|
|
| # Set the cached values so that we never need to do this check again.
|
| cached_multiprocessing_is_available = multiprocessing_is_available
|
| @@ -631,13 +938,79 @@
|
| cached_multiprocessing_is_available_message = message
|
| return (multiprocessing_is_available, stack_trace)
|
|
|
| +
|
def CreateLock():
  """Returns either a multiprocessing lock or a threading lock.

  A multiprocessing lock is used iff we have access to the parts of the
  multiprocessing module that are necessary to enable parallelism in
  operations; otherwise we fall back to a threading lock.

  Returns:
    Multiprocessing or threading lock.
  """
  parallelism_ok = MultiprocessingIsAvailable()[0]
  return manager.Lock() if parallelism_ok else threading.Lock()
|
| +
|
| +
|
def IsCloudSubdirPlaceholder(url, blr=None):
  """Determines if URL is a cloud subdir placeholder.

  This function is needed because GUI tools (like the GCS cloud console) allow
  users to create empty "folders" by creating a placeholder object; and parts
  of gsutil need to treat those placeholder objects specially. For example,
  gsutil rsync needs to avoid downloading those objects because they can cause
  conflicts (see comments in rsync command for details).

  We currently detect two cases:
    - Cloud objects whose name ends with '_$folder$'
    - Cloud objects whose name ends with '/'

  Args:
    url: The URL to be checked.
    blr: BucketListingRef to check, or None if not available.
         If None, size won't be checked.

  Returns:
    True/False.
  """
  if not url.IsCloudUrl():
    # Only cloud objects can be placeholders; local paths never qualify.
    return False
  name = url.url_string
  if name.endswith('_$folder$'):
    return True
  # Without a listing ref we cannot check the size, so treat it as empty.
  obj_size = blr.root_object.size if blr and blr.IsObject() else 0
  return name.endswith('/') and obj_size == 0
|
| +
|
| +
|
def GetTermLines():
  """Returns number of terminal lines."""
  # fcntl/termios aren't supported in Windows, so fall back to the default.
  try:
    import fcntl  # pylint: disable=g-import-not-at-top
    import termios  # pylint: disable=g-import-not-at-top
  except ImportError:
    return _DEFAULT_LINES

  def _WindowRows(fd):  # pylint: disable=invalid-name
    """Asks the terminal driver on fd for its row count; 0 on failure."""
    try:
      rows, _ = struct.unpack(
          'hh', fcntl.ioctl(fd, termios.TIOCGWINSZ, '1234'))
      return rows
    except:  # pylint: disable=bare-except
      return 0  # Failure (so will retry on different file descriptor below).

  # Probe stdin, stdout, and stderr, in that order, for a usable row count.
  rows = _WindowRows(0) or _WindowRows(1) or _WindowRows(2)
  if not rows:
    # None of the standard descriptors worked; try the controlling terminal.
    try:
      fd = os.open(os.ctermid(), os.O_RDONLY)
      rows = _WindowRows(fd)
      os.close(fd)
    except:  # pylint: disable=bare-except
      pass
  if not rows:
    # Last resort: the LINES environment variable, else the default.
    rows = os.environ.get('LINES', _DEFAULT_LINES)
  return int(rows)
|
|
|