Index: tools/telemetry/third_party/gsutil/gslib/hashing_helper.py |
diff --git a/tools/telemetry/third_party/gsutil/gslib/hashing_helper.py b/tools/telemetry/third_party/gsutil/gslib/hashing_helper.py |
deleted file mode 100644 |
index dee2f96c926d5ae01a4ff52b87f4966c3e472198..0000000000000000000000000000000000000000 |
--- a/tools/telemetry/third_party/gsutil/gslib/hashing_helper.py |
+++ /dev/null |
@@ -1,418 +0,0 @@ |
-# -*- coding: utf-8 -*- |
-# Copyright 2014 Google Inc. All Rights Reserved. |
-# |
-# Licensed under the Apache License, Version 2.0 (the "License"); |
-# you may not use this file except in compliance with the License. |
-# You may obtain a copy of the License at |
-# |
-# http://www.apache.org/licenses/LICENSE-2.0 |
-# |
-# Unless required by applicable law or agreed to in writing, software |
-# distributed under the License is distributed on an "AS IS" BASIS, |
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
-# See the License for the specific language governing permissions and |
-# limitations under the License. |
-"""Helper functions for hashing functionality.""" |
- |
-import base64 |
-import binascii |
-from hashlib import md5 |
-import os |
- |
-from boto import config |
-import crcmod |
- |
-from gslib.exception import CommandException |
-from gslib.util import DEFAULT_FILE_BUFFER_SIZE |
-from gslib.util import MIN_SIZE_COMPUTE_LOGGING |
-from gslib.util import TRANSFER_BUFFER_SIZE |
-from gslib.util import UsingCrcmodExtension |
- |
- |
-SLOW_CRCMOD_WARNING = """ |
-WARNING: You have requested checksumming but your crcmod installation isn't |
-using the module's C extension, so checksumming will run very slowly. For help |
-installing the extension, please see: |
- $ gsutil help crcmod |
-""" |
- |
- |
-_SLOW_CRCMOD_DOWNLOAD_WARNING = """ |
-WARNING: Downloading this composite object requires integrity checking with |
-CRC32c, but your crcmod installation isn't using the module's C extension, |
-so the hash computation will likely throttle download performance. For help |
-installing the extension, please see: |
- $ gsutil help crcmod |
-To disable slow integrity checking, see the "check_hashes" option in your |
-boto config file. |
-""" |
- |
-_SLOW_CRC_EXCEPTION_TEXT = """ |
-Downloading this composite object requires integrity checking with CRC32c, |
-but your crcmod installation isn't using the module's C extension, so the |
-hash computation will likely throttle download performance. For help |
-installing the extension, please see: |
- |
- $ gsutil help crcmod |
- |
-To download regardless of crcmod performance or to skip slow integrity |
-checks, see the "check_hashes" option in your boto config file. |
- |
-NOTE: It is strongly recommended that you not disable integrity checks. |
-Doing so could allow data corruption to go undetected during |
-uploading/downloading.""" |
- |
- |
-_NO_HASH_CHECK_WARNING = """ |
-WARNING: This download will not be validated since your crcmod installation |
-doesn't use the module's C extension, so the hash computation would likely |
-throttle download performance. For help in installing the extension, please |
-see: |
- $ gsutil help crcmod |
-To force integrity checking, see the "check_hashes" option in your boto config |
-file. |
-""" |
- |
- |
-# Configuration values for hashing. |
-CHECK_HASH_IF_FAST_ELSE_FAIL = 'if_fast_else_fail' |
-CHECK_HASH_IF_FAST_ELSE_SKIP = 'if_fast_else_skip' |
-CHECK_HASH_ALWAYS = 'always' |
-CHECK_HASH_NEVER = 'never' |
- |
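-# Illustrative boto configuration consuming these values (a hedged sketch; |
-# GetUploadHashAlgs and GetDownloadHashAlgs below read the 'check_hashes' |
-# option from the [GSUtil] section): |
-# |
-#   [GSUtil] |
-#   check_hashes = if_fast_else_fail |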
- |
-def _CalculateHashFromContents(fp, hash_alg): |
- """Calculates a base64 digest of the contents of a seekable stream. |
- |
- This function resets the file pointer to position 0. |
- |
- Args: |
- fp: An already-open file object. |
- hash_alg: Instance of hashing class initialized to start state. |
- |
- Returns: |
- Hash of the stream in hex string format. |
- """ |
- hash_dict = {'placeholder': hash_alg} |
- fp.seek(0) |
- CalculateHashesFromContents(fp, hash_dict) |
- fp.seek(0) |
- return hash_dict['placeholder'].hexdigest() |
- |
- |
-def CalculateHashesFromContents(fp, hash_dict, callback_processor=None): |
- """Calculates hashes of the contents of a file. |
- |
- Args: |
- fp: An already-open file object (stream will be consumed). |
-    hash_dict: Dict of (string alg_name: initialized hash object). |
-        Each hash object will be populated with a digest upon return. |
- callback_processor: Optional callback processing class that implements |
- Progress(integer amount of bytes processed). |
- """ |
- while True: |
- data = fp.read(DEFAULT_FILE_BUFFER_SIZE) |
- if not data: |
- break |
- for hash_alg in hash_dict.itervalues(): |
- hash_alg.update(data) |
- if callback_processor: |
- callback_processor.Progress(len(data)) |
- |
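-# Minimal usage sketch for CalculateHashesFromContents (the file name is a |
-# placeholder; the digest objects are updated in place): |
-# |
-#   with open('data.bin', 'rb') as fp: |
-#     digests = {'md5': md5()} |
-#     CalculateHashesFromContents(fp, digests) |
-#     print digests['md5'].hexdigest() |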
- |
-def CalculateB64EncodedCrc32cFromContents(fp): |
- """Calculates a base64 CRC32c checksum of the contents of a seekable stream. |
- |
-  This function sets the stream position to 0 before and after calculation. |
- |
- Args: |
- fp: An already-open file object. |
- |
- Returns: |
- CRC32c checksum of the file in base64 format. |
- """ |
- return _CalculateB64EncodedHashFromContents( |
- fp, crcmod.predefined.Crc('crc-32c')) |
- |
- |
-def CalculateB64EncodedMd5FromContents(fp): |
- """Calculates a base64 MD5 digest of the contents of a seekable stream. |
- |
-  This function sets the stream position to 0 before and after calculation. |
- |
- Args: |
- fp: An already-open file object. |
- |
- Returns: |
- MD5 digest of the file in base64 format. |
- """ |
- return _CalculateB64EncodedHashFromContents(fp, md5()) |
- |
- |
-def CalculateMd5FromContents(fp): |
- """Calculates a base64 MD5 digest of the contents of a seekable stream. |
- |
- This function sets the stream position 0 before and after calculation. |
- |
- Args: |
- fp: An already-open file object. |
- |
- Returns: |
- MD5 digest of the file in hex format. |
- """ |
- return _CalculateHashFromContents(fp, md5()) |
- |
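-# Hedged sketch of the three public helpers above on one open file (the |
-# path is a placeholder; each call leaves the stream position at 0): |
-# |
-#   with open('/tmp/example', 'rb') as fp: |
-#     b64_crc = CalculateB64EncodedCrc32cFromContents(fp) |
-#     b64_md5 = CalculateB64EncodedMd5FromContents(fp) |
-#     hex_md5 = CalculateMd5FromContents(fp) |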
- |
-def Base64EncodeHash(digest_value): |
- """Returns the base64-encoded version of the input hex digest value.""" |
- return base64.encodestring(binascii.unhexlify(digest_value)).rstrip('\n') |
- |
- |
-def Base64ToHexHash(base64_hash): |
- """Returns the hex digest value of the input base64-encoded hash. |
- |
- Args: |
- base64_hash: Base64-encoded hash, which may contain newlines and single or |
- double quotes. |
- |
- Returns: |
- Hex digest of the input argument. |
- """ |
- return binascii.hexlify(base64.decodestring(base64_hash.strip('\n"\''))) |
- |
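-# Worked round trip between the two encodings, using the well-known MD5 of |
-# the empty string: |
-# |
-#   >>> Base64EncodeHash('d41d8cd98f00b204e9800998ecf8427e') |
-#   '1B2M2Y8AsgTpgAmY7PhCfg==' |
-#   >>> Base64ToHexHash('1B2M2Y8AsgTpgAmY7PhCfg==') |
-#   'd41d8cd98f00b204e9800998ecf8427e' |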
- |
-def _CalculateB64EncodedHashFromContents(fp, hash_alg): |
- """Calculates a base64 digest of the contents of a seekable stream. |
- |
-  This function sets the stream position to 0 before and after calculation. |
- |
- Args: |
- fp: An already-open file object. |
- hash_alg: Instance of hashing class initialized to start state. |
- |
- Returns: |
- Hash of the stream in base64 format. |
- """ |
- return Base64EncodeHash(_CalculateHashFromContents(fp, hash_alg)) |
- |
- |
-def GetUploadHashAlgs(): |
- """Returns a dict of hash algorithms for validating an uploaded object. |
- |
- This is for use only with single object uploads, not compose operations |
- such as those used by parallel composite uploads (though it can be used to |
- validate the individual components). |
- |
- Returns: |
- dict of (algorithm_name: hash_algorithm) |
- """ |
- check_hashes_config = config.get( |
- 'GSUtil', 'check_hashes', CHECK_HASH_IF_FAST_ELSE_FAIL) |
-  if check_hashes_config == CHECK_HASH_NEVER: |
- return {} |
- return {'md5': md5} |
- |
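-# Usage sketch: the returned dict maps algorithm names to constructors, so |
-# a caller builds fresh digesters from it (variable names are illustrative): |
-# |
-#   digesters = dict( |
-#       (alg, constructor()) |
-#       for alg, constructor in GetUploadHashAlgs().iteritems()) |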
- |
-def GetDownloadHashAlgs(logger, src_has_md5=False, src_has_crc32c=False): |
- """Returns a dict of hash algorithms for validating an object. |
- |
- Args: |
- logger: logging.Logger for outputting log messages. |
- src_has_md5: If True, source object has an md5 hash. |
- src_has_crc32c: If True, source object has a crc32c hash. |
- |
- Returns: |
- Dict of (string, hash algorithm). |
- |
- Raises: |
- CommandException if hash algorithms satisfying the boto config file |
- cannot be returned. |
- """ |
- check_hashes_config = config.get( |
- 'GSUtil', 'check_hashes', CHECK_HASH_IF_FAST_ELSE_FAIL) |
- if check_hashes_config == CHECK_HASH_NEVER: |
- return {} |
- |
- hash_algs = {} |
- if src_has_md5: |
- hash_algs['md5'] = md5 |
- elif src_has_crc32c: |
-    # If the cloud provider supplies a CRC and MD5 isn't offered as an |
-    # alternative, we'll validate with a checksum only if we're using a |
-    # crcmod installation with its compiled C extension. |
- if UsingCrcmodExtension(crcmod): |
- hash_algs['crc32c'] = lambda: crcmod.predefined.Crc('crc-32c') |
- elif not hash_algs: |
- if check_hashes_config == CHECK_HASH_IF_FAST_ELSE_FAIL: |
- raise CommandException(_SLOW_CRC_EXCEPTION_TEXT) |
- elif check_hashes_config == CHECK_HASH_IF_FAST_ELSE_SKIP: |
- logger.warn(_NO_HASH_CHECK_WARNING) |
- elif check_hashes_config == CHECK_HASH_ALWAYS: |
- logger.warn(_SLOW_CRCMOD_DOWNLOAD_WARNING) |
- hash_algs['crc32c'] = lambda: crcmod.predefined.Crc('crc-32c') |
- else: |
- raise CommandException( |
- 'Your boto config \'check_hashes\' option is misconfigured.') |
- |
- return hash_algs |
- |
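-# Hedged example: choosing download validators for an object that carries |
-# only a CRC32c (the logger here is illustrative): |
-# |
-#   import logging |
-#   algs = GetDownloadHashAlgs(logging.getLogger(), src_has_crc32c=True) |
-#   # With crcmod's C extension installed: algs == {'crc32c': <constructor>} |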
- |
-class HashingFileUploadWrapper(object): |
- """Wraps an input stream in a hash digester and exposes a stream interface. |
- |
- This class provides integrity checking during file uploads via the |
- following properties: |
- |
- Calls to read will appropriately update digesters with all bytes read. |
- Calls to seek (assuming it is supported by the wrapped stream) using |
- os.SEEK_SET will catch up / reset the digesters to the specified |
- position. If seek is called with a different os.SEEK mode, the caller |
- must return to the original position using os.SEEK_SET before further |
- reads. |
- Calls to seek are fast if the desired position is equal to the position at |
- the beginning of the last read call (we only need to re-hash bytes |
- from that point on). |
- """ |
- |
- def __init__(self, stream, digesters, hash_algs, src_url, logger): |
- """Initializes the wrapper. |
- |
- Args: |
- stream: Input stream. |
- digesters: dict of {string: hash digester} containing digesters, where |
- string is the name of the hash algorithm. |
- hash_algs: dict of {string: hash algorithm} for resetting and |
- recalculating digesters. String is the name of the hash algorithm. |
- src_url: Source FileUrl that is being copied. |
- logger: For outputting log messages. |
- """ |
- if not digesters: |
- raise CommandException('HashingFileUploadWrapper used with no digesters.') |
- elif not hash_algs: |
- raise CommandException('HashingFileUploadWrapper used with no hash_algs.') |
- |
- self._orig_fp = stream |
- self._digesters = digesters |
- self._src_url = src_url |
- self._logger = logger |
- self._seek_away = None |
- |
- self._digesters_previous = {} |
- for alg in self._digesters: |
- self._digesters_previous[alg] = self._digesters[alg].copy() |
- self._digesters_previous_mark = 0 |
- self._digesters_current_mark = 0 |
- self._hash_algs = hash_algs |
- |
- def read(self, size=-1): # pylint: disable=invalid-name |
- """"Reads from the wrapped file pointer and calculates hash digests. |
- |
- Args: |
-      size: The number of bytes to read. If omitted or negative, the entire |
- contents of the file will be read, hashed, and returned. |
- |
- Returns: |
- Bytes from the wrapped stream. |
- |
- Raises: |
- CommandException if the position of the wrapped stream is unknown. |
- """ |
- if self._seek_away is not None: |
- raise CommandException('Read called on hashing file pointer in an ' |
- 'unknown position; cannot correctly compute ' |
- 'digest.') |
- |
- data = self._orig_fp.read(size) |
- self._digesters_previous_mark = self._digesters_current_mark |
- for alg in self._digesters: |
- self._digesters_previous[alg] = self._digesters[alg].copy() |
- self._digesters[alg].update(data) |
- self._digesters_current_mark += len(data) |
- return data |
- |
- def tell(self): # pylint: disable=invalid-name |
- """Returns the current stream position.""" |
- return self._orig_fp.tell() |
- |
- def seekable(self): # pylint: disable=invalid-name |
- """Returns true if the stream is seekable.""" |
- return self._orig_fp.seekable() |
- |
- def seek(self, offset, whence=os.SEEK_SET): # pylint: disable=invalid-name |
- """Seeks in the wrapped file pointer and catches up hash digests. |
- |
- Args: |
- offset: The offset to seek to. |
-      whence: os.SEEK_SET, os.SEEK_CUR, or os.SEEK_END. |
- |
- Returns: |
- Return value from the wrapped stream's seek call. |
- """ |
- if whence != os.SEEK_SET: |
- # We do not catch up hashes for non-absolute seeks, and rely on the |
- # caller to seek to an absolute position before reading. |
- self._seek_away = self._orig_fp.tell() |
- |
- else: |
- # Hashes will be correct and it's safe to call read(). |
- self._seek_away = None |
- if offset < self._digesters_previous_mark: |
- # This is earlier than our earliest saved digest, so we need to |
- # reset the digesters and scan from the beginning. |
- for alg in self._digesters: |
- self._digesters[alg] = self._hash_algs[alg]() |
- self._digesters_current_mark = 0 |
- self._orig_fp.seek(0) |
- self._CatchUp(offset) |
- |
- elif offset == self._digesters_previous_mark: |
- # Just load the saved digests. |
- self._digesters_current_mark = self._digesters_previous_mark |
- for alg in self._digesters: |
- self._digesters[alg] = self._digesters_previous[alg] |
- |
- elif offset < self._digesters_current_mark: |
- # Reset the position to our previous digest and scan forward. |
- self._digesters_current_mark = self._digesters_previous_mark |
- for alg in self._digesters: |
- self._digesters[alg] = self._digesters_previous[alg] |
- self._orig_fp.seek(self._digesters_previous_mark) |
- self._CatchUp(offset - self._digesters_previous_mark) |
- |
- else: |
- # Scan forward from our current digest and position. |
- self._orig_fp.seek(self._digesters_current_mark) |
- self._CatchUp(offset - self._digesters_current_mark) |
- |
- return self._orig_fp.seek(offset, whence) |
- |
- def _CatchUp(self, bytes_to_read): |
- """Catches up hashes, but does not return data and uses little memory. |
- |
-    Before calling this function, self._digesters_current_mark should be |
-    updated to the current position of the original stream, and |
-    self._digesters should be current to that point (but no further). |
- |
- Args: |
- bytes_to_read: Number of bytes to catch up from the original stream. |
- """ |
- if self._orig_fp.tell() != self._digesters_current_mark: |
- raise CommandException( |
- 'Invalid mark when catching up hashes. Stream position %s, hash ' |
- 'position %s' % (self._orig_fp.tell(), self._digesters_current_mark)) |
- |
- for alg in self._digesters: |
- if bytes_to_read >= MIN_SIZE_COMPUTE_LOGGING: |
- self._logger.info('Catching up %s for %s...', alg, |
- self._src_url.url_string) |
- self._digesters_previous[alg] = self._digesters[alg].copy() |
- |
- self._digesters_previous_mark = self._digesters_current_mark |
- bytes_remaining = bytes_to_read |
- bytes_this_round = min(bytes_remaining, TRANSFER_BUFFER_SIZE) |
- while bytes_this_round: |
- data = self._orig_fp.read(bytes_this_round) |
- bytes_remaining -= bytes_this_round |
- for alg in self._digesters: |
- self._digesters[alg].update(data) |
- bytes_this_round = min(bytes_remaining, TRANSFER_BUFFER_SIZE) |
- self._digesters_current_mark += bytes_to_read |
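- |
- |
-# Minimal end-to-end sketch for HashingFileUploadWrapper (src_url and |
-# logger are stand-ins for the FileUrl and logging.Logger a real caller |
-# supplies): |
-# |
-#   digesters = {'md5': md5()} |
-#   hash_algs = {'md5': md5} |
-#   wrapped = HashingFileUploadWrapper( |
-#       open('payload.bin', 'rb'), digesters, hash_algs, src_url, logger) |
-#   while wrapped.read(TRANSFER_BUFFER_SIZE): |
-#     pass |
-#   upload_md5_hex = digesters['md5'].hexdigest() |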