Chromium Code Reviews

Side by Side Diff: third_party/gsutil/gslib/copy_helper.py

Issue 1377933002: [catapult] - Copy Telemetry's gsutilz over to third_party. (Closed) Base URL: https://github.com/catapult-project/catapult.git@master
Patch Set: Rename to gsutil. Created 5 years, 2 months ago
1 # -*- coding: utf-8 -*-
2 # Copyright 2011 Google Inc. All Rights Reserved.
3 # Copyright 2011, Nexenta Systems Inc.
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 """Helper functions for copy functionality."""
17
18 from __future__ import absolute_import
19
20 import base64
21 from collections import namedtuple
22 import csv
23 import datetime
24 import errno
25 import gzip
26 from hashlib import md5
27 import json
28 import logging
29 import mimetypes
30 import multiprocessing
31 import os
32 import pickle
33 import random
34 import re
35 import shutil
36 import stat
37 import subprocess
38 import tempfile
39 import textwrap
40 import time
41 import traceback
42
43 from boto import config
44 import crcmod
45
46 import gslib
47 from gslib.cloud_api import ArgumentException
48 from gslib.cloud_api import CloudApi
49 from gslib.cloud_api import NotFoundException
50 from gslib.cloud_api import PreconditionException
51 from gslib.cloud_api import Preconditions
52 from gslib.cloud_api import ResumableDownloadException
53 from gslib.cloud_api import ResumableUploadAbortException
54 from gslib.cloud_api import ResumableUploadException
55 from gslib.cloud_api import ResumableUploadStartOverException
56 from gslib.cloud_api_helper import GetDownloadSerializationDict
57 from gslib.commands.compose import MAX_COMPOSE_ARITY
58 from gslib.commands.config import DEFAULT_PARALLEL_COMPOSITE_UPLOAD_COMPONENT_SIZE
59 from gslib.commands.config import DEFAULT_PARALLEL_COMPOSITE_UPLOAD_THRESHOLD
60 from gslib.cs_api_map import ApiSelector
61 from gslib.daisy_chain_wrapper import DaisyChainWrapper
62 from gslib.exception import CommandException
63 from gslib.exception import HashMismatchException
64 from gslib.file_part import FilePart
65 from gslib.hashing_helper import Base64EncodeHash
66 from gslib.hashing_helper import CalculateB64EncodedMd5FromContents
67 from gslib.hashing_helper import CalculateHashesFromContents
68 from gslib.hashing_helper import GetDownloadHashAlgs
69 from gslib.hashing_helper import GetUploadHashAlgs
70 from gslib.hashing_helper import HashingFileUploadWrapper
71 from gslib.parallelism_framework_util import ThreadAndProcessSafeDict
72 from gslib.parallelism_framework_util import ThreadSafeDict
73 from gslib.progress_callback import ConstructAnnounceText
74 from gslib.progress_callback import FileProgressCallbackHandler
75 from gslib.progress_callback import ProgressCallbackWithBackoff
76 from gslib.resumable_streaming_upload import ResumableStreamingJsonUploadWrapper
77 from gslib.storage_url import ContainsWildcard
78 from gslib.storage_url import StorageUrlFromString
79 from gslib.third_party.storage_apitools import storage_v1_messages as apitools_messages
80 from gslib.tracker_file import DeleteTrackerFile
81 from gslib.tracker_file import GetTrackerFilePath
82 from gslib.tracker_file import RaiseUnwritableTrackerFileException
83 from gslib.tracker_file import ReadOrCreateDownloadTrackerFile
84 from gslib.tracker_file import TrackerFileType
85 from gslib.translation_helper import AddS3MarkerAclToObjectMetadata
86 from gslib.translation_helper import CopyObjectMetadata
87 from gslib.translation_helper import DEFAULT_CONTENT_TYPE
88 from gslib.translation_helper import GenerationFromUrlAndString
89 from gslib.translation_helper import ObjectMetadataFromHeaders
90 from gslib.translation_helper import PreconditionsFromHeaders
91 from gslib.translation_helper import S3MarkerAclFromObjectMetadata
92 from gslib.util import CreateLock
93 from gslib.util import DEFAULT_FILE_BUFFER_SIZE
94 from gslib.util import GetCloudApiInstance
95 from gslib.util import GetFileSize
96 from gslib.util import GetJsonResumableChunkSize
97 from gslib.util import GetMaxRetryDelay
98 from gslib.util import GetNumRetries
99 from gslib.util import GetStreamFromFileUrl
100 from gslib.util import HumanReadableToBytes
101 from gslib.util import IS_WINDOWS
102 from gslib.util import IsCloudSubdirPlaceholder
103 from gslib.util import MakeHumanReadable
104 from gslib.util import MIN_SIZE_COMPUTE_LOGGING
105 from gslib.util import MultiprocessingIsAvailable
106 from gslib.util import ResumableThreshold
107 from gslib.util import TEN_MIB
108 from gslib.util import UTF8
109 from gslib.wildcard_iterator import CreateWildcardIterator
110
111 # pylint: disable=g-import-not-at-top
112 if IS_WINDOWS:
113 import msvcrt
114 from ctypes import c_int
115 from ctypes import c_uint64
116 from ctypes import c_char_p
117 from ctypes import c_wchar_p
118 from ctypes import windll
119 from ctypes import POINTER
120 from ctypes import WINFUNCTYPE
121 from ctypes import WinError
122
123 # Declare copy_helper_opts as a global because namedtuple isn't aware of
124 # assigning to a class member (which breaks pickling done by multiprocessing).
125 # For details see
126 # http://stackoverflow.com/questions/16377215/how-to-pickle-a-namedtuple-instance-correctly
127 # Similarly can't pickle logger.
128 # pylint: disable=global-at-module-level
129 global global_copy_helper_opts, global_logger
130
131 # In-memory map of local files that are currently opened for write. Used to
132 # ensure that if we write to the same file twice (say, for example, because the
133 # user specified two identical source URLs), the writes occur serially.
134 global open_files_map
135 open_files_map = (
136 ThreadSafeDict() if (IS_WINDOWS or not MultiprocessingIsAvailable()[0])
137 else ThreadAndProcessSafeDict(multiprocessing.Manager()))
138
139 # For debugging purposes; if True, files and objects that fail hash validation
140 # will be saved with the below suffix appended.
141 _RENAME_ON_HASH_MISMATCH = False
142 _RENAME_ON_HASH_MISMATCH_SUFFIX = '_corrupt'
143
144 PARALLEL_UPLOAD_TEMP_NAMESPACE = (
145 u'/gsutil/tmp/parallel_composite_uploads/for_details_see/gsutil_help_cp/')
146
147 PARALLEL_UPLOAD_STATIC_SALT = u"""
148 PARALLEL_UPLOAD_SALT_TO_PREVENT_COLLISIONS.
149 The theory is that no user will have prepended this to the front of
150 one of their object names and then done an MD5 hash of the name, and
151 then prepended PARALLEL_UPLOAD_TEMP_NAMESPACE to the front of their object
152 name. Note that there will be no problems with object name length since we
153 hash the original name.
154 """
155
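Reviewer note: a minimal, stdlib-only sketch of how this salt is combined with the original object name further down in _PartitionFile to build fixed-length temporary component names; the prefix, file name, and index below are hypothetical.

from hashlib import md5

def _example_component_name(random_prefix, salt, namespace, file_name, i):
  # Hash salt + original name so the component name length stays bounded.
  digest = md5((salt + file_name).encode('utf-8')).hexdigest()
  return random_prefix + namespace + digest + '_' + str(i)

# e.g. _example_component_name('4a7f', PARALLEL_UPLOAD_STATIC_SALT,
#                              PARALLEL_UPLOAD_TEMP_NAMESPACE, u'video.mp4', 0)
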
156 # When uploading a file, get the following fields in the response for
157 # filling in command output and manifests.
158 UPLOAD_RETURN_FIELDS = ['crc32c', 'etag', 'generation', 'md5Hash', 'size']
159
160 # This tuple is used only to encapsulate the arguments needed for
161 # command.Apply() in the parallel composite upload case.
162 # Note that content_type is used instead of a full apitools Object() because
163 # apitools objects are not picklable.
164 # filename: String name of file.
165 # file_start: start byte of file (may be in the middle of a file for partitioned
166 # files).
167 # file_length: length of upload (may not be the entire length of a file for
168 # partitioned files).
169 # src_url: FileUrl describing the source file.
170 # dst_url: CloudUrl describing the destination component file.
171 # canned_acl: canned_acl to apply to the uploaded file/component.
172 # content_type: content-type for final object, used for setting content-type
173 # of components and final object.
174 # tracker_file: tracker file for this component.
175 # tracker_file_lock: tracker file lock for tracker file(s).
176 PerformParallelUploadFileToObjectArgs = namedtuple(
177 'PerformParallelUploadFileToObjectArgs',
178 'filename file_start file_length src_url dst_url canned_acl '
179 'content_type tracker_file tracker_file_lock')
180
181 ObjectFromTracker = namedtuple('ObjectFromTracker',
182 'object_name generation')
183
184 # TODO: Refactor this file to be less cumbersome. In particular, some of the
185 # different paths (e.g., uploading a file to an object vs. downloading an
186 # object to a file) could be split into separate files.
187
188 # Chunk size to use while zipping/unzipping gzip files.
189 GZIP_CHUNK_SIZE = 8192
190
191 PARALLEL_COMPOSITE_SUGGESTION_THRESHOLD = 150 * 1024 * 1024
192
193 # S3 requires special Multipart upload logic (that we currently don't implement)
194 # for files > 5GiB in size.
195 S3_MAX_UPLOAD_SIZE = 5 * 1024 * 1024 * 1024
196
197 suggested_parallel_composites = False
198
199
200 class FileConcurrencySkipError(Exception):
201 """Raised when skipping a file due to a concurrent, duplicate copy."""
202
203
204 def _RmExceptionHandler(cls, e):
205 """Simple exception handler to allow post-completion status."""
206 cls.logger.error(str(e))
207
208
209 def _ParallelUploadCopyExceptionHandler(cls, e):
210 """Simple exception handler to allow post-completion status."""
211 cls.logger.error(str(e))
212 cls.op_failure_count += 1
213 cls.logger.debug('\n\nEncountered exception while copying:\n%s\n',
214 traceback.format_exc())
215
216
217 def _PerformParallelUploadFileToObject(cls, args, thread_state=None):
218 """Function argument to Apply for performing parallel composite uploads.
219
220 Args:
221 cls: Calling Command class.
222 args: PerformParallelUploadFileToObjectArgs tuple describing the target.
223 thread_state: gsutil Cloud API instance to use for the operation.
224
225 Returns:
226 StorageUrl representing a successfully uploaded component.
227 """
228 fp = FilePart(args.filename, args.file_start, args.file_length)
229 gsutil_api = GetCloudApiInstance(cls, thread_state=thread_state)
230 with fp:
231 # We take many precautions with the component names that make collisions
232 # effectively impossible. Specifying preconditions will just allow us to
233 # reach a state in which uploads will always fail on retries.
234 preconditions = None
235
236 # Fill in content type if one was provided.
237 dst_object_metadata = apitools_messages.Object(
238 name=args.dst_url.object_name,
239 bucket=args.dst_url.bucket_name,
240 contentType=args.content_type)
241
242 try:
243 if global_copy_helper_opts.canned_acl:
244 # No canned ACL support in JSON, force XML API to be used for
245 # upload/copy operations.
246 orig_prefer_api = gsutil_api.prefer_api
247 gsutil_api.prefer_api = ApiSelector.XML
248 ret = _UploadFileToObject(args.src_url, fp, args.file_length,
249 args.dst_url, dst_object_metadata,
250 preconditions, gsutil_api, cls.logger, cls,
251 _ParallelUploadCopyExceptionHandler,
252 gzip_exts=None, allow_splitting=False)
253 finally:
254 if global_copy_helper_opts.canned_acl:
255 gsutil_api.prefer_api = orig_prefer_api
256
257 component = ret[2]
258 _AppendComponentTrackerToParallelUploadTrackerFile(
259 args.tracker_file, component, args.tracker_file_lock)
260 return ret
261
262
263 CopyHelperOpts = namedtuple('CopyHelperOpts', [
264 'perform_mv',
265 'no_clobber',
266 'daisy_chain',
267 'read_args_from_stdin',
268 'print_ver',
269 'use_manifest',
270 'preserve_acl',
271 'canned_acl',
272 'skip_unsupported_objects',
273 'test_callback_file'])
274
275
276 # pylint: disable=global-variable-undefined
277 def CreateCopyHelperOpts(perform_mv=False, no_clobber=False, daisy_chain=False,
278 read_args_from_stdin=False, print_ver=False,
279 use_manifest=False, preserve_acl=False,
280 canned_acl=None, skip_unsupported_objects=False,
281 test_callback_file=None):
282 """Creates CopyHelperOpts for passing options to CopyHelper."""
283 # We create a tuple with the union of options needed by CopyHelper and any
284 # copy-related functionality in CpCommand, RsyncCommand, or Command class.
285 global global_copy_helper_opts
286 global_copy_helper_opts = CopyHelperOpts(
287 perform_mv=perform_mv,
288 no_clobber=no_clobber,
289 daisy_chain=daisy_chain,
290 read_args_from_stdin=read_args_from_stdin,
291 print_ver=print_ver,
292 use_manifest=use_manifest,
293 preserve_acl=preserve_acl,
294 canned_acl=canned_acl,
295 skip_unsupported_objects=skip_unsupported_objects,
296 test_callback_file=test_callback_file)
297 return global_copy_helper_opts
298
299
300 # pylint: disable=global-variable-undefined
301 # pylint: disable=global-variable-not-assigned
302 def GetCopyHelperOpts():
303 """Returns namedtuple holding CopyHelper options."""
304 global global_copy_helper_opts
305 return global_copy_helper_opts
306
307
308 def _SelectDownloadStrategy(dst_url):
309 """Get download strategy based on the destination object.
310
311 Args:
312 dst_url: Destination StorageUrl.
313
314 Returns:
315 gsutil Cloud API DownloadStrategy.
316 """
317 dst_is_special = False
318 if dst_url.IsFileUrl():
319 # Check explicitly first because os.stat doesn't work on 'nul' in Windows.
320 if dst_url.object_name == os.devnull:
321 dst_is_special = True
322 try:
323 mode = os.stat(dst_url.object_name).st_mode
324 if stat.S_ISCHR(mode):
325 dst_is_special = True
326 except OSError:
327 pass
328
329 if dst_is_special:
330 return CloudApi.DownloadStrategy.ONE_SHOT
331 else:
332 return CloudApi.DownloadStrategy.RESUMABLE
333
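Reviewer note: the special-destination check above can be exercised with the standard library alone; a minimal sketch (the device path in the comment is illustrative).

import os
import stat

def _example_is_special_file(path):
  # Mirrors the logic above: os.devnull, or a character device such as a tty.
  if path == os.devnull:
    return True
  try:
    return stat.S_ISCHR(os.stat(path).st_mode)
  except OSError:
    return False

# _example_is_special_file(os.devnull) -> True; a regular file returns False,
# so it gets the RESUMABLE download strategy.
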
334
335 def _GetUploadTrackerData(tracker_file_name, logger):
336 """Reads tracker data from an upload tracker file if it exists.
337
338 Args:
339 tracker_file_name: Tracker file name for this upload.
340 logger: for outputting log messages.
341
342 Returns:
343 Serialization data if the tracker file already exists (resume existing
344 upload), None otherwise.
345 """
346 tracker_file = None
347
348 # If we already have a matching tracker file, get the serialization data
349 # so that we can resume the upload.
350 try:
351 tracker_file = open(tracker_file_name, 'r')
352 tracker_data = tracker_file.read()
353 return tracker_data
354 except IOError as e:
355 # Ignore non-existent file (happens the first time an upload is attempted on an
356 # object, or when re-starting an upload after a
357 # ResumableUploadStartOverException), but warn user for other errors.
358 if e.errno != errno.ENOENT:
359 logger.warn('Couldn\'t read upload tracker file (%s): %s. Restarting '
360 'upload from scratch.', tracker_file_name, e.strerror)
361 finally:
362 if tracker_file:
363 tracker_file.close()
364
365
366 def InsistDstUrlNamesContainer(exp_dst_url, have_existing_dst_container,
367 command_name):
368 """Ensures the destination URL names a container.
369
370 Acceptable containers include directory, bucket, bucket
371 subdir, and non-existent bucket subdir.
372
373 Args:
374 exp_dst_url: Wildcard-expanded destination StorageUrl.
375 have_existing_dst_container: bool indicator of whether exp_dst_url
376 names a container (directory, bucket, or existing bucket subdir).
377 command_name: Name of command making call. May not be the same as the
378 calling class's self.command_name in the case of commands implemented
379 atop other commands (like mv command).
380
381 Raises:
382 CommandException: if the URL being checked does not name a container.
383 """
384 if ((exp_dst_url.IsFileUrl() and not exp_dst_url.IsDirectory()) or
385 (exp_dst_url.IsCloudUrl() and exp_dst_url.IsBucket()
386 and not have_existing_dst_container)):
387 raise CommandException('Destination URL must name a directory, bucket, '
388 'or bucket\nsubdirectory for the multiple '
389 'source form of the %s command.' % command_name)
390
391
392 def _ShouldTreatDstUrlAsBucketSubDir(have_multiple_srcs, dst_url,
393 have_existing_dest_subdir,
394 src_url_names_container,
395 recursion_requested):
396 """Checks whether dst_url should be treated as a bucket "sub-directory".
397
398 The decision about whether something constitutes a bucket "sub-directory"
399 depends on whether there are multiple sources in this request and whether
400 there is an existing bucket subdirectory. For example, when running the
401 command:
402 gsutil cp file gs://bucket/abc
403 if there's no existing gs://bucket/abc bucket subdirectory we should copy
404 file to the object gs://bucket/abc. In contrast, if
405 there's an existing gs://bucket/abc bucket subdirectory we should copy
406 file to gs://bucket/abc/file. And regardless of whether gs://bucket/abc
407 exists, when running the command:
408 gsutil cp file1 file2 gs://bucket/abc
409 we should copy file1 to gs://bucket/abc/file1 (and similarly for file2).
410 Finally, for recursive copies, if the source is a container then we should
411 copy to a container as the target. For example, when running the command:
412 gsutil cp -r dir1 gs://bucket/dir2
413 we should copy the subtree of dir1 to gs://bucket/dir2.
414
415 Note that we don't disallow naming a bucket "sub-directory" where there's
416 already an object at that URL. For example it's legitimate (albeit
417 confusing) to have an object called gs://bucket/dir and
418 then run the command
419 gsutil cp file1 file2 gs://bucket/dir
420 Doing so will end up with objects gs://bucket/dir, gs://bucket/dir/file1,
421 and gs://bucket/dir/file2.
422
423 Args:
424 have_multiple_srcs: Bool indicator of whether this is a multi-source
425 operation.
426 dst_url: StorageUrl to check.
427 have_existing_dest_subdir: bool indicator whether dest is an existing
428 subdirectory.
429 src_url_names_container: bool indicator of whether the source URL
430 is a container.
431 recursion_requested: True if a recursive operation has been requested.
432
433 Returns:
434 bool indicator.
435 """
436 if have_existing_dest_subdir:
437 return True
438 if dst_url.IsCloudUrl():
439 return (have_multiple_srcs or
440 (src_url_names_container and recursion_requested))
441
442
443 def _ShouldTreatDstUrlAsSingleton(have_multiple_srcs,
444 have_existing_dest_subdir, dst_url,
445 recursion_requested):
446 """Checks that dst_url names a single file/object after wildcard expansion.
447
448 It is possible that an object path might name a bucket sub-directory.
449
450 Args:
451 have_multiple_srcs: Bool indicator of whether this is a multi-source
452 operation.
453 have_existing_dest_subdir: bool indicator whether dest is an existing
454 subdirectory.
455 dst_url: StorageUrl to check.
456 recursion_requested: True if a recursive operation has been requested.
457
458 Returns:
459 bool indicator.
460 """
461 if recursion_requested:
462 return False
463 if dst_url.IsFileUrl():
464 return not dst_url.IsDirectory()
465 else: # dst_url.IsCloudUrl()
466 return (not have_multiple_srcs and
467 not have_existing_dest_subdir and
468 dst_url.IsObject())
469
470
471 def ConstructDstUrl(src_url, exp_src_url, src_url_names_container,
472 have_multiple_srcs, exp_dst_url, have_existing_dest_subdir,
473 recursion_requested):
474 """Constructs the destination URL for a given exp_src_url/exp_dst_url pair.
475
476 Uses context-dependent naming rules that mimic Linux cp and mv behavior.
477
478 Args:
479 src_url: Source StorageUrl to be copied.
480 exp_src_url: Single StorageUrl from wildcard expansion of src_url.
481 src_url_names_container: True if src_url names a container (including the
482 case of a wildcard-named bucket subdir (like gs://bucket/abc,
483 where gs://bucket/abc/* matched some objects)).
484 have_multiple_srcs: True if this is a multi-source request. This can be
485 true if src_url wildcard-expanded to multiple URLs or if there were
486 multiple source URLs in the request.
487 exp_dst_url: the expanded StorageUrl requested for the cp destination.
488 Final written path is constructed from this plus a context-dependent
489 variant of src_url.
490 have_existing_dest_subdir: bool indicator whether dest is an existing
491 subdirectory.
492 recursion_requested: True if a recursive operation has been requested.
493
494 Returns:
495 StorageUrl to use for copy.
496
497 Raises:
498 CommandException if destination object name not specified for
499 source and source is a stream.
500 """
501 if _ShouldTreatDstUrlAsSingleton(
502 have_multiple_srcs, have_existing_dest_subdir, exp_dst_url,
503 recursion_requested):
504 # We're copying one file or object to one file or object.
505 return exp_dst_url
506
507 if exp_src_url.IsFileUrl() and exp_src_url.IsStream():
508 if have_existing_dest_subdir:
509 raise CommandException('Destination object name needed when '
510 'source is a stream')
511 return exp_dst_url
512
513 if not recursion_requested and not have_multiple_srcs:
514 # We're copying one file or object to a subdirectory. Append final comp
515 # of exp_src_url to exp_dst_url.
516 src_final_comp = exp_src_url.object_name.rpartition(src_url.delim)[-1]
517 return StorageUrlFromString('%s%s%s' % (
518 exp_dst_url.url_string.rstrip(exp_dst_url.delim),
519 exp_dst_url.delim, src_final_comp))
520
521 # Else we're copying multiple sources to a directory, bucket, or a bucket
522 # "sub-directory".
523
524 # Ensure exp_dst_url ends in delim char if we're doing a multi-src copy or
525 # a copy to a directory. (The check for copying to a directory needs
526 # special-case handling so that the command:
527 # gsutil cp gs://bucket/obj dir
528 # will turn into file://dir/ instead of file://dir -- the latter would cause
529 # the file "dirobj" to be created.)
530 # Note: need to check have_multiple_srcs or src_url.names_container()
531 # because src_url could be a bucket containing a single object, named
532 # as gs://bucket.
533 if ((have_multiple_srcs or src_url_names_container or
534 (exp_dst_url.IsFileUrl() and exp_dst_url.IsDirectory()))
535 and not exp_dst_url.url_string.endswith(exp_dst_url.delim)):
536 exp_dst_url = StorageUrlFromString('%s%s' % (exp_dst_url.url_string,
537 exp_dst_url.delim))
538
539 # Making naming behavior match how things work with local Linux cp and mv
540 # operations depends on many factors, including whether the destination is a
541 # container, the plurality of the source(s), and whether the mv command is
542 # being used:
543 # 1. For the "mv" command that specifies a non-existent destination subdir,
544 # renaming should occur at the level of the src subdir, vs appending that
545 # subdir beneath the dst subdir like is done for copying. For example:
546 # gsutil rm -r gs://bucket
547 # gsutil cp -r dir1 gs://bucket
548 # gsutil cp -r dir2 gs://bucket/subdir1
549 # gsutil mv gs://bucket/subdir1 gs://bucket/subdir2
550 # would (if using cp naming behavior) end up with paths like:
551 # gs://bucket/subdir2/subdir1/dir2/.svn/all-wcprops
552 # whereas mv naming behavior should result in:
553 # gs://bucket/subdir2/dir2/.svn/all-wcprops
554 # 2. Copying from directories, buckets, or bucket subdirs should result in
555 # objects/files mirroring the source directory hierarchy. For example:
556 # gsutil cp dir1/dir2 gs://bucket
557 # should create the object gs://bucket/dir2/file2, assuming dir1/dir2
558 # contains file2.
559 # To be consistent with Linux cp behavior, there's one more wrinkle when
560 # working with subdirs: The resulting object names depend on whether the
561 # destination subdirectory exists. For example, if gs://bucket/subdir
562 # exists, the command:
563 # gsutil cp -r dir1/dir2 gs://bucket/subdir
564 # should create objects named like gs://bucket/subdir/dir2/a/b/c. In
565 # contrast, if gs://bucket/subdir does not exist, this same command
566 # should create objects named like gs://bucket/subdir/a/b/c.
567 # 3. Copying individual files or objects to dirs, buckets or bucket subdirs
568 # should result in objects/files named by the final source file name
569 # component. Example:
570 # gsutil cp dir1/*.txt gs://bucket
571 # should create the objects gs://bucket/f1.txt and gs://bucket/f2.txt,
572 # assuming dir1 contains f1.txt and f2.txt.
573
574 recursive_move_to_new_subdir = False
575 if (global_copy_helper_opts.perform_mv and recursion_requested
576 and src_url_names_container and not have_existing_dest_subdir):
577 # Case 1. Handle naming rules for bucket subdir mv. Here we want to
578 # line up the src_url against its expansion, to find the base to build
579 # the new name. For example, running the command:
580 # gsutil mv gs://bucket/abcd gs://bucket/xyz
581 # when processing exp_src_url=gs://bucket/abcd/123
582 # exp_src_url_tail should become /123
583 # Note: mv.py code disallows wildcard specification of source URL.
584 recursive_move_to_new_subdir = True
585 exp_src_url_tail = (
586 exp_src_url.url_string[len(src_url.url_string):])
587 dst_key_name = '%s/%s' % (exp_dst_url.object_name.rstrip('/'),
588 exp_src_url_tail.strip('/'))
589
590 elif src_url_names_container and (exp_dst_url.IsCloudUrl() or
591 exp_dst_url.IsDirectory()):
592 # Case 2. Container copy to a destination other than a file.
593 # Build dst_key_name from subpath of exp_src_url past
594 # where src_url ends. For example, for src_url=gs://bucket/ and
595 # exp_src_url=gs://bucket/src_subdir/obj, dst_key_name should be
596 # src_subdir/obj.
597 src_url_path_sans_final_dir = GetPathBeforeFinalDir(src_url)
598 dst_key_name = exp_src_url.versionless_url_string[
599 len(src_url_path_sans_final_dir):].lstrip(src_url.delim)
600 # Handle case where dst_url is a non-existent subdir.
601 if not have_existing_dest_subdir:
602 dst_key_name = dst_key_name.partition(src_url.delim)[-1]
603 # Handle special case where src_url was a directory named with '.' or
604 # './', so that running a command like:
605 # gsutil cp -r . gs://dest
606 # will produce obj names of the form gs://dest/abc instead of
607 # gs://dest/./abc.
608 if dst_key_name.startswith('.%s' % os.sep):
609 dst_key_name = dst_key_name[2:]
610
611 else:
612 # Case 3.
613 dst_key_name = exp_src_url.object_name.rpartition(src_url.delim)[-1]
614
615 if (not recursive_move_to_new_subdir and (
616 exp_dst_url.IsFileUrl() or _ShouldTreatDstUrlAsBucketSubDir(
617 have_multiple_srcs, exp_dst_url, have_existing_dest_subdir,
618 src_url_names_container, recursion_requested))):
619 if exp_dst_url.object_name and exp_dst_url.object_name.endswith(
620 exp_dst_url.delim):
621 dst_key_name = '%s%s%s' % (
622 exp_dst_url.object_name.rstrip(exp_dst_url.delim),
623 exp_dst_url.delim, dst_key_name)
624 else:
625 delim = exp_dst_url.delim if exp_dst_url.object_name else ''
626 dst_key_name = '%s%s%s' % (exp_dst_url.object_name or '',
627 delim, dst_key_name)
628
629 new_exp_dst_url = exp_dst_url.Clone()
630 new_exp_dst_url.object_name = dst_key_name.replace(src_url.delim,
631 exp_dst_url.delim)
632 return new_exp_dst_url
633
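Reviewer note: for the single-source, non-recursive case the naming above reduces to appending the final source path component to the destination; a stdlib-only sketch with hypothetical names.

def _example_single_file_dst_name(src_object_name, dst_prefix, delim='/'):
  # 'dir1/f1.txt' copied under existing subdir 'abc' becomes 'abc/f1.txt'.
  final_comp = src_object_name.rpartition(delim)[-1]
  if not dst_prefix:
    return final_comp
  return dst_prefix.rstrip(delim) + delim + final_comp

# _example_single_file_dst_name('dir1/f1.txt', 'abc') -> 'abc/f1.txt'
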
634
635 def _CreateDigestsFromDigesters(digesters):
636 digests = {}
637 if digesters:
638 for alg in digesters:
639 digests[alg] = base64.encodestring(
640 digesters[alg].digest()).rstrip('\n')
641 return digests
642
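Reviewer note: what the helper above produces for a single MD5 digester, sketched with the standard library only; the input bytes are hypothetical.

import base64
from hashlib import md5

digester = md5('hello world')  # Python 2 str; use b'hello world' on Python 3.
b64_digest = base64.encodestring(digester.digest()).rstrip('\n')
# b64_digest has the same base64 form as the md5Hash field in object metadata.
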
643
644 def _CreateDigestsFromLocalFile(logger, algs, file_name, src_obj_metadata):
645 """Creates a base64 CRC32C and/or MD5 digest from file_name.
646
647 Args:
648 logger: for outputting log messages.
649 algs: list of algorithms to compute.
650 file_name: file to digest.
651 src_obj_metadata: metadata of the source object.
652
653 Returns:
654 Dict of algorithm name : base 64 encoded digest
655 """
656 hash_dict = {}
657 if 'md5' in algs:
658 if src_obj_metadata.size and src_obj_metadata.size > TEN_MIB:
659 logger.info(
660 'Computing MD5 for %s...', file_name)
661 hash_dict['md5'] = md5()
662 if 'crc32c' in algs:
663 hash_dict['crc32c'] = crcmod.predefined.Crc('crc-32c')
664 with open(file_name, 'rb') as fp:
665 CalculateHashesFromContents(
666 fp, hash_dict, ProgressCallbackWithBackoff(
667 src_obj_metadata.size,
668 FileProgressCallbackHandler(
669 ConstructAnnounceText('Hashing', file_name), logger).call))
670 digests = {}
671 for alg_name, digest in hash_dict.iteritems():
672 digests[alg_name] = Base64EncodeHash(digest.hexdigest())
673 return digests
674
675
676 def _CheckCloudHashes(logger, src_url, dst_url, src_obj_metadata,
677 dst_obj_metadata):
678 """Validates integrity of two cloud objects copied via daisy-chain.
679
680 Args:
681 logger: for outputting log messages.
682 src_url: CloudUrl for source cloud object.
683 dst_url: CloudUrl for destination cloud object.
684 src_obj_metadata: Cloud Object metadata for object being downloaded from.
685 dst_obj_metadata: Cloud Object metadata for object being uploaded to.
686
687 Raises:
688 CommandException: if cloud digests don't match local digests.
689 """
690 checked_one = False
691 download_hashes = {}
692 upload_hashes = {}
693 if src_obj_metadata.md5Hash:
694 download_hashes['md5'] = src_obj_metadata.md5Hash
695 if src_obj_metadata.crc32c:
696 download_hashes['crc32c'] = src_obj_metadata.crc32c
697 if dst_obj_metadata.md5Hash:
698 upload_hashes['md5'] = dst_obj_metadata.md5Hash
699 if dst_obj_metadata.crc32c:
700 upload_hashes['crc32c'] = dst_obj_metadata.crc32c
701
702 for alg, upload_b64_digest in upload_hashes.iteritems():
703 if alg not in download_hashes:
704 continue
705
706 download_b64_digest = download_hashes[alg]
707 logger.debug(
708 'Comparing source vs destination %s-checksum for %s. (%s/%s)', alg,
709 dst_url, download_b64_digest, upload_b64_digest)
710 if download_b64_digest != upload_b64_digest:
711 raise HashMismatchException(
712 '%s signature for source object (%s) doesn\'t match '
713 'destination object digest (%s). Object (%s) will be deleted.' % (
714 alg, download_b64_digest, upload_b64_digest, dst_url))
715 checked_one = True
716 if not checked_one:
717 # One known way this can currently happen is when downloading objects larger
718 # than 5 GiB from S3 (for which the etag is not an MD5).
719 logger.warn(
720 'WARNING: Found no hashes to validate object downloaded from %s and '
721 'uploaded to %s. Integrity cannot be assured without hashes.',
722 src_url, dst_url)
723
724
725 def _CheckHashes(logger, obj_url, obj_metadata, file_name, digests,
726 is_upload=False):
727 """Validates integrity by comparing cloud digest to local digest.
728
729 Args:
730 logger: for outputting log messages.
731 obj_url: CloudUrl for cloud object.
732 obj_metadata: Cloud Object being downloaded from or uploaded to.
733 file_name: Local file name on disk being downloaded to or uploaded from.
734 digests: Computed Digests for the object.
735 is_upload: If true, comparing for an uploaded object (controls logging).
736
737 Raises:
738 CommandException: if cloud digests don't match local digests.
739 """
740 local_hashes = digests
741 cloud_hashes = {}
742 if obj_metadata.md5Hash:
743 cloud_hashes['md5'] = obj_metadata.md5Hash.rstrip('\n')
744 if obj_metadata.crc32c:
745 cloud_hashes['crc32c'] = obj_metadata.crc32c.rstrip('\n')
746
747 checked_one = False
748 for alg in local_hashes:
749 if alg not in cloud_hashes:
750 continue
751
752 local_b64_digest = local_hashes[alg]
753 cloud_b64_digest = cloud_hashes[alg]
754 logger.debug(
755 'Comparing local vs cloud %s-checksum for %s. (%s/%s)', alg, file_name,
756 local_b64_digest, cloud_b64_digest)
757 if local_b64_digest != cloud_b64_digest:
758
759 raise HashMismatchException(
760 '%s signature computed for local file (%s) doesn\'t match '
761 'cloud-supplied digest (%s). %s (%s) will be deleted.' % (
762 alg, local_b64_digest, cloud_b64_digest,
763 'Cloud object' if is_upload else 'Local file',
764 obj_url if is_upload else file_name))
765 checked_one = True
766 if not checked_one:
767 if is_upload:
768 logger.warn(
769 'WARNING: Found no hashes to validate object uploaded to %s. '
770 'Integrity cannot be assured without hashes.', obj_url)
771 else:
772 # One known way this can currently happen is when downloading objects larger
773 # than 5 GiB from S3 (for which the etag is not an MD5).
774 logger.warn(
775 'WARNING: Found no hashes to validate object downloaded to %s. '
776 'Integrity cannot be assured without hashes.', file_name)
777
778
779 def IsNoClobberServerException(e):
780 """Checks to see if the server attempted to clobber a file.
781
782 In this case we specified via a precondition that we didn't want the file
783 clobbered.
784
785 Args:
786 e: The Exception that was generated by a failed copy operation
787
788 Returns:
789 bool indicator - True indicates that the server did attempt to clobber
790 an existing file.
791 """
792 return ((isinstance(e, PreconditionException)) or
793 (isinstance(e, ResumableUploadException) and '412' in e.message))
794
795
796 def CheckForDirFileConflict(exp_src_url, dst_url):
797 """Checks whether copying exp_src_url into dst_url is not possible.
798
799 This happens if a directory exists in local file system where a file
800 needs to go or vice versa. In that case we print an error message and
801 exits. Example: if the file "./x" exists and you try to do:
802 gsutil cp gs://mybucket/x/y .
803 the request can't succeed because it requires a directory where
804 the file x exists.
805
806 Note that we don't enforce any corresponding restrictions for buckets,
807 because the flat namespace semantics for buckets doesn't prohibit such
808 cases the way hierarchical file systems do. For example, if a bucket
809 contains an object called gs://bucket/dir and then you run the command:
810 gsutil cp file1 file2 gs://bucket/dir
811 you'll end up with objects gs://bucket/dir, gs://bucket/dir/file1, and
812 gs://bucket/dir/file2.
813
814 Args:
815 exp_src_url: Expanded source StorageUrl.
816 dst_url: Destination StorageUrl.
817
818 Raises:
819 CommandException: if errors encountered.
820 """
821 if dst_url.IsCloudUrl():
822 # The problem can only happen for file destination URLs.
823 return
824 dst_path = dst_url.object_name
825 final_dir = os.path.dirname(dst_path)
826 if os.path.isfile(final_dir):
827 raise CommandException('Cannot retrieve %s because a file exists '
828 'where a directory needs to be created (%s).' %
829 (exp_src_url.url_string, final_dir))
830 if os.path.isdir(dst_path):
831 raise CommandException('Cannot retrieve %s because a directory exists '
832 '(%s) where the file needs to be created.' %
833 (exp_src_url.url_string, dst_path))
834
835
836 def _PartitionFile(fp, file_size, src_url, content_type, canned_acl,
837 dst_bucket_url, random_prefix, tracker_file,
838 tracker_file_lock):
839 """Partitions a file into FilePart objects to be uploaded and later composed.
840
841 These objects, when composed, will match the original file. This entails
842 splitting the file into parts, naming and forming a destination URL for each
843 part, and also providing the PerformParallelUploadFileToObjectArgs
844 corresponding to each part.
845
846 Args:
847 fp: The file object to be partitioned.
848 file_size: The size of fp, in bytes.
849 src_url: Source FileUrl from the original command.
850 content_type: content type for the component and final objects.
851 canned_acl: The user-provided canned_acl, if applicable.
852 dst_bucket_url: CloudUrl for the destination bucket.
853 random_prefix: The randomly-generated prefix used to prevent collisions
854 among the temporary component names.
855 tracker_file: The path to the parallel composite upload tracker file.
856 tracker_file_lock: The lock protecting access to the tracker file.
857
858 Returns:
859 dst_args: Dict of temporary component names to PerformParallelUploadFileToObjectArgs.
860 """
861 parallel_composite_upload_component_size = HumanReadableToBytes(
862 config.get('GSUtil', 'parallel_composite_upload_component_size',
863 DEFAULT_PARALLEL_COMPOSITE_UPLOAD_COMPONENT_SIZE))
864 (num_components, component_size) = _GetPartitionInfo(
865 file_size, MAX_COMPOSE_ARITY, parallel_composite_upload_component_size)
866
867 dst_args = {} # Arguments to create commands and pass to subprocesses.
868 file_names = [] # Used for the 2-step process of forming dst_args.
869 for i in range(num_components):
870 # "Salt" the object name with something a user is very unlikely to have
871 # used in an object name, then hash the extended name to make sure
872 # we don't run into problems with name length. Using a deterministic
873 # naming scheme for the temporary components allows users to take
874 # advantage of resumable uploads for each component.
875 encoded_name = (PARALLEL_UPLOAD_STATIC_SALT + fp.name).encode(UTF8)
876 content_md5 = md5()
877 content_md5.update(encoded_name)
878 digest = content_md5.hexdigest()
879 temp_file_name = (random_prefix + PARALLEL_UPLOAD_TEMP_NAMESPACE +
880 digest + '_' + str(i))
881 tmp_dst_url = dst_bucket_url.Clone()
882 tmp_dst_url.object_name = temp_file_name
883
884 if i < (num_components - 1):
885 # Every component except possibly the last is the same size.
886 file_part_length = component_size
887 else:
888 # The last component just gets all of the remaining bytes.
889 file_part_length = (file_size - ((num_components - 1) * component_size))
890 offset = i * component_size
891 func_args = PerformParallelUploadFileToObjectArgs(
892 fp.name, offset, file_part_length, src_url, tmp_dst_url, canned_acl,
893 content_type, tracker_file, tracker_file_lock)
894 file_names.append(temp_file_name)
895 dst_args[temp_file_name] = func_args
896
897 return dst_args
898
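Reviewer note: the split arithmetic itself lives in _GetPartitionInfo (referenced above but not visible in this hunk); a hedged, stdlib-only sketch of the general idea, with MAX_COMPOSE_ARITY assumed to be the component-count cap.

import math

def _example_partition(file_size, component_size, max_components=32):
  # Hypothetical sketch: cap the number of parts; the last part gets the rest.
  num = min(max_components, int(math.ceil(file_size / float(component_size))))
  size = int(math.ceil(file_size / float(num)))
  last = file_size - (num - 1) * size
  return num, size, last

# _example_partition(100, 30) -> (4, 25, 25)
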
899
900 def _DoParallelCompositeUpload(fp, src_url, dst_url, dst_obj_metadata,
901 canned_acl, file_size, preconditions, gsutil_api,
902 command_obj, copy_exception_handler):
903 """Uploads a local file to a cloud object using parallel composite upload.
904
905 The file is partitioned into parts, and then the parts are uploaded in
906 parallel, composed to form the original destination object, and deleted.
907
908 Args:
909 fp: The file object to be uploaded.
910 src_url: FileUrl representing the local file.
911 dst_url: CloudUrl representing the destination file.
912 dst_obj_metadata: apitools Object describing the destination object.
913 canned_acl: The canned acl to apply to the object, if any.
914 file_size: The size of the source file in bytes.
915 preconditions: Cloud API Preconditions for the final object.
916 gsutil_api: gsutil Cloud API instance to use.
917 command_obj: Command object (for calling Apply).
918 copy_exception_handler: Copy exception handler (for use in Apply).
919
920 Returns:
921 Elapsed upload time, uploaded Object with generation, crc32c, and size
922 fields populated.
923 """
924 start_time = time.time()
925 dst_bucket_url = StorageUrlFromString(dst_url.bucket_url_string)
926 api_selector = gsutil_api.GetApiSelector(provider=dst_url.scheme)
927 # Determine which components, if any, have already been successfully
928 # uploaded.
929 tracker_file = GetTrackerFilePath(dst_url, TrackerFileType.PARALLEL_UPLOAD,
930 api_selector, src_url)
931 tracker_file_lock = CreateLock()
932 (random_prefix, existing_components) = (
933 _ParseParallelUploadTrackerFile(tracker_file, tracker_file_lock))
934
935 # Create the initial tracker file for the upload.
936 _CreateParallelUploadTrackerFile(tracker_file, random_prefix,
937 existing_components, tracker_file_lock)
938
939 # Get the set of all components that should be uploaded.
940 dst_args = _PartitionFile(
941 fp, file_size, src_url, dst_obj_metadata.contentType, canned_acl,
942 dst_bucket_url, random_prefix, tracker_file, tracker_file_lock)
943
944 (components_to_upload, existing_components, existing_objects_to_delete) = (
945 FilterExistingComponents(dst_args, existing_components, dst_bucket_url,
946 gsutil_api))
947
948 # In parallel, copy all of the file parts that haven't already been
949 # uploaded to temporary objects.
950 cp_results = command_obj.Apply(
951 _PerformParallelUploadFileToObject, components_to_upload,
952 copy_exception_handler, ('op_failure_count', 'total_bytes_transferred'),
953 arg_checker=gslib.command.DummyArgChecker,
954 parallel_operations_override=True, should_return_results=True)
955 uploaded_components = []
956 for cp_result in cp_results:
957 uploaded_components.append(cp_result[2])
958 components = uploaded_components + existing_components
959
960 if len(components) == len(dst_args):
961 # Only try to compose if all of the components were uploaded successfully.
962
963 def _GetComponentNumber(component):
964 return int(component.object_name[component.object_name.rfind('_')+1:])
965 # Sort the components so that they will be composed in the correct order.
966 components = sorted(components, key=_GetComponentNumber)
967
968 request_components = []
969 for component_url in components:
970 src_obj_metadata = (
971 apitools_messages.ComposeRequest.SourceObjectsValueListEntry(
972 name=component_url.object_name))
973 if component_url.HasGeneration():
974 src_obj_metadata.generation = long(component_url.generation)
975 request_components.append(src_obj_metadata)
976
977 composed_object = gsutil_api.ComposeObject(
978 request_components, dst_obj_metadata, preconditions=preconditions,
979 provider=dst_url.scheme, fields=['generation', 'crc32c', 'size'])
980
981 try:
982 # Make sure only to delete things that we know were successfully
983 # uploaded (as opposed to all of the objects that we attempted to
984 # create) so that we don't delete any preexisting objects, except for
985 # those that were uploaded by a previous, failed run and have since
986 # changed (but still have an old generation lying around).
987 objects_to_delete = components + existing_objects_to_delete
988 command_obj.Apply(_DeleteObjectFn, objects_to_delete, _RmExceptionHandler,
989 arg_checker=gslib.command.DummyArgChecker,
990 parallel_operations_override=True)
991 except Exception: # pylint: disable=broad-except
992 # If some of the delete calls fail, don't cause the whole command to
993 # fail. The copy was successful iff the compose call succeeded, so
994 # reduce this to a warning.
995 logging.warning(
996 'Failed to delete some of the following temporary objects:\n' +
997 '\n'.join(dst_args.keys()))
998 finally:
999 with tracker_file_lock:
1000 if os.path.exists(tracker_file):
1001 os.unlink(tracker_file)
1002 else:
1003 # Some of the components failed to upload. In this case, we want to exit
1004 # without deleting the objects.
1005 raise CommandException(
1006 'Some temporary components were not uploaded successfully. '
1007 'Please retry this upload.')
1008
1009 elapsed_time = time.time() - start_time
1010 return elapsed_time, composed_object
1011
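Reviewer note: the numeric key in _GetComponentNumber above matters because a plain lexicographic sort would order '_10' before '_2'; a tiny sketch with hypothetical component names.

names = ['abc_10', 'abc_2', 'abc_0']
sorted(names)                                           # ['abc_0', 'abc_10', 'abc_2']
sorted(names, key=lambda n: int(n[n.rfind('_') + 1:]))  # ['abc_0', 'abc_2', 'abc_10']
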
1012
1013 def _ShouldDoParallelCompositeUpload(logger, allow_splitting, src_url, dst_url,
1014 file_size, canned_acl=None):
1015 """Determines whether parallel composite upload strategy should be used.
1016
1017 Args:
1018 logger: for outputting log messages.
1019 allow_splitting: If false, then this function returns false.
1020 src_url: FileUrl corresponding to a local file.
1021 dst_url: CloudUrl corresponding to destination cloud object.
1022 file_size: The size of the source file, in bytes.
1023 canned_acl: Canned ACL to apply to destination object, if any.
1024
1025 Returns:
1026 True iff a parallel upload should be performed on the source file.
1027 """
1028 global suggested_parallel_composites
1029 parallel_composite_upload_threshold = HumanReadableToBytes(config.get(
1030 'GSUtil', 'parallel_composite_upload_threshold',
1031 DEFAULT_PARALLEL_COMPOSITE_UPLOAD_THRESHOLD))
1032
1033 all_factors_but_size = (
1034 allow_splitting # Don't split the pieces multiple times.
1035 and not src_url.IsStream() # We can't partition streams.
1036 and dst_url.scheme == 'gs' # Compose is only for gs.
1037 and not canned_acl) # TODO: Implement canned ACL support for compose.
1038
1039 # Since parallel composite uploads are disabled by default, make the user
1040 # aware of them.
1041 # TODO: Once compiled crcmod is being distributed by major Linux distributions
1042 # remove this check.
1043 if (all_factors_but_size and parallel_composite_upload_threshold == 0
1044 and file_size >= PARALLEL_COMPOSITE_SUGGESTION_THRESHOLD
1045 and not suggested_parallel_composites):
1046 logger.info('\n'.join(textwrap.wrap(
1047 '==> NOTE: You are uploading one or more large file(s), which would '
1048 'run significantly faster if you enable parallel composite uploads. '
1049 'This feature can be enabled by editing the '
1050 '"parallel_composite_upload_threshold" value in your .boto '
1051 'configuration file. However, note that if you do this you and any '
1052 'users that download such composite files will need to have a compiled '
1053 'crcmod installed (see "gsutil help crcmod").')) + '\n')
1054 suggested_parallel_composites = True
1055
1056 return (all_factors_but_size
1057 and parallel_composite_upload_threshold > 0
1058 and file_size >= parallel_composite_upload_threshold)
1059
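Reviewer note: the two boto settings consulted above, shown as a hedged example; the values are illustrative, not recommendations.

# In the user's .boto configuration file:
#   [GSUtil]
#   parallel_composite_upload_threshold = 150M
#   parallel_composite_upload_component_size = 50M
#
# A threshold of 0 (the default) disables parallel composite uploads, which is
# what triggers the suggestion message above for large files.
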
1060
1061 def ExpandUrlToSingleBlr(url_str, gsutil_api, debug, project_id,
1062 treat_nonexistent_object_as_subdir=False):
1063 """Expands wildcard if present in url_str.
1064
1065 Args:
1066 url_str: String representation of requested url.
1067 gsutil_api: gsutil Cloud API instance to use.
1068 debug: debug level to use (for iterators).
1069 project_id: project ID to use (for iterators).
1070 treat_nonexistent_object_as_subdir: indicates whether a non-existent
1071 object should be treated as a subdir.
1072
1073 Returns:
1074 (exp_url, have_existing_dst_container)
1075 where exp_url is a StorageUrl
1076 and have_existing_dst_container is a bool indicating whether
1077 exp_url names an existing directory, bucket, or bucket subdirectory.
1078 In the case where we match a subdirectory AND an object, the
1079 object is returned.
1080
1081 Raises:
1082 CommandException: if url_str matched more than 1 URL.
1083 """
1084 # Handle wildcarded url case.
1085 if ContainsWildcard(url_str):
1086 blr_expansion = list(CreateWildcardIterator(url_str, gsutil_api,
1087 debug=debug,
1088 project_id=project_id))
1089 if len(blr_expansion) != 1:
1090 raise CommandException('Destination (%s) must match exactly 1 URL' %
1091 url_str)
1092 blr = blr_expansion[0]
1093 # BLR is either an OBJECT, PREFIX, or BUCKET; the latter two represent
1094 # directories.
1095 return (StorageUrlFromString(blr.url_string), not blr.IsObject())
1096
1097 storage_url = StorageUrlFromString(url_str)
1098
1099 # Handle non-wildcarded URL.
1100 if storage_url.IsFileUrl():
1101 return (storage_url, storage_url.IsDirectory())
1102
1103 # At this point we have a cloud URL.
1104 if storage_url.IsBucket():
1105 return (storage_url, True)
1106
1107 # For object/prefix URLs check 3 cases: (a) if the name ends with '/' treat
1108 # as a subdir; otherwise, use the wildcard iterator with url to
1109 # find if (b) there's a Prefix matching url, or (c) name is of form
1110 # dir_$folder$ (and in both these cases also treat dir as a subdir).
1111 # Cloud subdirs are always considered to be an existing container.
1112 if IsCloudSubdirPlaceholder(storage_url):
1113 return (storage_url, True)
1114
1115 # Check for the special case where we have a folder marker object.
1116 folder_expansion = CreateWildcardIterator(
1117 storage_url.versionless_url_string + '_$folder$', gsutil_api,
1118 debug=debug, project_id=project_id).IterAll(
1119 bucket_listing_fields=['name'])
1120 for blr in folder_expansion:
1121 return (storage_url, True)
1122
1123 blr_expansion = CreateWildcardIterator(url_str, gsutil_api,
1124 debug=debug,
1125 project_id=project_id).IterAll(
1126 bucket_listing_fields=['name'])
1127 expansion_empty = True
1128 for blr in blr_expansion:
1129 expansion_empty = False
1130 if blr.IsPrefix():
1131 return (storage_url, True)
1132
1133 return (storage_url,
1134 expansion_empty and treat_nonexistent_object_as_subdir)
1135
1136
1137 def FixWindowsNaming(src_url, dst_url):
1138 """Translates Windows pathnames to cloud pathnames.
1139
1140 Rewrites the destination URL built by ConstructDstUrl().
1141
1142 Args:
1143 src_url: Source StorageUrl to be copied.
1144 dst_url: The destination StorageUrl built by ConstructDstUrl().
1145
1146 Returns:
1147 StorageUrl to use for copy.
1148 """
1149 if (src_url.IsFileUrl() and src_url.delim == '\\'
1150 and dst_url.IsCloudUrl()):
1151 trans_url_str = re.sub(r'\\', '/', dst_url.url_string)
1152 dst_url = StorageUrlFromString(trans_url_str)
1153 return dst_url
1154
1155
1156 def SrcDstSame(src_url, dst_url):
1157 """Checks if src_url and dst_url represent the same object or file.
1158
1159 We don't handle anything about hard or symbolic links.
1160
1161 Args:
1162 src_url: Source StorageUrl.
1163 dst_url: Destination StorageUrl.
1164
1165 Returns:
1166 Bool indicator.
1167 """
1168 if src_url.IsFileUrl() and dst_url.IsFileUrl():
1169 # Translate a/b/./c to a/b/c, so src=dst comparison below works.
1170 new_src_path = os.path.normpath(src_url.object_name)
1171 new_dst_path = os.path.normpath(dst_url.object_name)
1172 return new_src_path == new_dst_path
1173 else:
1174 return (src_url.url_string == dst_url.url_string and
1175 src_url.generation == dst_url.generation)
1176
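Reviewer note: the local-file comparison above is just path normalization; a one-line sketch.

import os

# Both spellings name the same local file, so the copy would be flagged as src==dst.
os.path.normpath('a/b/./c') == os.path.normpath('a/b/c')  # True
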
1177
1178 def _LogCopyOperation(logger, src_url, dst_url, dst_obj_metadata):
1179 """Logs copy operation, including Content-Type if appropriate.
1180
1181 Args:
1182 logger: logger instance to use for output.
1183 src_url: Source StorageUrl.
1184 dst_url: Destination StorageUrl.
1185 dst_obj_metadata: Object-specific metadata that should be overridden during
1186 the copy.
1187 """
1188 if (dst_url.IsCloudUrl() and dst_obj_metadata and
1189 dst_obj_metadata.contentType):
1190 content_type_msg = ' [Content-Type=%s]' % dst_obj_metadata.contentType
1191 else:
1192 content_type_msg = ''
1193 if src_url.IsFileUrl() and src_url.IsStream():
1194 logger.info('Copying from <STDIN>%s...', content_type_msg)
1195 else:
1196 logger.info('Copying %s%s...', src_url.url_string, content_type_msg)
1197
1198
1199 # pylint: disable=undefined-variable
1200 def _CopyObjToObjInTheCloud(src_url, src_obj_metadata, dst_url,
1201 dst_obj_metadata, preconditions, gsutil_api,
1202 logger):
1203 """Performs copy-in-the cloud from specified src to dest object.
1204
1205 Args:
1206 src_url: Source CloudUrl.
1207 src_obj_metadata: Metadata for source object; must include etag and size.
1208 dst_url: Destination CloudUrl.
1209 dst_obj_metadata: Object-specific metadata that should be overridden during
1210 the copy.
1211 preconditions: Preconditions to use for the copy.
1212 gsutil_api: gsutil Cloud API instance to use for the copy.
1213 logger: logging.Logger for log message output.
1214
1215 Returns:
1216 (elapsed_time, bytes_transferred, dst_url with generation,
1217 md5 hash of destination) excluding overhead like initial GET.
1218
1219 Raises:
1220 CommandException: if errors encountered.
1221 """
1222 start_time = time.time()
1223
1224 progress_callback = FileProgressCallbackHandler(
1225 ConstructAnnounceText('Copying', dst_url.url_string), logger).call
1226 if global_copy_helper_opts.test_callback_file:
1227 with open(global_copy_helper_opts.test_callback_file, 'rb') as test_fp:
1228 progress_callback = pickle.loads(test_fp.read()).call
1229 dst_obj = gsutil_api.CopyObject(
1230 src_obj_metadata, dst_obj_metadata, src_generation=src_url.generation,
1231 canned_acl=global_copy_helper_opts.canned_acl,
1232 preconditions=preconditions, progress_callback=progress_callback,
1233 provider=dst_url.scheme, fields=UPLOAD_RETURN_FIELDS)
1234
1235 end_time = time.time()
1236
1237 result_url = dst_url.Clone()
1238 result_url.generation = GenerationFromUrlAndString(result_url,
1239 dst_obj.generation)
1240
1241 return (end_time - start_time, src_obj_metadata.size, result_url,
1242 dst_obj.md5Hash)
1243
1244
1245 def _CheckFreeSpace(path):
1246 """Return path/drive free space (in bytes)."""
1247 if IS_WINDOWS:
1248 # pylint: disable=g-import-not-at-top
1249 try:
1250 # pylint: disable=invalid-name
1251 get_disk_free_space_ex = WINFUNCTYPE(c_int, c_wchar_p,
1252 POINTER(c_uint64),
1253 POINTER(c_uint64),
1254 POINTER(c_uint64))
1255 get_disk_free_space_ex = get_disk_free_space_ex(
1256 ('GetDiskFreeSpaceExW', windll.kernel32), (
1257 (1, 'lpszPathName'),
1258 (2, 'lpFreeUserSpace'),
1259 (2, 'lpTotalSpace'),
1260 (2, 'lpFreeSpace'),))
1261 except AttributeError:
1262 get_disk_free_space_ex = WINFUNCTYPE(c_int, c_char_p,
1263 POINTER(c_uint64),
1264 POINTER(c_uint64),
1265 POINTER(c_uint64))
1266 get_disk_free_space_ex = get_disk_free_space_ex(
1267 ('GetDiskFreeSpaceExA', windll.kernel32), (
1268 (1, 'lpszPathName'),
1269 (2, 'lpFreeUserSpace'),
1270 (2, 'lpTotalSpace'),
1271 (2, 'lpFreeSpace'),))
1272
1273 def GetDiskFreeSpaceExErrCheck(result, unused_func, args):
1274 if not result:
1275 raise WinError()
1276 return args[1].value
1277 get_disk_free_space_ex.errcheck = GetDiskFreeSpaceExErrCheck
1278
1279 return get_disk_free_space_ex(os.getenv('SystemDrive'))
1280 else:
1281 (_, f_frsize, _, _, f_bavail, _, _, _, _, _) = os.statvfs(path)
1282 return f_frsize * f_bavail
1283
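Reviewer note: the non-Windows branch is plain statvfs arithmetic; a stdlib-only sketch (the path is illustrative).

import os

def _example_free_bytes(path='/tmp'):
  st = os.statvfs(path)  # POSIX only.
  # Fragment size times blocks available to unprivileged users.
  return st.f_frsize * st.f_bavail
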
1284
1285 def _SetContentTypeFromFile(src_url, dst_obj_metadata):
1286 """Detects and sets Content-Type if src_url names a local file.
1287
1288 Args:
1289 src_url: Source StorageUrl.
1290 dst_obj_metadata: Object-specific metadata that should be overridden during
1291 the copy.
1292 """
1293 # contentType == '' if user requested default type.
1294 if (dst_obj_metadata.contentType is None and src_url.IsFileUrl()
1295 and not src_url.IsStream()):
1296 # Only do content type recognition if src_url is a file. Object-to-object
1297 # copies with no -h Content-Type specified re-use the content type of the
1298 # source object.
1299 object_name = src_url.object_name
1300 content_type = None
1301 # Streams (denoted by '-') are expected to be 'application/octet-stream'
1302 # and 'file' would partially consume them.
1303 if object_name != '-':
1304 if config.getbool('GSUtil', 'use_magicfile', False):
1305 p = subprocess.Popen(['file', '--mime-type', object_name],
1306 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
1307 output, error = p.communicate()
1308 p.stdout.close()
1309 p.stderr.close()
1310 if p.returncode != 0 or error:
1311 raise CommandException(
1312 'Encountered error running "file --mime-type %s" '
1313 '(returncode=%d).\n%s' % (object_name, p.returncode, error))
1314 # Parse output by removing the line delimiter and splitting on the last ': '.
1315 content_type = output.rstrip().rpartition(': ')[2]
1316 else:
1317 content_type = mimetypes.guess_type(object_name)[0]
1318 if not content_type:
1319 content_type = DEFAULT_CONTENT_TYPE
1320 dst_obj_metadata.contentType = content_type
1321
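Reviewer note: when use_magicfile is off, the detection above boils down to mimetypes; a minimal sketch with a hypothetical file name.

import mimetypes

content_type = mimetypes.guess_type('report.pdf')[0]  # -> 'application/pdf'
if not content_type:
  content_type = 'application/octet-stream'  # i.e. fall back to DEFAULT_CONTENT_TYPE.
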
1322
1323 # pylint: disable=undefined-variable
1324 def _UploadFileToObjectNonResumable(src_url, src_obj_filestream,
1325 src_obj_size, dst_url, dst_obj_metadata,
1326 preconditions, gsutil_api, logger):
1327 """Uploads the file using a non-resumable strategy.
1328
1329 Args:
1330 src_url: Source StorageUrl to upload.
1331 src_obj_filestream: File pointer to uploadable bytes.
1332 src_obj_size: Size of the source object.
1333 dst_url: Destination StorageUrl for the upload.
1334 dst_obj_metadata: Metadata for the target object.
1335 preconditions: Preconditions for the upload, if any.
1336 gsutil_api: gsutil Cloud API instance to use for the upload.
1337 logger: For outputting log messages.
1338
1339 Returns:
1340 Elapsed upload time, uploaded Object with generation, md5, and size fields
1341 populated.
1342 """
1343 progress_callback = FileProgressCallbackHandler(
1344 ConstructAnnounceText('Uploading', dst_url.url_string), logger).call
1345 if global_copy_helper_opts.test_callback_file:
1346 with open(global_copy_helper_opts.test_callback_file, 'rb') as test_fp:
1347 progress_callback = pickle.loads(test_fp.read()).call
1348 start_time = time.time()
1349
1350 if src_url.IsStream():
1351 # TODO: gsutil-beta: Provide progress callbacks for streaming uploads.
1352 uploaded_object = gsutil_api.UploadObjectStreaming(
1353 src_obj_filestream, object_metadata=dst_obj_metadata,
1354 canned_acl=global_copy_helper_opts.canned_acl,
1355 preconditions=preconditions, progress_callback=progress_callback,
1356 provider=dst_url.scheme, fields=UPLOAD_RETURN_FIELDS)
1357 else:
1358 uploaded_object = gsutil_api.UploadObject(
1359 src_obj_filestream, object_metadata=dst_obj_metadata,
1360 canned_acl=global_copy_helper_opts.canned_acl, size=src_obj_size,
1361 preconditions=preconditions, progress_callback=progress_callback,
1362 provider=dst_url.scheme, fields=UPLOAD_RETURN_FIELDS)
1363 end_time = time.time()
1364 elapsed_time = end_time - start_time
1365
1366 return elapsed_time, uploaded_object
1367
1368
1369 # pylint: disable=undefined-variable
1370 def _UploadFileToObjectResumable(src_url, src_obj_filestream,
1371 src_obj_size, dst_url, dst_obj_metadata,
1372 preconditions, gsutil_api, logger):
1373 """Uploads the file using a resumable strategy.
1374
1375 Args:
1376 src_url: Source FileUrl to upload. Must not be a stream.
1377 src_obj_filestream: File pointer to uploadable bytes.
1378 src_obj_size: Size of the source object.
1379 dst_url: Destination StorageUrl for the upload.
1380 dst_obj_metadata: Metadata for the target object.
1381 preconditions: Preconditions for the upload, if any.
1382 gsutil_api: gsutil Cloud API instance to use for the upload.
1383 logger: for outputting log messages.
1384
1385 Returns:
1386 Elapsed upload time, uploaded Object with generation, md5, and size fields
1387 populated.
1388 """
1389 tracker_file_name = GetTrackerFilePath(
1390 dst_url, TrackerFileType.UPLOAD,
1391 gsutil_api.GetApiSelector(provider=dst_url.scheme))
1392
1393 def _UploadTrackerCallback(serialization_data):
1394 """Creates a new tracker file for starting an upload from scratch.
1395
1396 This function is called by the gsutil Cloud API implementation, and the
1397 serialization data is implementation-specific.
1398
1399 Args:
1400 serialization_data: Serialization data used in resuming the upload.
1401 """
1402 tracker_file = None
1403 try:
1404 tracker_file = open(tracker_file_name, 'w')
1405 tracker_file.write(str(serialization_data))
1406 except IOError as e:
1407 RaiseUnwritableTrackerFileException(tracker_file_name, e.strerror)
1408 finally:
1409 if tracker_file:
1410 tracker_file.close()
1411
1412 # This contains the upload URL, which will uniquely identify the
1413 # destination object.
1414 tracker_data = _GetUploadTrackerData(tracker_file_name, logger)
1415 if tracker_data:
1416 logger.info(
1417 'Resuming upload for %s', src_url.url_string)
1418
1419 retryable = True
1420
1421 progress_callback = FileProgressCallbackHandler(
1422 ConstructAnnounceText('Uploading', dst_url.url_string), logger).call
1423 if global_copy_helper_opts.test_callback_file:
1424 with open(global_copy_helper_opts.test_callback_file, 'rb') as test_fp:
1425 progress_callback = pickle.loads(test_fp.read()).call
1426
1427 start_time = time.time()
1428 num_startover_attempts = 0
1429 # This loop causes us to retry when the resumable upload failed in a way that
1430 # requires starting over with a new upload ID. Retries within a single upload
1431 # ID within the current process are handled in
1432 # gsutil_api.UploadObjectResumable, and retries within a single upload ID
1433 # spanning processes happen if an exception not caught below occurs (which
1434 # will leave the tracker file in place and cause the upload ID to be reused
1435 # the next time the user runs gsutil and attempts the same upload).
1436 while retryable:
1437 try:
1438 uploaded_object = gsutil_api.UploadObjectResumable(
1439 src_obj_filestream, object_metadata=dst_obj_metadata,
1440 canned_acl=global_copy_helper_opts.canned_acl,
1441 preconditions=preconditions, provider=dst_url.scheme,
1442 size=src_obj_size, serialization_data=tracker_data,
1443 fields=UPLOAD_RETURN_FIELDS,
1444 tracker_callback=_UploadTrackerCallback,
1445 progress_callback=progress_callback)
1446 retryable = False
1447 except ResumableUploadStartOverException as e:
1448 # This can happen, for example, if the server sends a 410 response code.
1449 # In that case the current resumable upload ID can't be reused, so delete
1450 # the tracker file and try again up to max retries.
1451 num_startover_attempts += 1
1452 retryable = (num_startover_attempts < GetNumRetries())
1453 if not retryable:
1454 raise
1455
1456 # If the server sends a 404 response code, then the upload should only
1457 # be restarted if it was the object (and not the bucket) that was missing.
1458 try:
1459 gsutil_api.GetBucket(dst_obj_metadata.bucket, provider=dst_url.scheme)
1460 except NotFoundException:
1461 raise
1462
1463 logger.info('Restarting upload from scratch after exception %s', e)
1464 DeleteTrackerFile(tracker_file_name)
1465 tracker_data = None
1466 src_obj_filestream.seek(0)
1467 # Reset the progress callback handler.
1468 progress_callback = FileProgressCallbackHandler(
1469 ConstructAnnounceText('Uploading', dst_url.url_string), logger).call
1470 logger.info('\n'.join(textwrap.wrap(
1471 'Resumable upload of %s failed with a response code indicating we '
1472 'need to start over with a new resumable upload ID. Backing off '
1473 'and retrying.' % src_url.url_string)))
1474 time.sleep(min(random.random() * (2 ** num_startover_attempts),
1475 GetMaxRetryDelay()))
1476 except ResumableUploadAbortException:
1477 retryable = False
1478 raise
1479 finally:
1480 if not retryable:
1481 DeleteTrackerFile(tracker_file_name)
1482
1483 end_time = time.time()
1484 elapsed_time = end_time - start_time
1485
1486 return (elapsed_time, uploaded_object)
1487
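# A minimal sketch (not part of gsutil) of the start-over backoff used above:
# the sleep grows roughly as random() * 2**attempt, capped at a maximum delay.
# The 32-second cap here is an illustrative stand-in for GetMaxRetryDelay().
def _ExampleStartOverBackoffDelays(num_attempts=5, max_delay=32):
  import random
  delays = []
  for attempt in range(1, num_attempts + 1):
    delays.append(min(random.random() * (2 ** attempt), max_delay))
  return delays  # Random values, each capped at max_delay seconds.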
1488
1489 def _CompressFileForUpload(src_url, src_obj_filestream, src_obj_size, logger):
1490 """Compresses a to-be-uploaded local file to save bandwidth.
1491
1492 Args:
1493 src_url: Source FileUrl.
1494 src_obj_filestream: Read stream of the source file - will be consumed
1495 and closed.
1496 src_obj_size: Size of the source file.
1497 logger: for outputting log messages.
1498
1499 Returns:
1500 StorageUrl path to compressed file, compressed file size.
1501 """
1502 # TODO: Compress using a streaming model as opposed to all at once here.
1503 if src_obj_size >= MIN_SIZE_COMPUTE_LOGGING:
1504 logger.info(
1505 'Compressing %s (to tmp)...', src_url)
1506 (gzip_fh, gzip_path) = tempfile.mkstemp()
1507 gzip_fp = None
1508 try:
1509 # Check for temp space. Assume the compressed object is at most 2x
1510 # the size of the object (normally should compress to smaller than
1511 # the object)
1512 if _CheckFreeSpace(gzip_path) < 2*int(src_obj_size):
1513 raise CommandException('Inadequate temp space available to compress '
1514 '%s. See the CHANGING TEMP DIRECTORIES section '
1515 'of "gsutil help cp" for more info.' % src_url)
1516 gzip_fp = gzip.open(gzip_path, 'wb')
1517 data = src_obj_filestream.read(GZIP_CHUNK_SIZE)
1518 while data:
1519 gzip_fp.write(data)
1520 data = src_obj_filestream.read(GZIP_CHUNK_SIZE)
1521 finally:
1522 if gzip_fp:
1523 gzip_fp.close()
1524 os.close(gzip_fh)
1525 src_obj_filestream.close()
1526 gzip_size = os.path.getsize(gzip_path)
1527 return StorageUrlFromString(gzip_path), gzip_size
1528
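# A minimal sketch (not part of gsutil) of the chunked compression pattern in
# _CompressFileForUpload above: read fixed-size chunks from a source stream
# and write them through gzip into a temp file. The 8 KiB chunk size is
# illustrative; the real code uses GZIP_CHUNK_SIZE.
def _ExampleGzipStreamToTempFile(src_fp, chunk_size=8192):
  import gzip
  import os
  import tempfile
  (tmp_fd, tmp_path) = tempfile.mkstemp()
  gzip_fp = gzip.open(tmp_path, 'wb')
  try:
    data = src_fp.read(chunk_size)
    while data:
      gzip_fp.write(data)
      data = src_fp.read(chunk_size)
  finally:
    gzip_fp.close()
    os.close(tmp_fd)
  return tmp_path, os.path.getsize(tmp_path)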
1529
1530 def _UploadFileToObject(src_url, src_obj_filestream, src_obj_size,
1531 dst_url, dst_obj_metadata, preconditions, gsutil_api,
1532 logger, command_obj, copy_exception_handler,
1533 gzip_exts=None, allow_splitting=True):
1534 """Uploads a local file to an object.
1535
1536 Args:
1537 src_url: Source FileUrl.
1538 src_obj_filestream: Read stream of the source file to be read and closed.
1539 src_obj_size: Size of the source file.
1540 dst_url: Destination CloudUrl.
1541 dst_obj_metadata: Metadata to be applied to the destination object.
1542 preconditions: Preconditions to use for the copy.
1543 gsutil_api: gsutil Cloud API to use for the copy.
1544 logger: for outputting log messages.
1545 command_obj: command object for use in Apply in parallel composite uploads.
1546 copy_exception_handler: For handling copy exceptions during Apply.
1547 gzip_exts: List of file extensions to gzip prior to upload, if any.
1548 allow_splitting: Whether to allow the file to be split into component
1549 pieces for a parallel composite upload.
1550
1551 Returns:
1552 (elapsed_time, bytes_transferred, dst_url with generation,
1553 md5 hash of destination) excluding overhead like initial GET.
1554
1555 Raises:
1556 CommandException: if errors encountered.
1557 """
1558 if not dst_obj_metadata or not dst_obj_metadata.contentLanguage:
1559 content_language = config.get_value('GSUtil', 'content_language')
1560 if content_language:
1561 dst_obj_metadata.contentLanguage = content_language
1562
1563 fname_parts = src_url.object_name.split('.')
1564 upload_url = src_url
1565 upload_stream = src_obj_filestream
1566 upload_size = src_obj_size
1567 zipped_file = False
1568 if gzip_exts and len(fname_parts) > 1 and fname_parts[-1] in gzip_exts:
1569 upload_url, upload_size = _CompressFileForUpload(
1570 src_url, src_obj_filestream, src_obj_size, logger)
1571 upload_stream = open(upload_url.object_name, 'rb')
1572 dst_obj_metadata.contentEncoding = 'gzip'
1573 zipped_file = True
1574
1575 elapsed_time = None
1576 uploaded_object = None
1577 hash_algs = GetUploadHashAlgs()
1578 digesters = dict((alg, hash_algs[alg]()) for alg in hash_algs or {})
1579
1580 parallel_composite_upload = _ShouldDoParallelCompositeUpload(
1581 logger, allow_splitting, upload_url, dst_url, src_obj_size,
1582 canned_acl=global_copy_helper_opts.canned_acl)
1583
1584 if (src_url.IsStream() and
1585 gsutil_api.GetApiSelector(provider=dst_url.scheme) == ApiSelector.JSON):
1586 orig_stream = upload_stream
1587 # Add limited seekable properties to the stream via buffering.
1588 upload_stream = ResumableStreamingJsonUploadWrapper(
1589 orig_stream, GetJsonResumableChunkSize())
1590
1591 if not parallel_composite_upload and len(hash_algs):
1592 # Parallel composite uploads calculate hashes per-component in subsequent
1593 # calls to this function, but the composition of the final object is a
1594 # cloud-only operation.
1595 wrapped_filestream = HashingFileUploadWrapper(upload_stream, digesters,
1596 hash_algs, upload_url, logger)
1597 else:
1598 wrapped_filestream = upload_stream
1599
1600 try:
1601 if parallel_composite_upload:
1602 elapsed_time, uploaded_object = _DoParallelCompositeUpload(
1603 upload_stream, upload_url, dst_url, dst_obj_metadata,
1604 global_copy_helper_opts.canned_acl, upload_size, preconditions,
1605 gsutil_api, command_obj, copy_exception_handler)
1606 elif upload_size < ResumableThreshold() or src_url.IsStream():
1607 elapsed_time, uploaded_object = _UploadFileToObjectNonResumable(
1608 upload_url, wrapped_filestream, upload_size, dst_url,
1609 dst_obj_metadata, preconditions, gsutil_api, logger)
1610 else:
1611 elapsed_time, uploaded_object = _UploadFileToObjectResumable(
1612 upload_url, wrapped_filestream, upload_size, dst_url,
1613 dst_obj_metadata, preconditions, gsutil_api, logger)
1614
1615 finally:
1616 if zipped_file:
1617 try:
1618 os.unlink(upload_url.object_name)
1619 # Windows sometimes complains the temp file is locked when you try to
1620 # delete it.
1621 except Exception: # pylint: disable=broad-except
1622 logger.warning(
1623 'Could not delete %s. This can occur in Windows because the '
1624 'temporary file is still locked.', upload_url.object_name)
1625 # In the gzip case, this is the gzip stream. _CompressFileForUpload will
1626 # have already closed the original source stream.
1627 upload_stream.close()
1628
1629 if not parallel_composite_upload:
1630 try:
1631 digests = _CreateDigestsFromDigesters(digesters)
1632 _CheckHashes(logger, dst_url, uploaded_object, src_url.object_name,
1633 digests, is_upload=True)
1634 except HashMismatchException:
1635 if _RENAME_ON_HASH_MISMATCH:
1636 corrupted_obj_metadata = apitools_messages.Object(
1637 name=dst_obj_metadata.name,
1638 bucket=dst_obj_metadata.bucket,
1639 etag=uploaded_object.etag)
1640 dst_obj_metadata.name = (dst_url.object_name +
1641 _RENAME_ON_HASH_MISMATCH_SUFFIX)
1642 gsutil_api.CopyObject(corrupted_obj_metadata,
1643 dst_obj_metadata, provider=dst_url.scheme)
1644 # If the digest doesn't match, delete the object.
1645 gsutil_api.DeleteObject(dst_url.bucket_name, dst_url.object_name,
1646 generation=uploaded_object.generation,
1647 provider=dst_url.scheme)
1648 raise
1649
1650 result_url = dst_url.Clone()
1651
1652 result_url.generation = uploaded_object.generation
1653 result_url.generation = GenerationFromUrlAndString(
1654 result_url, uploaded_object.generation)
1655
1656 return (elapsed_time, uploaded_object.size, result_url,
1657 uploaded_object.md5Hash)
1658
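# A minimal sketch (not part of gsutil) of the gzip_exts matching rule used in
# _UploadFileToObject above: only the final extension is compared, so a name
# with no dot never matches. The file names and extension list are made up.
def _ExampleShouldGzip(file_name, gzip_exts):
  fname_parts = file_name.split('.')
  return bool(gzip_exts) and len(fname_parts) > 1 and fname_parts[-1] in gzip_exts
# _ExampleShouldGzip('app.min.js', ['js', 'css']) -> True
# _ExampleShouldGzip('README', ['js', 'css'])     -> False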
1659
1660 # TODO: Refactor this long function into smaller pieces.
1661 # pylint: disable=too-many-statements
1662 def _DownloadObjectToFile(src_url, src_obj_metadata, dst_url,
1663 gsutil_api, logger, test_method=None):
1664 """Downloads an object to a local file.
1665
1666 Args:
1667 src_url: Source CloudUrl.
1668 src_obj_metadata: Metadata from the source object.
1669 dst_url: Destination FileUrl.
1670 gsutil_api: gsutil Cloud API instance to use for the download.
1671 logger: for outputting log messages.
1672 test_method: Optional test method for modifying the file before validation
1673 during unit tests.
1674 Returns:
1675 (elapsed_time, bytes_transferred, dst_url, md5), excluding overhead like
1676 initial GET.
1677
1678 Raises:
1679 CommandException: if errors encountered.
1680 """
1681 global open_files_map
1682 file_name = dst_url.object_name
1683 dir_name = os.path.dirname(file_name)
1684 if dir_name and not os.path.exists(dir_name):
1685 # Do dir creation in try block so can ignore case where dir already
1686 # exists. This is needed to avoid a race condition when running gsutil
1687 # -m cp.
1688 try:
1689 os.makedirs(dir_name)
1690 except OSError as e:
1691 if e.errno != errno.EEXIST:
1692 raise
1693 api_selector = gsutil_api.GetApiSelector(provider=src_url.scheme)
1694 # For gzipped objects, download to a temp file and unzip. For the XML API,
1695 # this represents the result of a HEAD request. For the JSON API, this is
1696 # the stored encoding, which the service may not respect. However, if the
1697 # server sends decompressed bytes for a file that is stored compressed
1698 # (double compressed case), there is no way we can validate the hash and
1699 # we will fail our hash check for the object.
1700 if (src_obj_metadata.contentEncoding and
1701 src_obj_metadata.contentEncoding.lower().endswith('gzip')):
1702 # We can't use tempfile.mkstemp() here because we need a predictable
1703 # filename for resumable downloads.
1704 download_file_name = _GetDownloadZipFileName(file_name)
1705 logger.info(
1706 'Downloading to temp gzip filename %s', download_file_name)
1707 need_to_unzip = True
1708 else:
1709 download_file_name = file_name
1710 need_to_unzip = False
1711
1712 if download_file_name.endswith(dst_url.delim):
1713 logger.warn('\n'.join(textwrap.wrap(
1714 'Skipping attempt to download to filename ending with slash (%s). This '
1715 'typically happens when using gsutil to download from a subdirectory '
1716 'created by the Cloud Console (https://cloud.google.com/console)'
1717 % download_file_name)))
1718 return (0, 0, dst_url, '')
1719
1720 # Set up hash digesters.
1721 hash_algs = GetDownloadHashAlgs(
1722 logger, src_has_md5=src_obj_metadata.md5Hash,
1723 src_has_crc32c=src_obj_metadata.crc32c)
1724 digesters = dict((alg, hash_algs[alg]()) for alg in hash_algs or {})
1725
1726 fp = None
1727 # Tracks whether the server used a gzip encoding.
1728 server_encoding = None
1729 download_complete = False
1730 download_strategy = _SelectDownloadStrategy(dst_url)
1731 download_start_point = 0
1732 # This is used for resuming downloads, but also for passing the mediaLink
1733 # and size into the download for new downloads so that we can avoid
1734 # making an extra HTTP call.
1735 serialization_data = None
1736 serialization_dict = GetDownloadSerializationDict(src_obj_metadata)
1737 open_files = []
1738 try:
1739 if download_strategy is CloudApi.DownloadStrategy.ONE_SHOT:
1740 fp = open(download_file_name, 'wb')
1741 elif download_strategy is CloudApi.DownloadStrategy.RESUMABLE:
1742 # If this is a resumable download, we need to open the file for append and
1743 # manage a tracker file.
1744 if open_files_map.get(download_file_name, False):
1745 # Ensure another process/thread is not already writing to this file.
1746 raise FileConcurrencySkipError
1747 open_files.append(download_file_name)
1748 open_files_map[download_file_name] = True
1749 fp = open(download_file_name, 'ab')
1750
1751 resuming = ReadOrCreateDownloadTrackerFile(
1752 src_obj_metadata, dst_url, api_selector)
1753 if resuming:
1754 # Find out how far along we are so we can request the appropriate
1755 # remaining range of the object.
1756 existing_file_size = GetFileSize(fp, position_to_eof=True)
1757 if existing_file_size > src_obj_metadata.size:
1758 DeleteTrackerFile(GetTrackerFilePath(
1759 dst_url, TrackerFileType.DOWNLOAD, api_selector))
1760 raise CommandException(
1761 '%s is larger (%d) than %s (%d).\nDeleting tracker file, so '
1762 'if you re-try this download it will start from scratch' %
1763 (download_file_name, existing_file_size, src_url.object_name,
1764 src_obj_metadata.size))
1765 else:
1766 if existing_file_size == src_obj_metadata.size:
1767 logger.info(
1768 'Download already complete for file %s, skipping download but '
1769 'will run integrity checks.', download_file_name)
1770 download_complete = True
1771 else:
1772 download_start_point = existing_file_size
1773 serialization_dict['progress'] = download_start_point
1774 logger.info('Resuming download for %s', src_url.url_string)
1775 # Catch up our digester with the hash data.
1776 if existing_file_size > TEN_MIB:
1777 for alg_name in digesters:
1778 logger.info(
1779 'Catching up %s for %s', alg_name, download_file_name)
1780 with open(download_file_name, 'rb') as hash_fp:
1781 while True:
1782 data = hash_fp.read(DEFAULT_FILE_BUFFER_SIZE)
1783 if not data:
1784 break
1785 for alg_name in digesters:
1786 digesters[alg_name].update(data)
1787 else:
1788 # Starting a new download, blow away whatever is already there.
1789 fp.truncate(0)
1790 fp.seek(0)
1791
1792 else:
1793 raise CommandException('Invalid download strategy %s chosen for '
1794 'file %s' % (download_strategy, fp.name))
1795
1796 if not dst_url.IsStream():
1797 serialization_data = json.dumps(serialization_dict)
1798
1799 progress_callback = FileProgressCallbackHandler(
1800 ConstructAnnounceText('Downloading', dst_url.url_string),
1801 logger).call
1802 if global_copy_helper_opts.test_callback_file:
1803 with open(global_copy_helper_opts.test_callback_file, 'rb') as test_fp:
1804 progress_callback = pickle.loads(test_fp.read()).call
1805
1806 start_time = time.time()
1807 # TODO: With gzip encoding (which may occur on-the-fly and not be part of
1808 # the object's metadata), when we request a range to resume, it's possible
1809 # that the server will just resend the entire object, which means our
1810 # caught-up hash will be incorrect. We recalculate the hash on
1811 # the local file in the case of a failed gzip hash anyway, but it would
1812 # be better if we actively detected this case.
1813 if not download_complete:
1814 server_encoding = gsutil_api.GetObjectMedia(
1815 src_url.bucket_name, src_url.object_name, fp,
1816 start_byte=download_start_point, generation=src_url.generation,
1817 object_size=src_obj_metadata.size,
1818 download_strategy=download_strategy, provider=src_url.scheme,
1819 serialization_data=serialization_data, digesters=digesters,
1820 progress_callback=progress_callback)
1821
1822 end_time = time.time()
1823
1824 # If a custom test method is defined, call it here. For the copy command,
1825 # test methods are expected to take one argument: an open file pointer,
1826 # and are used to perturb the open file during download to exercise
1827 # download error detection.
1828 if test_method:
1829 test_method(fp)
1830
1831 except ResumableDownloadException as e:
1832 logger.warning('Caught ResumableDownloadException (%s) for file %s.',
1833 e.reason, file_name)
1834 raise
1835
1836 finally:
1837 if fp:
1838 fp.close()
1839 for file_name in open_files:
1840 open_files_map.delete(file_name)
1841
1842 # If we decompressed a content-encoding gzip file on the fly, this may not
1843 # be accurate, but it is the best we can do without going deep into the
1844 # underlying HTTP libraries. Note that this value is only used for
1845 # reporting in log messages; inaccuracy doesn't impact the integrity of the
1846 # download.
1847 bytes_transferred = src_obj_metadata.size - download_start_point
1848 server_gzip = server_encoding and server_encoding.lower().endswith('gzip')
1849 local_md5 = _ValidateDownloadHashes(logger, src_url, src_obj_metadata,
1850 dst_url, need_to_unzip, server_gzip,
1851 digesters, hash_algs, api_selector,
1852 bytes_transferred)
1853
1854 return (end_time - start_time, bytes_transferred, dst_url, local_md5)
1855
1856
1857 def _GetDownloadZipFileName(file_name):
1858 """Returns the file name for a temporarily compressed downloaded file."""
1859 return '%s_.gztmp' % file_name
1860
1861
1862 def _ValidateDownloadHashes(logger, src_url, src_obj_metadata, dst_url,
1863 need_to_unzip, server_gzip, digesters, hash_algs,
1864 api_selector, bytes_transferred):
1865 """Validates a downloaded file's integrity.
1866
1867 Args:
1868 logger: For outputting log messages.
1869 src_url: StorageUrl for the source object.
1870 src_obj_metadata: Metadata for the source object, potentially containing
1871 hash values.
1872 dst_url: StorageUrl describing the destination file.
1873 need_to_unzip: If true, a temporary zip file was used and must be
1874 uncompressed as part of validation.
1875 server_gzip: If true, the server gzipped the bytes (regardless of whether
1876 the object metadata claimed it was gzipped).
1877 digesters: dict of {string, hash digester} that contains up-to-date digests
1878 computed during the download. If a digester for a particular
1879 algorithm is None, an up-to-date digest is not available and the
1880 hash must be recomputed from the local file.
1881 hash_algs: dict of {string, hash algorithm} that can be used if digesters
1882 don't have up-to-date digests.
1883 api_selector: The Cloud API implementation used (for tracker file naming).
1884 bytes_transferred: Number of bytes downloaded (used for logging).
1885
1886 Returns:
1887 An MD5 of the local file, if one was calculated as part of the integrity
1888 check.
1889 """
1890 file_name = dst_url.object_name
1891 download_file_name = (_GetDownloadZipFileName(file_name) if need_to_unzip else
1892 file_name)
1893 digesters_succeeded = True
1894 for alg in digesters:
1895 # If we get a digester with a None algorithm, the underlying
1896 # implementation failed to calculate a digest, so we will need to
1897 # calculate one from scratch.
1898 if not digesters[alg]:
1899 digesters_succeeded = False
1900 break
1901
1902 if digesters_succeeded:
1903 local_hashes = _CreateDigestsFromDigesters(digesters)
1904 else:
1905 local_hashes = _CreateDigestsFromLocalFile(
1906 logger, hash_algs, download_file_name, src_obj_metadata)
1907
1908 digest_verified = True
1909 hash_invalid_exception = None
1910 try:
1911 _CheckHashes(logger, src_url, src_obj_metadata, download_file_name,
1912 local_hashes)
1913 DeleteTrackerFile(GetTrackerFilePath(
1914 dst_url, TrackerFileType.DOWNLOAD, api_selector))
1915 except HashMismatchException as e:
1916 # If a non-gzipped object gets sent with gzip content encoding, the hash
1917 # we calculate will match the gzipped bytes, not the original object. Thus,
1918 # we'll need to calculate and check it after unzipping.
1919 if server_gzip:
1920 logger.debug(
1921 'Hash did not match but server gzipped the content, will '
1922 'recalculate.')
1923 digest_verified = False
1924 elif api_selector == ApiSelector.XML:
1925 logger.debug(
1926 'Hash did not match but server may have gzipped the content, will '
1927 'recalculate.')
1928 # Save off the exception in case this isn't a gzipped file.
1929 hash_invalid_exception = e
1930 digest_verified = False
1931 else:
1932 DeleteTrackerFile(GetTrackerFilePath(
1933 dst_url, TrackerFileType.DOWNLOAD, api_selector))
1934 if _RENAME_ON_HASH_MISMATCH:
1935 os.rename(download_file_name,
1936 download_file_name + _RENAME_ON_HASH_MISMATCH_SUFFIX)
1937 else:
1938 os.unlink(download_file_name)
1939 raise
1940
1941 if server_gzip and not need_to_unzip:
1942 # Server compressed bytes on-the-fly, thus we need to rename and decompress.
1943 # We can't decompress on-the-fly because prior to Python 3.2 the gzip
1944 # module makes a bunch of seek calls on the stream.
1945 download_file_name = _GetDownloadZipFileName(file_name)
1946 os.rename(file_name, download_file_name)
1947
1948 if need_to_unzip or server_gzip:
1949 # Log that we're uncompressing if the file is big enough that
1950 # decompressing would make it look like the transfer "stalled" at the end.
1951 if bytes_transferred > TEN_MIB:
1952 logger.info(
1953 'Uncompressing downloaded tmp file to %s...', file_name)
1954
1955 # Downloaded gzipped file to a filename w/o .gz extension, so unzip.
1956 gzip_fp = None
1957 try:
1958 gzip_fp = gzip.open(download_file_name, 'rb')
1959 with open(file_name, 'wb') as f_out:
1960 data = gzip_fp.read(GZIP_CHUNK_SIZE)
1961 while data:
1962 f_out.write(data)
1963 data = gzip_fp.read(GZIP_CHUNK_SIZE)
1964 except IOError as e:
1965 # In the XML case where we don't know if the file was gzipped, raise
1966 # the original hash exception if we find that it wasn't.
1967 if 'Not a gzipped file' in str(e) and hash_invalid_exception:
1968 # Linter improperly thinks we're raising None despite the above check.
1969 # pylint: disable=raising-bad-type
1970 raise hash_invalid_exception
1971 finally:
1972 if gzip_fp:
1973 gzip_fp.close()
1974
1975 os.unlink(download_file_name)
1976
1977 if not digest_verified:
1978 try:
1979 # Recalculate hashes on the unzipped local file.
1980 local_hashes = _CreateDigestsFromLocalFile(logger, hash_algs, file_name,
1981 src_obj_metadata)
1982 _CheckHashes(logger, src_url, src_obj_metadata, file_name, local_hashes)
1983 DeleteTrackerFile(GetTrackerFilePath(
1984 dst_url, TrackerFileType.DOWNLOAD, api_selector))
1985 except HashMismatchException:
1986 DeleteTrackerFile(GetTrackerFilePath(
1987 dst_url, TrackerFileType.DOWNLOAD, api_selector))
1988 if _RENAME_ON_HASH_MISMATCH:
1989 os.rename(file_name,
1990 file_name + _RENAME_ON_HASH_MISMATCH_SUFFIX)
1991 else:
1992 os.unlink(file_name)
1993 raise
1994
1995 if 'md5' in local_hashes:
1996 return local_hashes['md5']
1997
1998
1999 def _CopyFileToFile(src_url, dst_url):
2000 """Copies a local file to a local file.
2001
2002 Args:
2003 src_url: Source FileUrl.
2004 dst_url: Destination FileUrl.
2005 Returns:
2006 (elapsed_time, bytes_transferred, dst_url, md5=None).
2007
2008 Raises:
2009 CommandException: if errors encountered.
2010 """
2011 src_fp = GetStreamFromFileUrl(src_url)
2012 dir_name = os.path.dirname(dst_url.object_name)
2013 if dir_name and not os.path.exists(dir_name):
2014 os.makedirs(dir_name)
2015 dst_fp = open(dst_url.object_name, 'wb')
2016 start_time = time.time()
2017 shutil.copyfileobj(src_fp, dst_fp)
2018 end_time = time.time()
2019 return (end_time - start_time, os.path.getsize(dst_url.object_name),
2020 dst_url, None)
2021
2022
2023 def _DummyTrackerCallback(_):
2024 pass
2025
2026
2027 # pylint: disable=undefined-variable
2028 def _CopyObjToObjDaisyChainMode(src_url, src_obj_metadata, dst_url,
2029 dst_obj_metadata, preconditions, gsutil_api,
2030 logger):
2031 """Copies from src_url to dst_url in "daisy chain" mode.
2032
2033 See -D OPTION documentation about what daisy chain mode is.
2034
2035 Args:
2036 src_url: Source CloudUrl
2037 src_obj_metadata: Metadata from source object
2038 dst_url: Destination CloudUrl
2039 dst_obj_metadata: Object-specific metadata that should be overridden during
2040 the copy.
2041 preconditions: Preconditions to use for the copy.
2042 gsutil_api: gsutil Cloud API to use for the copy.
2043 logger: For outputting log messages.
2044
2045 Returns:
2046 (elapsed_time, bytes_transferred, dst_url with generation,
2047 md5 hash of destination) excluding overhead like initial GET.
2048
2049 Raises:
2050 CommandException: if errors encountered.
2051 """
2052 # We don't attempt to preserve ACLs across providers because
2053 # GCS and S3 support different ACLs and disjoint principals.
2054 if (global_copy_helper_opts.preserve_acl
2055 and src_url.scheme != dst_url.scheme):
2056 raise NotImplementedError(
2057 'Cross-provider cp -p not supported')
2058 if not global_copy_helper_opts.preserve_acl:
2059 dst_obj_metadata.acl = []
2060
2061 # Don't use callbacks for downloads on the daisy chain wrapper because
2062 # upload callbacks will output progress, but respect test hooks if present.
2063 progress_callback = None
2064 if global_copy_helper_opts.test_callback_file:
2065 with open(global_copy_helper_opts.test_callback_file, 'rb') as test_fp:
2066 progress_callback = pickle.loads(test_fp.read()).call
2067
2068 start_time = time.time()
2069 upload_fp = DaisyChainWrapper(src_url, src_obj_metadata.size, gsutil_api,
2070 progress_callback=progress_callback)
2071 uploaded_object = None
2072 if src_obj_metadata.size == 0:
2073 # Resumable uploads of size 0 are not supported.
2074 uploaded_object = gsutil_api.UploadObject(
2075 upload_fp, object_metadata=dst_obj_metadata,
2076 canned_acl=global_copy_helper_opts.canned_acl,
2077 preconditions=preconditions, provider=dst_url.scheme,
2078 fields=UPLOAD_RETURN_FIELDS, size=src_obj_metadata.size)
2079 else:
2080 # TODO: Support process-break resumes. This will resume across connection
2081 # breaks and server errors, but the tracker callback is a no-op so this
2082 # won't resume across gsutil runs.
2083 # TODO: Test retries via test_callback_file.
2084 uploaded_object = gsutil_api.UploadObjectResumable(
2085 upload_fp, object_metadata=dst_obj_metadata,
2086 canned_acl=global_copy_helper_opts.canned_acl,
2087 preconditions=preconditions, provider=dst_url.scheme,
2088 fields=UPLOAD_RETURN_FIELDS, size=src_obj_metadata.size,
2089 progress_callback=FileProgressCallbackHandler(
2090 ConstructAnnounceText('Uploading', dst_url.url_string),
2091 logger).call,
2092 tracker_callback=_DummyTrackerCallback)
2093 end_time = time.time()
2094
2095 try:
2096 _CheckCloudHashes(logger, src_url, dst_url, src_obj_metadata,
2097 uploaded_object)
2098 except HashMismatchException:
2099 if _RENAME_ON_HASH_MISMATCH:
2100 corrupted_obj_metadata = apitools_messages.Object(
2101 name=dst_obj_metadata.name,
2102 bucket=dst_obj_metadata.bucket,
2103 etag=uploaded_object.etag)
2104 dst_obj_metadata.name = (dst_url.object_name +
2105 _RENAME_ON_HASH_MISMATCH_SUFFIX)
2106 gsutil_api.CopyObject(corrupted_obj_metadata,
2107 dst_obj_metadata, provider=dst_url.scheme)
2108 # If the digest doesn't match, delete the object.
2109 gsutil_api.DeleteObject(dst_url.bucket_name, dst_url.object_name,
2110 generation=uploaded_object.generation,
2111 provider=dst_url.scheme)
2112 raise
2113
2114 result_url = dst_url.Clone()
2115 result_url.generation = GenerationFromUrlAndString(
2116 result_url, uploaded_object.generation)
2117
2118 return (end_time - start_time, src_obj_metadata.size, result_url,
2119 uploaded_object.md5Hash)
2120
2121
2122 # pylint: disable=undefined-variable
2123 # pylint: disable=too-many-statements
2124 def PerformCopy(logger, src_url, dst_url, gsutil_api, command_obj,
2125 copy_exception_handler, allow_splitting=True,
2126 headers=None, manifest=None, gzip_exts=None, test_method=None):
2127 """Performs copy from src_url to dst_url, handling various special cases.
2128
2129 Args:
2130 logger: for outputting log messages.
2131 src_url: Source StorageUrl.
2132 dst_url: Destination StorageUrl.
2133 gsutil_api: gsutil Cloud API instance to use for the copy.
2134 command_obj: command object for use in Apply in parallel composite uploads.
2135 copy_exception_handler: for handling copy exceptions during Apply.
2136 allow_splitting: Whether to allow the file to be split into component
2137 pieces for a parallel composite upload.
2138 headers: optional headers to use for the copy operation.
2139 manifest: optional manifest for tracking copy operations.
2140 gzip_exts: List of file extensions to gzip for uploads, if any.
2141 test_method: optional test method for modifying files during unit tests.
2142
2143 Returns:
2144 (elapsed_time, bytes_transferred, version-specific dst_url) excluding
2145 overhead like initial GET.
2146
2147 Raises:
2148 ItemExistsError: if no clobber flag is specified and the destination
2149 object already exists.
2150 SkipUnsupportedObjectError: if skip_unsupported_objects flag is specified
2151 and the source is an unsupported type.
2152 CommandException: if other errors encountered.
2153 """
2154 if headers:
2155 dst_obj_headers = headers.copy()
2156 else:
2157 dst_obj_headers = {}
2158
2159 # Create a metadata instance for each destination object so metadata
2160 # such as content-type can be applied per-object.
2161 # Initialize metadata from any headers passed in via -h.
2162 dst_obj_metadata = ObjectMetadataFromHeaders(dst_obj_headers)
2163
2164 if dst_url.IsCloudUrl() and dst_url.scheme == 'gs':
2165 preconditions = PreconditionsFromHeaders(dst_obj_headers)
2166 else:
2167 preconditions = Preconditions()
2168
2169 src_obj_metadata = None
2170 src_obj_filestream = None
2171 if src_url.IsCloudUrl():
2172 src_obj_fields = None
2173 if dst_url.IsCloudUrl():
2174 # For cloud or daisy chain copy, we need every copyable field.
2175 # If we're not modifying or overriding any of the fields, we can get
2176 # away without retrieving the object metadata because the copy
2177 # operation can succeed with just the destination bucket and object
2178 # name. But if we are sending any metadata, the JSON API will expect a
2179 # complete object resource. Since we want metadata like the object size
2180 # for our own tracking, we just get all of the metadata here.
2181 src_obj_fields = ['cacheControl', 'componentCount',
2182 'contentDisposition', 'contentEncoding',
2183 'contentLanguage', 'contentType', 'crc32c',
2184 'etag', 'generation', 'md5Hash', 'mediaLink',
2185 'metadata', 'metageneration', 'size']
2186 # We only need the ACL if we're going to preserve it.
2187 if global_copy_helper_opts.preserve_acl:
2188 src_obj_fields.append('acl')
2189 if (src_url.scheme == dst_url.scheme
2190 and not global_copy_helper_opts.daisy_chain):
2191 copy_in_the_cloud = True
2192 else:
2193 copy_in_the_cloud = False
2194 else:
2195 # Just get the fields needed to validate the download.
2196 src_obj_fields = ['crc32c', 'contentEncoding', 'contentType', 'etag',
2197 'mediaLink', 'md5Hash', 'size']
2198
2199 if (src_url.scheme == 's3' and
2200 global_copy_helper_opts.skip_unsupported_objects):
2201 src_obj_fields.append('storageClass')
2202
2203 try:
2204 src_generation = GenerationFromUrlAndString(src_url, src_url.generation)
2205 src_obj_metadata = gsutil_api.GetObjectMetadata(
2206 src_url.bucket_name, src_url.object_name,
2207 generation=src_generation, provider=src_url.scheme,
2208 fields=src_obj_fields)
2209 except NotFoundException:
2210 raise CommandException(
2211 'NotFoundException: Could not retrieve source object %s.' %
2212 src_url.url_string)
2213 if (src_url.scheme == 's3' and
2214 global_copy_helper_opts.skip_unsupported_objects and
2215 src_obj_metadata.storageClass == 'GLACIER'):
2216 raise SkipGlacierError()
2217
2218 src_obj_size = src_obj_metadata.size
2219 dst_obj_metadata.contentType = src_obj_metadata.contentType
2220 if global_copy_helper_opts.preserve_acl:
2221 dst_obj_metadata.acl = src_obj_metadata.acl
2222 # Special case for S3-to-S3 copy URLs using
2223 # global_copy_helper_opts.preserve_acl.
2224 # dst_url will be verified in _CopyObjToObjDaisyChainMode if it
2225 # is not s3 (and thus differs from src_url).
2226 if src_url.scheme == 's3':
2227 acl_text = S3MarkerAclFromObjectMetadata(src_obj_metadata)
2228 if acl_text:
2229 AddS3MarkerAclToObjectMetadata(dst_obj_metadata, acl_text)
2230 else:
2231 try:
2232 src_obj_filestream = GetStreamFromFileUrl(src_url)
2233 except Exception as e: # pylint: disable=broad-except
2234 raise CommandException('Error opening file "%s": %s.' % (src_url,
2235 e.message))
2236 if src_url.IsStream():
2237 src_obj_size = None
2238 else:
2239 src_obj_size = os.path.getsize(src_url.object_name)
2240
2241 if global_copy_helper_opts.use_manifest:
2242 # Set the source size in the manifest.
2243 manifest.Set(src_url.url_string, 'size', src_obj_size)
2244
2245 if (dst_url.scheme == 's3' and src_obj_size > S3_MAX_UPLOAD_SIZE
2246 and src_url.scheme != 's3'):
2247 raise CommandException(
2248 '"%s" exceeds the maximum gsutil-supported size for an S3 upload. S3 '
2249 'objects greater than %s in size require multipart uploads, which '
2250 'gsutil does not support.' % (src_url,
2251 MakeHumanReadable(S3_MAX_UPLOAD_SIZE)))
2252
2253 # On Windows, stdin is opened as text mode instead of binary which causes
2254 # problems when piping a binary file, so this switches it to binary mode.
2255 if IS_WINDOWS and src_url.IsFileUrl() and src_url.IsStream():
2256 msvcrt.setmode(GetStreamFromFileUrl(src_url).fileno(), os.O_BINARY)
2257
2258 if global_copy_helper_opts.no_clobber:
2259 # There are two checks to prevent clobbering:
2260 # 1) The first check is to see if the URL
2261 # already exists at the destination and prevent the upload/download
2262 # from happening. This is done by the exists() call.
2263 # 2) The second check is only relevant if we are writing to gs. We can
2264 # enforce that the server only writes the object if it doesn't exist
2265 # by specifying the header below. This check only happens at the
2266 # server after the complete file has been uploaded. We specify this
2267 # header to prevent a race condition where a destination file may
2268 # be created after the first check and before the file is fully
2269 # uploaded.
2270 # In order to save on unnecessary uploads/downloads we perform both
2271 # checks. However, this may come at the cost of additional HTTP calls.
2272 if preconditions.gen_match:
2273 raise ArgumentException('Specifying x-goog-if-generation-match is '
2274 'not supported with cp -n')
2275 else:
2276 preconditions.gen_match = 0
2277 if dst_url.IsFileUrl() and os.path.exists(dst_url.object_name):
2278 # The local file may be a partial. Check the file sizes.
2279 if src_obj_size == os.path.getsize(dst_url.object_name):
2280 raise ItemExistsError()
2281 elif dst_url.IsCloudUrl():
2282 try:
2283 dst_object = gsutil_api.GetObjectMetadata(
2284 dst_url.bucket_name, dst_url.object_name, provider=dst_url.scheme)
2285 except NotFoundException:
2286 dst_object = None
2287 if dst_object:
2288 raise ItemExistsError()
2289
2290 if dst_url.IsCloudUrl():
2291 # Cloud storage API gets object and bucket name from metadata.
2292 dst_obj_metadata.name = dst_url.object_name
2293 dst_obj_metadata.bucket = dst_url.bucket_name
2294 if src_url.IsCloudUrl():
2295 # Preserve relevant metadata from the source object if it's not already
2296 # provided from the headers.
2297 CopyObjectMetadata(src_obj_metadata, dst_obj_metadata, override=False)
2298 src_obj_metadata.name = src_url.object_name
2299 src_obj_metadata.bucket = src_url.bucket_name
2300 else:
2301 _SetContentTypeFromFile(src_url, dst_obj_metadata)
2302 else:
2303 # Files don't have Cloud API metadata.
2304 dst_obj_metadata = None
2305
2306 _LogCopyOperation(logger, src_url, dst_url, dst_obj_metadata)
2307
2308 if src_url.IsCloudUrl():
2309 if dst_url.IsFileUrl():
2310 return _DownloadObjectToFile(src_url, src_obj_metadata, dst_url,
2311 gsutil_api, logger, test_method=test_method)
2312 elif copy_in_the_cloud:
2313 return _CopyObjToObjInTheCloud(src_url, src_obj_metadata, dst_url,
2314 dst_obj_metadata, preconditions,
2315 gsutil_api, logger)
2316 else:
2317 return _CopyObjToObjDaisyChainMode(src_url, src_obj_metadata,
2318 dst_url, dst_obj_metadata,
2319 preconditions, gsutil_api, logger)
2320 else: # src_url.IsFileUrl()
2321 if dst_url.IsCloudUrl():
2322 return _UploadFileToObject(
2323 src_url, src_obj_filestream, src_obj_size, dst_url,
2324 dst_obj_metadata, preconditions, gsutil_api, logger, command_obj,
2325 copy_exception_handler, gzip_exts=gzip_exts,
2326 allow_splitting=allow_splitting)
2327 else: # dst_url.IsFileUrl()
2328 return _CopyFileToFile(src_url, dst_url)
2329
2330
2331 class Manifest(object):
2332 """Stores the manifest items for the CpCommand class."""
2333
2334 def __init__(self, path):
2335 # self.items contains a dictionary of rows
2336 self.items = {}
2337 self.manifest_filter = {}
2338 self.lock = CreateLock()
2339
2340 self.manifest_path = os.path.expanduser(path)
2341 self._ParseManifest()
2342 self._CreateManifestFile()
2343
2344 def _ParseManifest(self):
2345 """Load and parse a manifest file.
2346
2347 This information will be used to skip any files that have a skip or OK
2348 status.
2349 """
2350 try:
2351 if os.path.exists(self.manifest_path):
2352 with open(self.manifest_path, 'rb') as f:
2353 first_row = True
2354 reader = csv.reader(f)
2355 for row in reader:
2356 if first_row:
2357 try:
2358 source_index = row.index('Source')
2359 result_index = row.index('Result')
2360 except ValueError:
2361 # No header and thus not a valid manifest file.
2362 raise CommandException(
2363 'Missing headers in manifest file: %s' % self.manifest_path)
2364 first_row = False
2365 source = row[source_index]
2366 result = row[result_index]
2367 if result in ['OK', 'skip']:
2368 # We're always guaranteed to take the last result of a specific
2369 # source url.
2370 self.manifest_filter[source] = result
2371 except IOError:
2372 raise CommandException('Could not parse %s' % self.manifest_path)
2373
2374 def WasSuccessful(self, src):
2375 """Returns whether the specified src url was marked as successful."""
2376 return src in self.manifest_filter
2377
2378 def _CreateManifestFile(self):
2379 """Opens the manifest file and assigns it to the file pointer."""
2380 try:
2381 if ((not os.path.exists(self.manifest_path))
2382 or (os.stat(self.manifest_path).st_size == 0)):
2383 # Add headers to the new file.
2384 with open(self.manifest_path, 'wb', 1) as f:
2385 writer = csv.writer(f)
2386 writer.writerow(['Source',
2387 'Destination',
2388 'Start',
2389 'End',
2390 'Md5',
2391 'UploadId',
2392 'Source Size',
2393 'Bytes Transferred',
2394 'Result',
2395 'Description'])
2396 except IOError:
2397 raise CommandException('Could not create manifest file.')
2398
2399 def Set(self, url, key, value):
2400 if value is None:
2401 # In case we don't have any information to set we bail out here.
2402 # This is so that we don't clobber existing information.
2403 # To zero information pass '' instead of None.
2404 return
2405 if url in self.items:
2406 self.items[url][key] = value
2407 else:
2408 self.items[url] = {key: value}
2409
2410 def Initialize(self, source_url, destination_url):
2411 # Always use the source_url as the key for the item. This is unique.
2412 self.Set(source_url, 'source_uri', source_url)
2413 self.Set(source_url, 'destination_uri', destination_url)
2414 self.Set(source_url, 'start_time', datetime.datetime.utcnow())
2415
2416 def SetResult(self, source_url, bytes_transferred, result,
2417 description=''):
2418 self.Set(source_url, 'bytes', bytes_transferred)
2419 self.Set(source_url, 'result', result)
2420 self.Set(source_url, 'description', description)
2421 self.Set(source_url, 'end_time', datetime.datetime.utcnow())
2422 self._WriteRowToManifestFile(source_url)
2423 self._RemoveItemFromManifest(source_url)
2424
2425 def _WriteRowToManifestFile(self, url):
2426 """Writes a manifest entry to the manifest file for the url argument."""
2427 row_item = self.items[url]
2428 data = [
2429 str(row_item['source_uri'].encode(UTF8)),
2430 str(row_item['destination_uri'].encode(UTF8)),
2431 '%sZ' % row_item['start_time'].isoformat(),
2432 '%sZ' % row_item['end_time'].isoformat(),
2433 row_item['md5'] if 'md5' in row_item else '',
2434 row_item['upload_id'] if 'upload_id' in row_item else '',
2435 str(row_item['size']) if 'size' in row_item else '',
2436 str(row_item['bytes']) if 'bytes' in row_item else '',
2437 row_item['result'],
2438 row_item['description'].encode(UTF8)]
2439
2440 # Acquire a lock to prevent multiple threads writing to the same file at
2441 # the same time. This would cause a garbled mess in the manifest file.
2442 with self.lock:
2443 with open(self.manifest_path, 'a', 1) as f: # 1 == line buffered
2444 writer = csv.writer(f)
2445 writer.writerow(data)
2446
2447 def _RemoveItemFromManifest(self, url):
2448 # Remove the item from the dictionary since we're done with it and
2449 # we don't want the dictionary to grow too large in memory for no good
2450 # reason.
2451 del self.items[url]
2452
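# An illustrative manifest row matching the header columns written in
# _CreateManifestFile (all values below are made up; times are UTC ISO 8601
# with a trailing 'Z', and Md5/UploadId may be empty when unknown):
#
#   file://photo.png,gs://example-bucket/photo.png,2015-10-01T12:00:00Z,
#   2015-10-01T12:00:05Z,,,1048576,1048576,OK,
#
# (shown wrapped here; the writer emits it as a single CSV line)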
2453
2454 class ItemExistsError(Exception):
2455 """Exception class for objects that are skipped because they already exist."""
2456 pass
2457
2458
2459 class SkipUnsupportedObjectError(Exception):
2460 """Exception for objects skipped because they are an unsupported type."""
2461
2462 def __init__(self):
2463 super(SkipUnsupportedObjectError, self).__init__()
2464 self.unsupported_type = 'Unknown'
2465
2466
2467 class SkipGlacierError(SkipUnsupportedObjectError):
2468 """Exception for objects skipped because they are an unsupported type."""
2469
2470 def __init__(self):
2471 super(SkipGlacierError, self).__init__()
2472 self.unsupported_type = 'GLACIER'
2473
2474
2475 def GetPathBeforeFinalDir(url):
2476 """Returns the path section before the final directory component of the URL.
2477
2478 This handles cases for file system directories, buckets, and bucket
2479 subdirectories. Example: for gs://bucket/dir/ we'll return 'gs://bucket',
2480 and for file://dir we'll return file://
2481
2482 Args:
2483 url: StorageUrl representing a filesystem directory, cloud bucket or
2484 bucket subdir.
2485
2486 Returns:
2487 String name of above-described path, sans final path separator.
2488 """
2489 sep = url.delim
2490 if url.IsFileUrl():
2491 past_scheme = url.url_string[len('file://'):]
2492 if past_scheme.find(sep) == -1:
2493 return 'file://'
2494 else:
2495 return 'file://%s' % past_scheme.rstrip(sep).rpartition(sep)[0]
2496 if url.IsBucket():
2497 return '%s://' % url.scheme
2498 # Else it names a bucket subdir.
2499 return url.url_string.rstrip(sep).rpartition(sep)[0]
2500
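# Worked examples of the behavior described above, assuming '/' delimiters:
#   gs://bucket/dir/   -> 'gs://bucket'   (bucket subdir: strip the final dir)
#   gs://bucket        -> 'gs://'         (bucket: scheme only)
#   file://tmp/outdir  -> 'file://tmp'    (strip the final directory)
#   file://outdir      -> 'file://'       (no separator after the scheme)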
2501
2502 def _DivideAndCeil(dividend, divisor):
2503 """Returns ceil(dividend / divisor).
2504
2505 Takes care to avoid the pitfalls of floating point arithmetic that could
2506 otherwise yield the wrong result for large numbers.
2507
2508 Args:
2509 dividend: Dividend for the operation.
2510 divisor: Divisor for the operation.
2511
2512 Returns:
2513 Quotient.
2514 """
2515 quotient = dividend // divisor
2516 if (dividend % divisor) != 0:
2517 quotient += 1
2518 return quotient
2519
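# Worked example: _DivideAndCeil(7, 2) == 4 (7 // 2 == 3, plus 1 for the
# nonzero remainder), while _DivideAndCeil(6, 2) == 3 exactly. Staying in
# integer arithmetic avoids the float rounding that ceil(a / float(b)) can
# suffer for very large byte counts.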
2520
2521 def _GetPartitionInfo(file_size, max_components, default_component_size):
2522 """Gets info about a file partition for parallel composite uploads.
2523
2524 Args:
2525 file_size: The number of bytes in the file to be partitioned.
2526 max_components: The maximum number of components that can be composed.
2527 default_component_size: The size of a component, assuming that
2528 max_components is infinite.
2529 Returns:
2530 The number of components in the partitioned file, and the size of each
2531 component (except the last, which will have a different size iff
2532 file_size != 0 (mod num_components)).
2533 """
2534 # num_components = ceil(file_size / default_component_size)
2535 num_components = _DivideAndCeil(file_size, default_component_size)
2536
2537 # num_components must be in the range [2, max_components]
2538 num_components = max(min(num_components, max_components), 2)
2539
2540 # component_size = ceil(file_size / num_components)
2541 component_size = _DivideAndCeil(file_size, num_components)
2542 return (num_components, component_size)
2543
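# Worked example with illustrative sizes: for file_size = 150 MiB,
# default_component_size = 50 MiB and max_components = 32, the initial
# ceil(150 / 50) = 3 lies in [2, 32], so the result is 3 components of
# ceil(150 / 3) = 50 MiB each. A 10 MiB file would first yield 1, which is
# clamped up to 2 components of ceil(10 / 2) = 5 MiB each.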
2544
2545 def _DeleteObjectFn(cls, url_to_delete, thread_state=None):
2546 """Wrapper function to be used with command.Apply()."""
2547 gsutil_api = GetCloudApiInstance(cls, thread_state)
2548 gsutil_api.DeleteObject(
2549 url_to_delete.bucket_name, url_to_delete.object_name,
2550 generation=url_to_delete.generation, provider=url_to_delete.scheme)
2551
2552
2553 def _ParseParallelUploadTrackerFile(tracker_file, tracker_file_lock):
2554 """Parse the tracker file from the last parallel composite upload attempt.
2555
2556 If it exists, the tracker file is of the format described in
2557 _CreateParallelUploadTrackerFile. If the file doesn't exist or cannot be
2558 read, then the upload will start from the beginning.
2559
2560 Args:
2561 tracker_file: The name of the file to parse.
2562 tracker_file_lock: Lock protecting access to the tracker file.
2563
2564 Returns:
2565 random_prefix: A randomly-generated prefix to the name of the
2566 temporary components.
2567 existing_objects: A list of ObjectFromTracker objects representing
2568 the set of files that have already been uploaded.
2569 """
2570
2571 def GenerateRandomPrefix():
2572 return str(random.randint(1, (10 ** 10) - 1))
2573
2574 existing_objects = []
2575 try:
2576 with tracker_file_lock:
2577 with open(tracker_file, 'r') as fp:
2578 lines = fp.readlines()
2579 lines = [line.strip() for line in lines]
2580 if not lines:
2581 print('Parallel upload tracker file (%s) was invalid. '
2582 'Restarting upload from scratch.' % tracker_file)
2583 lines = [GenerateRandomPrefix()]
2584
2585 except IOError as e:
2586 # We can't read the tracker file, so generate a new random prefix.
2587 lines = [GenerateRandomPrefix()]
2588
2589 # Ignore non-existent file (happens first time an upload
2590 # is attempted on a file), but warn user for other errors.
2591 if e.errno != errno.ENOENT:
2592 # Will restart because we failed to read in the file.
2593 print('Couldn\'t read parallel upload tracker file (%s): %s. '
2594 'Restarting upload from scratch.' % (tracker_file, e.strerror))
2595
2596 # The first line contains the randomly-generated prefix.
2597 random_prefix = lines[0]
2598
2599 # The remaining lines were written in pairs to describe a single component
2600 # in the form:
2601 # object_name (without random prefix)
2602 # generation
2603 # Newlines are used as the delimiter because only newlines and carriage
2604 # returns are invalid characters in object names, and users can specify
2605 # a custom prefix in the config file.
2606 i = 1
2607 while i < len(lines):
2608 (name, generation) = (lines[i], lines[i+1])
2609 if not generation:
2610 # Cover the '' case.
2611 generation = None
2612 existing_objects.append(ObjectFromTracker(name, generation))
2613 i += 2
2614 return (random_prefix, existing_objects)
2615
2616
2617 def _AppendComponentTrackerToParallelUploadTrackerFile(tracker_file, component,
2618 tracker_file_lock):
2619 """Appends info about the uploaded component to an existing tracker file.
2620
2621 Follows the format described in _CreateParallelUploadTrackerFile.
2622
2623 Args:
2624 tracker_file: Tracker file to append to.
2625 component: Component that was uploaded.
2626 tracker_file_lock: Thread and process-safe Lock for the tracker file.
2627 """
2628 lines = _GetParallelUploadTrackerFileLinesForComponents([component])
2629 lines = [line + '\n' for line in lines]
2630 with tracker_file_lock:
2631 with open(tracker_file, 'a') as f:
2632 f.writelines(lines)
2633
2634
2635 def _CreateParallelUploadTrackerFile(tracker_file, random_prefix, components,
2636 tracker_file_lock):
2637 """Writes information about components that were successfully uploaded.
2638
2639 This way the upload can be resumed at a later date. The tracker file has
2640 the format:
2641 random_prefix
2642 temp_object_1_name
2643 temp_object_1_generation
2644 .
2645 .
2646 .
2647 temp_object_N_name
2648 temp_object_N_generation
2649 where N is the number of components that have been successfully uploaded.
2650
2651 Args:
2652 tracker_file: The name of the parallel upload tracker file.
2653 random_prefix: The randomly-generated prefix that was used for
2654 uploading any existing components.
2655 components: A list of ObjectFromTracker objects that were uploaded.
2656 tracker_file_lock: The lock protecting access to the tracker file.
2657 """
2658 lines = [random_prefix]
2659 lines += _GetParallelUploadTrackerFileLinesForComponents(components)
2660 lines = [line + '\n' for line in lines]
2661 try:
2662 with tracker_file_lock:
2663 open(tracker_file, 'w').close() # Clear the file.
2664 with open(tracker_file, 'w') as f:
2665 f.writelines(lines)
2666 except IOError as e:
2667 RaiseUnwritableTrackerFileException(tracker_file, e.strerror)
2668
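# An illustrative parallel upload tracker file for two uploaded components
# (the prefix, component names, and generations below are made up):
#
#   4808396230
#   my_file.bin_0
#   1450000000000001
#   my_file.bin_1
#   1450000000000002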
2669
2670 def _GetParallelUploadTrackerFileLinesForComponents(components):
2671 """Return a list of the lines for use in a parallel upload tracker file.
2672
2673 The lines represent the given components, using the format as described in
2674 _CreateParallelUploadTrackerFile.
2675
2676 Args:
2677 components: A list of ObjectFromTracker objects that were uploaded.
2678
2679 Returns:
2680 Lines describing components with their generation for outputting to the
2681 tracker file.
2682 """
2683 lines = []
2684 for component in components:
2685 generation = None
2686 generation = component.generation
2687 if not generation:
2688 generation = ''
2689 lines += [component.object_name, str(generation)]
2690 return lines
2691
2692
2693 def FilterExistingComponents(dst_args, existing_components, bucket_url,
2694 gsutil_api):
2695 """Determines course of action for component objects.
2696
2697 Given the list of all target objects based on partitioning the file and
2698 the list of objects that have already been uploaded successfully,
2699 this function determines which objects should be uploaded, which
2700 existing components are still valid, and which existing components should
2701 be deleted.
2702
2703 Args:
2704 dst_args: The map of file_name -> PerformParallelUploadFileToObjectArgs
2705 calculated by partitioning the file.
2706 existing_components: A list of ObjectFromTracker objects that have been
2707 uploaded in the past.
2708 bucket_url: CloudUrl of the bucket in which the components exist.
2709 gsutil_api: gsutil Cloud API instance to use for retrieving object metadata.
2710
2711 Returns:
2712 components_to_upload: List of components that need to be uploaded.
2713 uploaded_components: List of components that have already been
2714 uploaded and are still valid.
2715 existing_objects_to_delete: List of components that have already
2716 been uploaded, but are no longer valid
2717 and are in a versioned bucket, and
2718 therefore should be deleted.
2719 """
2720 components_to_upload = []
2721 existing_component_names = [component.object_name
2722 for component in existing_components]
2723 for component_name in dst_args:
2724 if component_name not in existing_component_names:
2725 components_to_upload.append(dst_args[component_name])
2726
2727 objects_already_chosen = []
2728
2729 # Don't reuse any temporary components whose MD5 doesn't match the current
2730 # MD5 of the corresponding part of the file. If the bucket is versioned,
2731 # also make sure that we delete the existing temporary version.
2732 existing_objects_to_delete = []
2733 uploaded_components = []
2734 for tracker_object in existing_components:
2735 if (tracker_object.object_name not in dst_args.keys()
2736 or tracker_object.object_name in objects_already_chosen):
2737 # This could happen if the component size has changed. This also serves
2738 # to handle object names that get duplicated in the tracker file due
2739 # to people doing things they shouldn't (e.g., overwriting an existing
2740 # temporary component in a versioned bucket).
2741
2742 url = bucket_url.Clone()
2743 url.object_name = tracker_object.object_name
2744 url.generation = tracker_object.generation
2745 existing_objects_to_delete.append(url)
2746 continue
2747
2748 dst_arg = dst_args[tracker_object.object_name]
2749 file_part = FilePart(dst_arg.filename, dst_arg.file_start,
2750 dst_arg.file_length)
2751 # TODO: calculate MD5's in parallel when possible.
2752 content_md5 = CalculateB64EncodedMd5FromContents(file_part)
2753
2754 try:
2755 # Get the MD5 of the currently-existing component.
2756 dst_url = dst_arg.dst_url
2757 dst_metadata = gsutil_api.GetObjectMetadata(
2758 dst_url.bucket_name, dst_url.object_name,
2759 generation=dst_url.generation, provider=dst_url.scheme,
2760 fields=['md5Hash', 'etag'])
2761 cloud_md5 = dst_metadata.md5Hash
2762 except Exception: # pylint: disable=broad-except
2763 # We don't actually care what went wrong - we couldn't retrieve the
2764 # object to check the MD5, so just upload it again.
2765 cloud_md5 = None
2766
2767 if cloud_md5 != content_md5:
2768 components_to_upload.append(dst_arg)
2769 objects_already_chosen.append(tracker_object.object_name)
2770 if tracker_object.generation:
2771 # If the old object doesn't have a generation (i.e., it isn't in a
2772 # versioned bucket), then we will just overwrite it anyway.
2773 invalid_component_with_generation = dst_arg.dst_url.Clone()
2774 invalid_component_with_generation.generation = tracker_object.generation
2775 existing_objects_to_delete.append(invalid_component_with_generation)
2776 else:
2777 url = dst_arg.dst_url.Clone()
2778 url.generation = tracker_object.generation
2779 uploaded_components.append(url)
2780 objects_already_chosen.append(tracker_object.object_name)
2781
2782 if uploaded_components:
2783 logging.info('Found %d existing temporary components to reuse.',
2784 len(uploaded_components))
2785
2786 return (components_to_upload, uploaded_components,
2787 existing_objects_to_delete)