OLD | NEW |
(Empty) | |
| 1 # Copyright 2011 Google Inc. |
| 2 # Copyright 2011, Nexenta Systems Inc. |
| 3 # |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 # you may not use this file except in compliance with the License. |
| 6 # You may obtain a copy of the License at |
| 7 # |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 # |
| 10 # Unless required by applicable law or agreed to in writing, software |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 # See the License for the specific language governing permissions and |
| 14 # limitations under the License. |
| 15 |
| 16 import boto |
| 17 import errno |
| 18 import gzip |
| 19 import hashlib |
| 20 import mimetypes |
| 21 import os |
| 22 import platform |
| 23 import re |
| 24 import subprocess |
| 25 import sys |
| 26 import tempfile |
| 27 import threading |
| 28 import time |
| 29 |
| 30 from boto.gs.resumable_upload_handler import ResumableUploadHandler |
| 31 from boto import config |
| 32 from boto.s3.resumable_download_handler import ResumableDownloadHandler |
| 33 from boto.exception import GSResponseError |
| 34 from boto.exception import ResumableUploadException |
| 35 from gslib.command import Command |
| 36 from gslib.command import COMMAND_NAME |
| 37 from gslib.command import COMMAND_NAME_ALIASES |
| 38 from gslib.command import CONFIG_REQUIRED |
| 39 from gslib.command import FILE_URIS_OK |
| 40 from gslib.command import MAX_ARGS |
| 41 from gslib.command import MIN_ARGS |
| 42 from gslib.command import PROVIDER_URIS_OK |
| 43 from gslib.command import SUPPORTED_SUB_ARGS |
| 44 from gslib.command import URIS_START_ARG |
| 45 from gslib.exception import CommandException |
| 46 from gslib.help_provider import HELP_NAME |
| 47 from gslib.help_provider import HELP_NAME_ALIASES |
| 48 from gslib.help_provider import HELP_ONE_LINE_SUMMARY |
| 49 from gslib.help_provider import HELP_TEXT |
| 50 from gslib.help_provider import HelpType |
| 51 from gslib.help_provider import HELP_TYPE |
| 52 from gslib.name_expansion import NameExpansionIterator |
| 53 from gslib.util import MakeHumanReadable |
| 54 from gslib.util import NO_MAX |
| 55 from gslib.util import ONE_MB |
| 56 from gslib.wildcard_iterator import ContainsWildcard |
| 57 |
| 58 _detailed_help_text = (""" |
| 59 <B>SYNOPSIS</B> |
| 60 gsutil cp [-a canned_acl] [-e] [-p] [-q] [-z ext1,ext2,...] src_uri dst_uri |
| 61 - or - |
| 62 gsutil cp [-a canned_acl] [-e] [-p] [-q] [-R] [-z extensions] src_uri... dst_uri |
| 63 - or - |
| 64 gsutil cp [-a canned_acl] [-e] [-p] [-q] [-R] [-z extensions] -I dst_uri |
| 65 |
| 66 |
| 67 <B>DESCRIPTION</B> |
| 68 The gsutil cp command allows you to copy data between your local file |
| 69 system and the cloud, copy data within the cloud, and copy data between |
| 70 cloud storage providers. For example, to copy all text files from the |
| 71 local directory to a bucket you could do: |
| 72 |
| 73 gsutil cp *.txt gs://my_bucket |
| 74 |
| 75 Similarly, you can download text files from a bucket by doing: |
| 76 |
| 77 gsutil cp gs://my_bucket/*.txt . |
| 78 |
| 79 If you want to copy an entire directory tree you need to use the -R option: |
| 80 |
| 81 gsutil cp -R dir gs://my_bucket |
| 82 |
| 83 If you have a large number of files to upload you might want to use the |
| 84 gsutil -m option, to perform a parallel (multi-threaded/multi-processing) |
| 85 copy: |
| 86 |
| 87 gsutil -m cp -R dir gs://my_bucket |
| 88 |
| 89 You can pass a list of URIs to copy on STDIN instead of as command line |
| 90 arguments by using the -I option. This allows you to use gsutil in a |
| 91 pipeline to copy files and objects as generated by a program, such as: |
| 92 |
| 93 some_program | gsutil -m cp -I gs://my_bucket |
| 94 |
| 95 The contents of STDIN can name files, cloud URIs, and wildcards of files |
| 96 and cloud URIs. |
| 97 |
| 98 |
| 99 <B>HOW NAMES ARE CONSTRUCTED</B> |
| 100 The gsutil cp command strives to name objects in a way consistent with how |
| 101 Unix cp works, which causes names to be constructed in varying ways depending |
| 102 on whether you're performing a recursive directory copy or copying |
| 103 individually named objects; and whether you're copying to an existing or |
| 104 non-existent directory. |
| 105 |
| 106 When performing recursive directory copies, object names are constructed |
| 107 that mirror the source directory structure starting at the point of |
| 108 recursive processing. For example, the command: |
| 109 |
| 110 gsutil cp -R dir1/dir2 gs://my_bucket |
| 111 |
| 112 will create objects named like gs://my_bucket/dir2/a/b/c, assuming |
| 113 dir1/dir2 contains the file a/b/c. |
| 114 |
| 115 In contrast, copying individually named files will result in objects named |
| 116 by the final path component of the source files. For example, the command: |
| 117 |
| 118 gsutil cp dir1/dir2/** gs://my_bucket |
| 119 |
| 120 will create objects named like gs://my_bucket/c. |
| 121 |
| 122 The same rules apply for downloads: recursive copies of buckets and |
| 123 bucket subdirectories produce a mirrored filename structure, while copying |
| 124 individually (or wildcard) named objects produces flatly named files. |
| 125 |
| 126 Note that in the above example the '**' wildcard matches all names |
| 127 anywhere under dir1/dir2. The wildcard '*' will match names just one level deep. For |
| 128 more details see 'gsutil help wildcards'. |
| 129 |
| 130 There's an additional wrinkle when working with subdirectories: the resulting |
| 131 names depend on whether the destination subdirectory exists. For example, |
| 132 if gs://my_bucket/subdir exists as a subdirectory, the command: |
| 133 |
| 134 gsutil cp -R dir1/dir2 gs://my_bucket/subdir |
| 135 |
| 136 will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast, |
| 137 if gs://my_bucket/subdir does not exist, this same gsutil cp command will |
| 138 create objects named like gs://my_bucket/subdir/a/b/c. |
| 139 |
| 140 |
| 141 <B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B> |
| 142 You can use gsutil to copy to and from subdirectories by using a command like: |
| 143 |
| 144 gsutil cp -R dir gs://my_bucket/data |
| 145 |
| 146 This will cause dir and all of its files and nested subdirectories to be |
| 147 copied under the specified destination, resulting in objects with names like |
| 148 gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket |
| 149 subdirectories by using a command like: |
| 150 |
| 151 gsutil cp -R gs://my_bucket/data dir |
| 152 |
| 153 This will cause everything nested under gs://my_bucket/data to be downloaded |
| 154 into dir, resulting in files with names like dir/data/a/b/c. |
| 155 |
| 156 Copying subdirectories is useful if you want to add data to an existing |
| 157 bucket directory structure over time. It's also useful if you want |
| 158 to parallelize uploads and downloads across multiple machines (often |
| 159 reducing overall transfer time compared with simply running gsutil -m |
| 160 cp on one machine). For example, if your bucket contains this structure: |
| 161 |
| 162 gs://my_bucket/data/result_set_01/ |
| 163 gs://my_bucket/data/result_set_02/ |
| 164 ... |
| 165 gs://my_bucket/data/result_set_99/ |
| 166 |
| 167 you could perform concurrent downloads across 3 machines by running these |
| 168 commands on each machine, respectively: |
| 169 |
| 170 gsutil cp -R gs://my_bucket/data/result_set_[0-3]* dir |
| 171 gsutil cp -R gs://my_bucket/data/result_set_[4-6]* dir |
| 172 gsutil cp -R gs://my_bucket/data/result_set_[7-9]* dir |
| 173 |
| 174 Note that dir could be a local directory on each machine, or it could |
| 175 be a directory mounted off of a shared file server; whether the latter |
| 176 performs acceptably may depend on a number of things, so we recommend |
| 177 you experiment and find out what works best for you. |
| 178 |
| 179 |
| 180 <B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B> |
| 181 If both the source and destination URI are cloud URIs from the same |
| 182 provider, gsutil copies data "in the cloud" (i.e., without downloading |
| 183 to and uploading from the machine where you run gsutil). In addition to |
| 184 the performance and cost advantages of doing this, copying in the cloud |
| 185 preserves metadata (like Content-Type and Cache-Control). In contrast, |
| 186 when you download data from the cloud it ends up in a file, which has |
| 187 no associated metadata. Thus, unless you have some way to hold on to |
| 188 or re-create that metadata, downloading to a file will not retain the |
| 189 metadata. |
| 190 |
| 191 Note that by default, the gsutil cp command does not copy the object |
| 192 ACL to the new object, and instead will use the default bucket ACL (see |
| 193 "gsutil help setdefacl"). You can override this behavior with the -p |
| 194 option (see OPTIONS below). |
| 195 |
| 196 |
| 197 <B>RESUMABLE TRANSFERS</B> |
| 198 gsutil automatically uses the Google Cloud Storage resumable upload |
| 199 feature whenever you use the cp command to upload an object that is larger |
| 200 than 1 MB. You do not need to specify any special command line options |
| 201 to make this happen. If your upload is interrupted you can restart the |
| 202 upload by running the same cp command that you ran to start the upload. |
| 203 |
| 204 Similarly, gsutil automatically performs resumable downloads (using HTTP |
| 205 standard Range GET operations) whenever you use the cp command to download an |
| 206 object larger than 1 MB. |
| 207 |
| 208 Resumable uploads and downloads store some state information in a file |
| 209 in ~/.gsutil named by the destination object or file. If you attempt to |
| 210 resume a transfer from a machine with a different directory, the transfer |
| 211 will start over from scratch. |
| 212 |
| 213 See also "gsutil help prod" for details on using resumable transfers |
| 214 in production. |
| 215 |
| 216 |
| 217 <B>STREAMING TRANSFERS</B> |
| 218 Use '-' in place of src_uri or dst_uri to perform a streaming |
| 219 transfer. For example: |
| 220 long_running_computation | gsutil cp - gs://my_bucket/obj |
| 221 |
| 222 Streaming transfers do not support resumable uploads/downloads. |
| 223 (The Google resumable transfer protocol has a way to support streaming |
| 224 transfers, but gsutil doesn't currently implement support for this.) |
| 225 |
| 226 |
| 227 <B>CHANGING TEMP DIRECTORIES</B> |
| 228 gsutil writes data to a temporary directory in several cases: |
| 229 - when compressing data to be uploaded (see the -z option) |
| 230 - when decompressing data being downloaded (when the data has |
| 231 Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z) |
| 232 - when copying between cloud service providers, where the destination |
| 233 provider does not support streaming uploads. In this case each object |
| 234 is downloaded from the source provider to a temp file, and then uploaded |
| 235 from that temp file to the destination provider. |
| 236 |
| 237 In these cases it's possible the temp file location on your system that |
| 238 gsutil selects by default may not have enough space. If you find that |
| 239 gsutil runs out of space during one of these operations (e.g., raising |
| 240 "CommandException: Inadequate temp space available to compress <your file>" |
| 241 during a gsutil cp -z operation), you can change where it writes these |
| 242 temp files by setting the TMPDIR environment variable. On Linux and MacOS |
| 243 you can do this using: |
| 244 |
| 245 export TMPDIR=/some/directory |
| 246 |
| 247 On Windows 7 you can change the TMPDIR environment variable from Start -> |
| 248 Computer -> System -> Advanced System Settings -> Environment Variables. |
| 249 You need to reboot after making this change for it to take effect. (Rebooting |
| 250 is not necessary after running the export command on Linux and MacOS.) |
| 251 |
| 252 |
| 253 <B>OPTIONS</B> |
| 254 -a Sets the named canned_acl on uploaded objects. See |
| 255 'gsutil help acls' for further details. |
| 256 |
| 257 -c If an error occurs, continue to attempt to copy the remaining |
| 258 files. |
| 259 |
| 260 -e Exclude symlinks. When specified, symbolic links will not be |
| 261 copied. |
| 262 |
| 263 -n No-clobber. When specified, existing files or objects at the |
| 264 destination will not be overwritten. Any items that are skipped |
| 265 by this option will be reported as being skipped. |
| 266 |
| 267 Please note that using this feature will make gsutil perform |
| 268 additional HTTP requests for every item being copied. This may |
| 269 increase latency and cost. |
| 270 |
| 271 -p Causes ACL to be preserved when copying in the cloud. Note that |
| 272 this option has performance and cost implications, because it |
| 273 is essentially performing three requests (getacl, cp, setacl). |
| 274 (The performance issue can be mitigated to some degree by |
| 275 using gsutil -m cp to cause parallel copying.) |
| 276 |
| 277 You can avoid the additional performance and cost of using cp -p |
| 278 if you want all objects in the destination bucket to end up with |
| 279 the same ACL, by setting a default ACL on that bucket instead of |
| 280 using cp -p. See "gsutil help setdefacl". |
| 281 |
| 282 Note that it's not valid to specify both the -a and -p options |
| 283 together. |
| 284 |
| 285 -q Causes copies to be performed quietly, i.e., without reporting |
| 286 progress indicators of files being copied. Errors are still |
| 287 reported. This option can be useful for running gsutil from a |
| 288 cron job that logs its output to a file, for which the only |
| 289 information desired in the log is failures. |
| 290 |
| 291 -R, -r Causes directories, buckets, and bucket subdirectories to be |
| 292 copied recursively. If you neglect to use this option for |
| 293 an upload, gsutil will copy any files it finds and skip any |
| 294 directories. Similarly, neglecting to specify -R for a download |
| 295 will cause gsutil to copy any objects at the current bucket |
| 296 directory level, and skip any subdirectories. |
| 297 |
| 298 -t DEPRECATED. At one time this option was used to request setting |
| 299 Content-Type based on file extension and/or content, which is |
| 300 now the default behavior. The -t option is left in place for |
| 301 now to avoid breaking existing scripts. It will be removed at |
| 302 a future date. |
| 303 |
| 304 -v Parses uris for version / generation numbers (only applicable in |
| 305 version-enabled buckets). For example: |
| 306 |
| 307 gsutil cp -v gs://bucket/object#1348772910166013 ~/Desktop |
| 308 |
| 309 Note that wildcards are not permitted while using this flag. |
| 310 |
| 311 -z 'txt,html' Compresses file uploads with the given extensions. |
| 312 If you are uploading a large file with compressible content, |
| 313 such as a .js, .css, or .html file, you can gzip-compress the |
| 314 file during the upload process by specifying the -z <extensions> |
| 315 option. Compressing data before upload saves on usage charges |
| 316 because you are uploading a smaller amount of data. |
| 317 |
| 318 When you specify the -z option, the data from your files is |
| 319 compressed before it is uploaded, but your actual files are left |
| 320 uncompressed on the local disk. The uploaded objects retain the |
| 321 content type and name of the original files but are given |
| 322 a Content-Encoding header with the value "gzip" to indicate that |
| 323 the object data stored are compressed on the Google Cloud Storage |
| 324 servers. |
| 325 |
| 326 For example, the following command: |
| 327 |
| 328 gsutil cp -z html -a public-read cattypes.html gs://mycats |
| 329 |
| 330 will do all of the following: |
| 331 - Upload as the object gs://mycats/cattypes.html (cp command) |
| 332 - Set the Content-Type to text/html (based on file extension) |
| 333 - Compress the data in the file cattypes.html (-z option) |
| 334 - Set the Content-Encoding to gzip (-z option) |
| 335 - Set the ACL to public-read (-a option) |
| 336 - If a user tries to view cattypes.html in a browser, the |
| 337 browser will know to uncompress the data based on the |
| 338 Content-Encoding header, and to render it as HTML based on |
| 339 the Content-Type header. |
| 340 """) |
| 341 |
| 342 class KeyFile(): |
| 343 """ |
| 344 Wrapper class to expose Key class as file to boto. |
| 345 """ |
| 346 def __init__(self, key): |
| 347 self.key = key |
| 348 |
| 349 def tell(self): |
| 350 raise IOError |
| 351 |
| 352 def seek(self, pos): |
| 353 raise IOError |
| 354 |
| 355 def read(self, size): |
| 356 return self.key.read(size) |
| 357 |
| 358 def write(self, buf): |
| 359 raise IOError |
| 360 |
| 361 def close(self): |
| 362 self.key.close() |
| 363 |
| 364 class CpCommand(Command): |
| 365 """ |
| 366 Implementation of gsutil cp command. |
| 367 |
| 368 Note that CpCommand is run for both gsutil cp and gsutil mv. The latter |
| 369 happens by MvCommand calling CpCommand and passing the hidden (undocumented) |
| 370 -M option. This allows the copy and remove needed for each mv to run |
| 371 together (rather than first running all the cp's and then all the rm's, as |
| 372 we originally had implemented), which in turn avoids the following problem |
| 373 with removing the wrong objects: starting with a bucket containing only |
| 374 the object gs://bucket/obj, say the user does: |
| 375 gsutil mv gs://bucket/* gs://bucket/d.txt |
| 376 If we ran all the cp's and then all the rm's and we didn't expand the wildcard |
| 377 first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt, |
| 378 and the rm command would then remove that object. In the implementation |
| 379 prior to gsutil release 3.12 we avoided this by building a list of objects |
| 380 to process and then running the copies and then the removes; but building |
| 381 the list up front limits scalability (compared with the current approach |
| 382 of processing the bucket listing iterator on the fly). |
| 383 """ |
| 384 |
| 385 # Set default Content-Type. |
| 386 DEFAULT_CONTENT_TYPE = 'application/octet-stream' |
| 387 USE_MAGICFILE = boto.config.getbool('GSUtil', 'use_magicfile', False) |
| 388 |
| 389 # Command specification (processed by parent class). |
| 390 command_spec = { |
| 391 # Name of command. |
| 392 COMMAND_NAME : 'cp', |
| 393 # List of command name aliases. |
| 394 COMMAND_NAME_ALIASES : ['copy'], |
| 395 # Min number of args required by this command. |
| 396 MIN_ARGS : 1, |
| 397 # Max number of args required by this command, or NO_MAX. |
| 398 MAX_ARGS : NO_MAX, |
| 399 # Getopt-style string specifying acceptable sub args. |
| 400 # -t is deprecated but leave intact for now to avoid breakage. |
| 401 SUPPORTED_SUB_ARGS : 'a:ceIMnpqrRtvz:', |
| 402 # True if file URIs acceptable for this command. |
| 403 FILE_URIS_OK : True, |
| 404 # True if provider-only URIs acceptable for this command. |
| 405 PROVIDER_URIS_OK : False, |
| 406 # Index in args of first URI arg. |
| 407 URIS_START_ARG : 0, |
| 408 # True if must configure gsutil before running command. |
| 409 CONFIG_REQUIRED : True, |
| 410 } |
| 411 help_spec = { |
| 412 # Name of command or auxiliary help info for which this help applies. |
| 413 HELP_NAME : 'cp', |
| 414 # List of help name aliases. |
| 415 HELP_NAME_ALIASES : ['copy'], |
| 416 # Type of help: |
| 417 HELP_TYPE : HelpType.COMMAND_HELP, |
| 418 # One line summary of this help. |
| 419 HELP_ONE_LINE_SUMMARY : 'Copy files and objects', |
| 420 # The full help text. |
| 421 HELP_TEXT : _detailed_help_text, |
| 422 } |
| 423 |
| 424 def _CheckFinalMd5(self, key, file_name): |
| 425 """ |
| 426 Checks that etag from server agrees with md5 computed after the |
| 427 download completes. |
| 428 """ |
| 429 obj_md5 = key.etag.strip('"\'') |
| 430 file_md5 = None |
| 431 |
| 432 if hasattr(key, 'md5') and key.md5: |
| 433 file_md5 = key.md5 |
| 434 else: |
| 435 print 'Computing MD5 from scratch for resumed download' |
| 436 |
| 437 # Open file in binary mode to avoid surprises in Windows. |
| 438 fp = open(file_name, 'rb') |
| 439 try: |
| 440 file_md5 = key.compute_md5(fp)[0] |
| 441 finally: |
| 442 fp.close() |
| 443 |
| 444 if self.debug: |
| 445 print 'Checking file md5 against etag. (%s/%s)' % (file_md5, obj_md5) |
| 446 if file_md5 != obj_md5: |
| 447 # Checksums don't match - remove file and raise exception. |
| 448 os.unlink(file_name) |
| 449 raise CommandException( |
| 450 'File changed during download: md5 signature doesn\'t match ' |
| 451 'etag (incorrect downloaded file deleted)') |
| 452 |
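For illustration, a minimal standalone sketch of the check above: strip surrounding quote characters from the object's etag and compare it against an MD5 computed over the downloaded file, assuming (as the code does) that the etag is the object's MD5 hex digest.

    import hashlib

    def md5_matches_etag(file_name, etag):
      obj_md5 = etag.strip('"\'')            # same quote stripping as above
      md5 = hashlib.md5()
      fp = open(file_name, 'rb')             # binary mode, as above
      try:
        for chunk in iter(lambda: fp.read(8192), b''):
          md5.update(chunk)
      finally:
        fp.close()
      return md5.hexdigest() == obj_md5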
| 453 def _CheckForDirFileConflict(self, exp_src_uri, dst_uri): |
| 454 """Checks whether copying exp_src_uri into dst_uri is not possible. |
| 455 |
| 456 This happens if a directory exists in the local file system where a file |
| 457 needs to go or vice versa. In that case we print an error message and |
| 458 exit. Example: if the file "./x" exists and you try to do: |
| 459 gsutil cp gs://mybucket/x/y . |
| 460 the request can't succeed because it requires a directory where |
| 461 the file x exists. |
| 462 |
| 463 Note that we don't enforce any corresponding restrictions for buckets, |
| 464 because the flat namespace semantics for buckets doesn't prohibit such |
| 465 cases the way hierarchical file systems do. For example, if a bucket |
| 466 contains an object called gs://bucket/dir and then you run the command: |
| 467 gsutil cp file1 file2 gs://bucket/dir |
| 468 you'll end up with objects gs://bucket/dir, gs://bucket/dir/file1, and |
| 469 gs://bucket/dir/file2. |
| 470 |
| 471 Args: |
| 472 exp_src_uri: Expanded source StorageUri of copy. |
| 473 dst_uri: Destination URI. |
| 474 |
| 475 Raises: |
| 476 CommandException: if errors encountered. |
| 477 """ |
| 478 if dst_uri.is_cloud_uri(): |
| 479 # The problem can only happen for file destination URIs. |
| 480 return |
| 481 dst_path = dst_uri.object_name |
| 482 final_dir = os.path.dirname(dst_path) |
| 483 if os.path.isfile(final_dir): |
| 484 raise CommandException('Cannot retrieve %s because a file exists ' |
| 485 'where a directory needs to be created (%s).' % |
| 486 (exp_src_uri, final_dir)) |
| 487 if os.path.isdir(dst_path): |
| 488 raise CommandException('Cannot retrieve %s because a directory exists ' |
| 489 '(%s) where the file needs to be created.' % |
| 490 (exp_src_uri, dst_path)) |
| 491 |
| 492 def _InsistDstUriNamesContainer(self, exp_dst_uri, |
| 493 have_existing_dst_container, command_name): |
| 494 """ |
| 495 Raises an exception if URI doesn't name a directory, bucket, or bucket |
| 496 subdir, with special exception for cp -R (see comments below). |
| 497 |
| 498 Args: |
| 499 exp_dst_uri: Wildcard-expanded dst_uri. |
| 500 have_existing_dst_container: bool indicator of whether exp_dst_uri |
| 501 names a container (directory, bucket, or existing bucket subdir). |
| 502 command_name: Name of command making call. May not be the same as |
| 503 self.command_name in the case of commands implemented atop other |
| 504 commands (like mv command). |
| 505 |
| 506 Raises: |
| 507 CommandException: if the URI being checked does not name a container. |
| 508 """ |
| 509 if exp_dst_uri.is_file_uri(): |
| 510 ok = exp_dst_uri.names_directory() |
| 511 else: |
| 512 if have_existing_dst_container: |
| 513 ok = True |
| 514 else: |
| 515 # It's ok to specify a non-existing bucket subdir, for example: |
| 516 # gsutil cp -R dir gs://bucket/abc |
| 517 # where gs://bucket/abc isn't an existing subdir. |
| 518 ok = exp_dst_uri.names_object() |
| 519 if not ok: |
| 520 raise CommandException('Destination URI must name a directory, bucket, ' |
| 521 'or bucket\nsubdirectory for the multiple ' |
| 522 'source form of the %s command.' % command_name) |
| 523 |
| 524 class _FileCopyCallbackHandler(object): |
| 525 """Outputs progress info for large copy requests.""" |
| 526 |
| 527 def __init__(self, upload): |
| 528 if upload: |
| 529 self.announce_text = 'Uploading' |
| 530 else: |
| 531 self.announce_text = 'Downloading' |
| 532 |
| 533 def call(self, total_bytes_transferred, total_size): |
| 534 sys.stderr.write('%s: %s/%s \r' % ( |
| 535 self.announce_text, |
| 536 MakeHumanReadable(total_bytes_transferred), |
| 537 MakeHumanReadable(total_size))) |
| 538 if total_bytes_transferred == total_size: |
| 539 sys.stderr.write('\n') |
| 540 |
| 541 class _StreamCopyCallbackHandler(object): |
| 542 """Outputs progress info for Stream copy to cloud. |
| 543 Total Size of the stream is not known, so we output |
| 544 only the bytes transferred. |
| 545 """ |
| 546 |
| 547 def call(self, total_bytes_transferred, total_size): |
| 548 sys.stderr.write('Uploading: %s \r' % ( |
| 549 MakeHumanReadable(total_bytes_transferred))) |
| 550 if total_size and total_bytes_transferred == total_size: |
| 551 sys.stderr.write('\n') |
| 552 |
| 553 def _GetTransferHandlers(self, dst_uri, size, upload): |
| 554 """ |
| 555 Selects upload/download and callback handlers. |
| 556 |
| 557 We use a callback handler that shows a simple textual progress indicator |
| 558 if size is above the configurable threshold. |
| 559 |
| 560 We use a resumable transfer handler if size is >= the configurable |
| 561 threshold and resumable transfers are supported by the given provider. |
| 562 boto supports resumable downloads for all providers, but resumable |
| 563 uploads are currently only supported by GS. |
| 564 |
| 565 Args: |
| 566 dst_uri: the destination URI. |
| 567 size: size of file (object) being uploaded (downloaded). |
| 568 upload: bool indication of whether transfer is an upload. |
| 569 """ |
| 570 config = boto.config |
| 571 resumable_threshold = config.getint('GSUtil', 'resumable_threshold', ONE_MB) |
| 572 transfer_handler = None |
| 573 cb = None |
| 574 num_cb = None |
| 575 |
| 576 if size >= resumable_threshold: |
| 577 if not self.quiet: |
| 578 cb = self._FileCopyCallbackHandler(upload).call |
| 579 num_cb = int(size / ONE_MB) |
| 580 |
| 581 resumable_tracker_dir = config.get( |
| 582 'GSUtil', 'resumable_tracker_dir', |
| 583 os.path.expanduser('~' + os.sep + '.gsutil')) |
| 584 if not os.path.exists(resumable_tracker_dir): |
| 585 os.makedirs(resumable_tracker_dir) |
| 586 |
| 587 if upload: |
| 588 # Encode the dest bucket and object name into the tracker file name. |
| 589 res_tracker_file_name = ( |
| 590 re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s.url' % |
| 591 (dst_uri.bucket_name, dst_uri.object_name))) |
| 592 else: |
| 593 # Encode the fully-qualified dest file name into the tracker file name. |
| 594 res_tracker_file_name = ( |
| 595 re.sub('[/\\\\]', '_', 'resumable_download__%s.etag' % |
| 596 (os.path.realpath(dst_uri.object_name)))) |
| 597 |
| 598 res_tracker_file_name = _hash_filename(res_tracker_file_name) |
| 599 tracker_file = '%s%s%s' % (resumable_tracker_dir, os.sep, |
| 600 res_tracker_file_name) |
| 601 if upload: |
| 602 if dst_uri.scheme == 'gs': |
| 603 transfer_handler = ResumableUploadHandler(tracker_file) |
| 604 else: |
| 605 transfer_handler = ResumableDownloadHandler(tracker_file) |
| 606 |
| 607 return (cb, num_cb, transfer_handler) |
| 608 |
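For illustration, a standalone sketch of how the upload tracker file name above is derived: the bucket and object names are folded into one file name with path separators flattened to underscores, and the result lives under the resumable tracker directory. (The real code additionally passes the name through _hash_filename, which is defined elsewhere in this file.)

    import os
    import re

    def sketch_upload_tracker_path(bucket_name, object_name):
      name = 'resumable_upload__%s__%s.url' % (bucket_name, object_name)
      name = re.sub('[/\\\\]', '_', name)    # flatten path separators
      tracker_dir = os.path.expanduser('~' + os.sep + '.gsutil')
      return os.path.join(tracker_dir, name)

    # e.g. ~/.gsutil/resumable_upload__my_bucket__a_b_c.url
    print(sketch_upload_tracker_path('my_bucket', 'a/b/c'))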
| 609 def _LogCopyOperation(self, src_uri, dst_uri, headers): |
| 610 """ |
| 611 Logs copy operation being performed, including Content-Type if appropriate. |
| 612 """ |
| 613 if self.quiet: |
| 614 return |
| 615 if 'Content-Type' in headers and dst_uri.is_cloud_uri(): |
| 616 content_type_msg = ' [Content-Type=%s]' % headers['Content-Type'] |
| 617 else: |
| 618 content_type_msg = '' |
| 619 if src_uri.is_stream(): |
| 620 self.THREADED_LOGGER.info('Copying from <STDIN>%s...', content_type_msg) |
| 621 else: |
| 622 self.THREADED_LOGGER.info('Copying %s%s...', src_uri, content_type_msg) |
| 623 |
| 624 # We pass the headers explicitly to this call instead of using self.headers |
| 625 # so we can set different metadata (like Content-Type) for each object. |
| 626 def _CopyObjToObjSameProvider(self, src_key, src_uri, dst_uri, headers): |
| 627 self._SetContentTypeHeader(src_uri, headers) |
| 628 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 629 # Do Object -> object copy within same provider (uses |
| 630 # x-<provider>-copy-source metadata HTTP header to request copying at the |
| 631 # server). |
| 632 src_bucket = src_uri.get_bucket(False, headers) |
| 633 dst_bucket = dst_uri.get_bucket(False, headers) |
| 634 preserve_acl = False |
| 635 canned_acl = None |
| 636 if self.sub_opts: |
| 637 for o, a in self.sub_opts: |
| 638 if o == '-a': |
| 639 canned_acls = dst_uri.canned_acls() |
| 640 if a not in canned_acls: |
| 641 raise CommandException('Invalid canned ACL "%s".' % a) |
| 642 canned_acl = a |
| 643 headers[dst_uri.get_provider().acl_header] = canned_acl |
| 644 if o == '-p': |
| 645 preserve_acl = True |
| 646 if preserve_acl and canned_acl: |
| 647 raise CommandException( |
| 648 'Specifying both the -p and -a options together is invalid.') |
| 649 start_time = time.time() |
| 650 # Pass headers in headers param not metadata param, so boto will copy |
| 651 # existing key's metadata and just set the additional headers specified |
| 652 # in the headers param (rather than using the headers to override existing |
| 653 # metadata). In particular this allows us to copy the existing key's |
| 654 # Content-Type and other metadata users need while still being able to |
| 655 # set headers the API needs (like x-goog-project-id). Note that this means |
| 656 # you can't do something like: |
| 657 # gsutil cp -t Content-Type text/html gs://bucket/* gs://bucket2 |
| 658 # to change the Content-Type while copying. |
| 659 |
| 660 dst_uri.copy_key(src_bucket.name, src_uri.object_name, |
| 661 preserve_acl=preserve_acl, headers=headers, |
| 662 src_version_id=src_uri.version_id, |
| 663 src_generation=src_uri.generation) |
| 664 end_time = time.time() |
| 665 return (end_time - start_time, src_key.size) |
| 666 |
| 667 def _CheckFreeSpace(self, path): |
| 668 """Return path/drive free space (in bytes).""" |
| 669 if platform.system() == 'Windows': |
| 670 from ctypes import c_char_p, c_int, c_uint64, c_wchar_p, windll, POINTER, WINFUNCTYPE, WinError |
| 671 try: |
| 672 GetDiskFreeSpaceEx = WINFUNCTYPE(c_int, c_wchar_p, POINTER(c_uint64), |
| 673 POINTER(c_uint64), POINTER(c_uint64)) |
| 674 GetDiskFreeSpaceEx = GetDiskFreeSpaceEx( |
| 675 ('GetDiskFreeSpaceExW', windll.kernel32), ( |
| 676 (1, 'lpszPathName'), |
| 677 (2, 'lpFreeUserSpace'), |
| 678 (2, 'lpTotalSpace'), |
| 679 (2, 'lpFreeSpace'),)) |
| 680 except AttributeError: |
| 681 GetDiskFreeSpaceEx = WINFUNCTYPE(c_int, c_char_p, POINTER(c_uint64), |
| 682 POINTER(c_uint64), POINTER(c_uint64)) |
| 683 GetDiskFreeSpaceEx = GetDiskFreeSpaceEx( |
| 684 ('GetDiskFreeSpaceExA', windll.kernel32), ( |
| 685 (1, 'lpszPathName'), |
| 686 (2, 'lpFreeUserSpace'), |
| 687 (2, 'lpTotalSpace'), |
| 688 (2, 'lpFreeSpace'),)) |
| 689 |
| 690 def GetDiskFreeSpaceEx_errcheck(result, func, args): |
| 691 if not result: |
| 692 raise WinError() |
| 693 return args[1].value |
| 694 GetDiskFreeSpaceEx.errcheck = GetDiskFreeSpaceEx_errcheck |
| 695 |
| 696 return GetDiskFreeSpaceEx(os.getenv('SystemDrive')) |
| 697 else: |
| 698 (_, f_frsize, _, _, f_bavail, _, _, _, _, _) = os.statvfs(path) |
| 699 return f_frsize * f_bavail |
| 700 |
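For illustration, the POSIX branch above reduces to multiplying the filesystem fragment size by the number of blocks available to unprivileged users; a minimal sketch, assuming a non-Windows platform:

    import os

    def free_bytes(path):
      st = os.statvfs(path)                  # same fields _CheckFreeSpace unpacks
      return st.f_frsize * st.f_bavail

    print('%d bytes free under /tmp' % free_bytes('/tmp'))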
| 701 def _PerformResumableUploadIfApplies(self, fp, dst_uri, canned_acl, headers): |
| 702 """ |
| 703 Performs resumable upload if supported by provider and file is above |
| 704 threshold, else performs non-resumable upload. |
| 705 |
| 706 Returns (elapsed_time, bytes_transferred). |
| 707 """ |
| 708 start_time = time.time() |
| 709 file_size = os.path.getsize(fp.name) |
| 710 dst_key = dst_uri.new_key(False, headers) |
| 711 (cb, num_cb, res_upload_handler) = self._GetTransferHandlers( |
| 712 dst_uri, file_size, True) |
| 713 if dst_uri.scheme == 'gs': |
| 714 # Resumable upload protocol is Google Cloud Storage-specific. |
| 715 dst_key.set_contents_from_file(fp, headers, policy=canned_acl, |
| 716 cb=cb, num_cb=num_cb, |
| 717 res_upload_handler=res_upload_handler) |
| 718 else: |
| 719 dst_key.set_contents_from_file(fp, headers, policy=canned_acl, |
| 720 cb=cb, num_cb=num_cb) |
| 721 if res_upload_handler: |
| 722 # ResumableUploadHandler does not update upload_start_point from its |
| 723 # initial value of -1 if transferring the whole file, so clamp at 0 |
| 724 bytes_transferred = file_size - max( |
| 725 res_upload_handler.upload_start_point, 0) |
| 726 else: |
| 727 bytes_transferred = file_size |
| 728 end_time = time.time() |
| 729 return (end_time - start_time, bytes_transferred) |
| 730 |
| 731 def _PerformStreamingUpload(self, fp, dst_uri, headers, canned_acl=None): |
| 732 """ |
| 733 Performs a streaming upload to the cloud. |
| 734 |
| 735 Args: |
| 736 fp: The file whose contents to upload. |
| 737 dst_uri: Destination StorageUri. |
| 738 headers: A copy of the headers dictionary. |
| 739 canned_acl: Optional canned ACL to set on the object. |
| 740 |
| 741 Returns (elapsed_time, bytes_transferred). |
| 742 """ |
| 743 start_time = time.time() |
| 744 dst_key = dst_uri.new_key(False, headers) |
| 745 |
| 746 if self.quiet: |
| 747 cb = None |
| 748 else: |
| 749 cb = self._StreamCopyCallbackHandler().call |
| 750 dst_key.set_contents_from_stream(fp, headers, policy=canned_acl, cb=cb) |
| 751 try: |
| 752 bytes_transferred = fp.tell() |
| 753 except: |
| 754 bytes_transferred = 0 |
| 755 |
| 756 end_time = time.time() |
| 757 return (end_time - start_time, bytes_transferred) |
| 758 |
| 759 def _SetContentTypeHeader(self, src_uri, headers): |
| 760 """ |
| 761 Sets content type header to value specified in '-h Content-Type' option (if |
| 762 specified); else sets using Content-Type detection. |
| 763 """ |
| 764 if 'Content-Type' in headers: |
| 765 # If empty string specified (i.e., -h "Content-Type:") set header to None, |
| 766 # which will inhibit boto from sending the CT header. Otherwise, boto will |
| 767 # pass through the user specified CT header. |
| 768 if not headers['Content-Type']: |
| 769 headers['Content-Type'] = None |
| 770 # else we'll keep the value passed in via -h option (not performing |
| 771 # content type detection). |
| 772 else: |
| 773 # Only do content type recognition if src_uri is a file. Object-to-object |
| 774 # copies with no -h Content-Type specified re-use the content type of the |
| 775 # source object. |
| 776 if src_uri.is_file_uri(): |
| 777 object_name = src_uri.object_name |
| 778 content_type = None |
| 779 # Streams (denoted by '-') are expected to be 'application/octet-stream' |
| 780 # and 'file' would partially consume them. |
| 781 if object_name != '-': |
| 782 if self.USE_MAGICFILE: |
| 783 p = subprocess.Popen(['file', '--mime-type', object_name], |
| 784 stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
| 785 output, error = p.communicate() |
| 786 if p.returncode != 0 or error: |
| 787 raise CommandException( |
| 788 'Encountered error running "file --mime-type %s" ' |
| 789 '(returncode=%d).\n%s' % (object_name, p.returncode, error)) |
| 790 # Parse output by removing the line delimiter and splitting on the last ': '. |
| 791 content_type = output.rstrip().rpartition(': ')[2] |
| 792 else: |
| 793 content_type = mimetypes.guess_type(object_name)[0] |
| 794 if not content_type: |
| 795 content_type = self.DEFAULT_CONTENT_TYPE |
| 796 headers['Content-Type'] = content_type |
| 797 |
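For illustration, a sketch of the default (non-magicfile) content type detection above: guess from the file name and fall back to application/octet-stream when nothing matches.

    import mimetypes

    DEFAULT_CONTENT_TYPE = 'application/octet-stream'

    def guess_content_type(object_name):
      # mimetypes.guess_type returns (type, encoding); only the type is used.
      content_type = mimetypes.guess_type(object_name)[0]
      return content_type or DEFAULT_CONTENT_TYPE

    print(guess_content_type('cattypes.html'))   # text/html
    print(guess_content_type('README'))          # application/octet-stream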
| 798 def _UploadFileToObject(self, src_key, src_uri, dst_uri, headers, |
| 799 should_log=True): |
| 800 """Helper method for uploading a local file to an object. |
| 801 |
| 802 Args: |
| 803 src_key: Source StorageUri. Must be a file URI. |
| 804 src_uri: Source StorageUri. |
| 805 dst_uri: Destination StorageUri. |
| 806 headers: The headers dictionary. |
| 807 should_log: bool indicator whether we should log this operation. |
| 808 Returns: |
| 809 (elapsed_time, bytes_transferred) excluding overhead like initial HEAD. |
| 810 |
| 811 Raises: |
| 812 CommandException: if errors encountered. |
| 813 """ |
| 814 gzip_exts = [] |
| 815 canned_acl = None |
| 816 if self.sub_opts: |
| 817 for o, a in self.sub_opts: |
| 818 if o == '-a': |
| 819 canned_acls = dst_uri.canned_acls() |
| 820 if a not in canned_acls: |
| 821 raise CommandException('Invalid canned ACL "%s".' % a) |
| 822 canned_acl = a |
| 823 elif o == '-t': |
| 824 print('Warning: -t is deprecated, and will be removed in the future. ' |
| 825 'Content type\ndetection is ' |
| 826 'now performed by default, unless inhibited by specifying ' |
| 827 'a\nContent-Type header via the -h option.') |
| 828 elif o == '-z': |
| 829 gzip_exts = a.split(',') |
| 830 |
| 831 self._SetContentTypeHeader(src_uri, headers) |
| 832 if should_log: |
| 833 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 834 |
| 835 if 'Content-Language' not in headers: |
| 836 content_language = config.get_value('GSUtil', 'content_language') |
| 837 if content_language: |
| 838 headers['Content-Language'] = content_language |
| 839 |
| 840 fname_parts = src_uri.object_name.split('.') |
| 841 if len(fname_parts) > 1 and fname_parts[-1] in gzip_exts: |
| 842 if self.debug: |
| 843 print 'Compressing %s (to tmp)...' % src_key |
| 844 (gzip_fh, gzip_path) = tempfile.mkstemp() |
| 845 gzip_fp = None |
| 846 try: |
| 847 # Check for temp space. Assume the compressed object is at most 2x |
| 848 # the size of the object (normally should compress to smaller than |
| 849 # the object) |
| 850 if (self._CheckFreeSpace(gzip_path) |
| 851 < 2*int(os.path.getsize(src_key.name))): |
| 852 raise CommandException('Inadequate temp space available to compress ' |
| 853 '%s' % src_key.name) |
| 854 gzip_fp = gzip.open(gzip_path, 'wb') |
| 855 gzip_fp.writelines(src_key.fp) |
| 856 finally: |
| 857 if gzip_fp: |
| 858 gzip_fp.close() |
| 859 os.close(gzip_fh) |
| 860 headers['Content-Encoding'] = 'gzip' |
| 861 gzip_fp = open(gzip_path, 'rb') |
| 862 try: |
| 863 (elapsed_time, bytes_transferred) = ( |
| 864 self._PerformResumableUploadIfApplies(gzip_fp, dst_uri, |
| 865 canned_acl, headers)) |
| 866 finally: |
| 867 gzip_fp.close() |
| 868 try: |
| 869 os.unlink(gzip_path) |
| 870 # Windows sometimes complains the temp file is locked when you try to |
| 871 # delete it. |
| 872 except Exception, e: |
| 873 pass |
| 874 elif (src_key.is_stream() |
| 875 and dst_uri.get_provider().supports_chunked_transfer()): |
| 876 (elapsed_time, bytes_transferred) = self._PerformStreamingUpload( |
| 877 src_key.fp, dst_uri, headers, canned_acl) |
| 878 else: |
| 879 if src_key.is_stream(): |
| 880 # For providers that don't support chunked transfers. |
| 881 tmp = tempfile.NamedTemporaryFile() |
| 882 file_uri = self.suri_builder.StorageUri('file://%s' % tmp.name) |
| 883 try: |
| 884 file_uri.new_key(False, headers).set_contents_from_file( |
| 885 src_key.fp, headers) |
| 886 src_key = file_uri.get_key() |
| 887 finally: |
| 888 file_uri.close() |
| 889 try: |
| 890 (elapsed_time, bytes_transferred) = ( |
| 891 self._PerformResumableUploadIfApplies(src_key.fp, dst_uri, |
| 892 canned_acl, headers)) |
| 893 finally: |
| 894 if src_key.is_stream(): |
| 895 tmp.close() |
| 896 else: |
| 897 src_key.close() |
| 898 |
| 899 return (elapsed_time, bytes_transferred) |
| 900 |
| 901 def _DownloadObjectToFile(self, src_key, src_uri, dst_uri, headers, |
| 902 should_log=True): |
| 903 if should_log: |
| 904 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 905 (cb, num_cb, res_download_handler) = self._GetTransferHandlers( |
| 906 dst_uri, src_key.size, False) |
| 907 file_name = dst_uri.object_name |
| 908 dir_name = os.path.dirname(file_name) |
| 909 if dir_name and not os.path.exists(dir_name): |
| 910 # Do dir creation in try block so can ignore case where dir already |
| 911 # exists. This is needed to avoid a race condition when running gsutil |
| 912 # -m cp. |
| 913 try: |
| 914 os.makedirs(dir_name) |
| 915 except OSError, e: |
| 916 if e.errno != errno.EEXIST: |
| 917 raise |
| 918 # For gzipped objects not named *.gz download to a temp file and unzip. |
| 919 if (hasattr(src_key, 'content_encoding') |
| 920 and src_key.content_encoding == 'gzip' |
| 921 and not file_name.endswith('.gz')): |
| 922 # We can't use tempfile.mkstemp() here because we need a predictable |
| 923 # filename for resumable downloads. |
| 924 download_file_name = '%s_.gztmp' % file_name |
| 925 need_to_unzip = True |
| 926 else: |
| 927 download_file_name = file_name |
| 928 need_to_unzip = False |
| 929 fp = None |
| 930 try: |
| 931 if res_download_handler: |
| 932 fp = open(download_file_name, 'ab') |
| 933 else: |
| 934 fp = open(download_file_name, 'wb') |
| 935 start_time = time.time() |
| 936 if not self.parse_versions: |
| 937 src_key.generation = None |
| 938 src_key.get_contents_to_file(fp, headers, cb=cb, num_cb=num_cb, |
| 939 res_download_handler=res_download_handler) |
| 940 # If a custom test method is defined, call it here. For the copy command, |
| 941 # test methods are expected to take one argument: an open file pointer, |
| 942 # and are used to perturb the open file during download to exercise |
| 943 # download error detection. |
| 944 if self.test_method: |
| 945 self.test_method(fp) |
| 946 end_time = time.time() |
| 947 finally: |
| 948 if fp: |
| 949 fp.close() |
| 950 |
| 951 # Discard the md5 if we are resuming a partial download. |
| 952 if res_download_handler and res_download_handler.download_start_point: |
| 953 src_key.md5 = None |
| 954 |
| 955 # Verify downloaded file checksum matched source object's checksum. |
| 956 self._CheckFinalMd5(src_key, download_file_name) |
| 957 |
| 958 if res_download_handler: |
| 959 bytes_transferred = ( |
| 960 src_key.size - res_download_handler.download_start_point) |
| 961 else: |
| 962 bytes_transferred = src_key.size |
| 963 if need_to_unzip: |
| 964 # Log that we're uncompressing if the file is big enough that |
| 965 # decompressing would make it look like the transfer "stalled" at the end. |
| 966 if not self.quiet and bytes_transferred > 10 * 1024 * 1024: |
| 967 self.THREADED_LOGGER.info('Uncompressing downloaded tmp file to %s...', |
| 968 file_name) |
| 969 # Downloaded gzipped file to a filename w/o .gz extension, so unzip. |
| 970 f_in = gzip.open(download_file_name, 'rb') |
| 971 f_out = open(file_name, 'wb') |
| 972 try: |
| 973 while True: |
| 974 data = f_in.read(8192) |
| 975 if not data: |
| 976 break |
| 977 f_out.write(data) |
| 978 finally: |
| 979 f_out.close() |
| 980 f_in.close() |
| 981 os.unlink(download_file_name) |
| 982 return (end_time - start_time, bytes_transferred) |
| 983 |
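For illustration, the unzip step above in isolation: stream the gzipped temp file into the final file name in fixed-size chunks, then remove the temp file.

    import gzip
    import os

    def gunzip_to(download_file_name, file_name, chunk_size=8192):
      f_in = gzip.open(download_file_name, 'rb')
      f_out = open(file_name, 'wb')
      try:
        while True:
          data = f_in.read(chunk_size)
          if not data:
            break
          f_out.write(data)
      finally:
        f_out.close()
        f_in.close()
      os.unlink(download_file_name)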
| 984 def _PerformDownloadToStream(self, src_key, src_uri, str_fp, headers): |
| 985 (cb, num_cb, res_download_handler) = self._GetTransferHandlers( |
| 986 src_uri, src_key.size, False) |
| 987 start_time = time.time() |
| 988 src_key.get_contents_to_file(str_fp, headers, cb=cb, num_cb=num_cb) |
| 989 end_time = time.time() |
| 990 bytes_transferred = src_key.size |
| 991 end_time = time.time() |
| 992 return (end_time - start_time, bytes_transferred) |
| 993 |
| 994 def _CopyFileToFile(self, src_key, src_uri, dst_uri, headers): |
| 995 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 996 dst_key = dst_uri.new_key(False, headers) |
| 997 start_time = time.time() |
| 998 dst_key.set_contents_from_file(src_key.fp, headers) |
| 999 end_time = time.time() |
| 1000 return (end_time - start_time, os.path.getsize(src_key.fp.name)) |
| 1001 |
| 1002 def _CopyObjToObjDiffProvider(self, src_key, src_uri, dst_uri, headers): |
| 1003 self._SetContentTypeHeader(src_uri, headers) |
| 1004 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 1005 # If destination is GS we can avoid the local copying through a local file |
| 1006 # as GS supports chunked transfer. This also allows us to preserve metadata |
| 1007 # between original and destination object. |
| 1008 if dst_uri.scheme == 'gs': |
| 1009 canned_acl = None |
| 1010 if self.sub_opts: |
| 1011 for o, a in self.sub_opts: |
| 1012 if o == '-a': |
| 1013 canned_acls = dst_uri.canned_acls() |
| 1014 if a not in canned_acls: |
| 1015 raise CommandException('Invalid canned ACL "%s".' % a) |
| 1016 canned_acl = a |
| 1017 elif o == '-p': |
| 1018 # We don't attempt to preserve ACLs across providers because |
| 1019 # GCS and S3 support different ACLs. |
| 1020 raise NotImplementedError('Cross-provider cp -p not supported') |
| 1021 |
| 1022 # TODO: This _PerformStreamingUpload call passes in a Key for fp |
| 1023 # param, relying on Python "duck typing" (the fact that the lower-level |
| 1024 # methods that expect an fp only happen to call fp methods that are |
| 1025 # defined and semantically equivalent to those defined on src_key). This |
| 1026 # should be replaced by a class that wraps an fp interface around the |
| 1027 # Key, throwing 'not implemented' for methods (like seek) that aren't |
| 1028 # implemented by non-file Keys. |
| 1029 # NOTE: As of 7/28/2012 this bug now makes cross-provider copies into gs |
| 1030 # fail, because of boto changes that make that code now attempt to perform |
| 1031 # additional operations on the fp parameter, like seek() and tell(). |
| 1032 return self._PerformStreamingUpload(KeyFile(src_key), dst_uri, headers, canned_acl) |
| 1033 |
| 1034 # If destination is not GS we implement object copy through a local |
| 1035 # temp file. There are at least 3 downsides of this approach: |
| 1036 # 1. It doesn't preserve metadata from the src object when uploading to |
| 1037 # the dst object. |
| 1038 # 2. It requires enough temp space on the local disk to hold the file |
| 1039 # while transferring. |
| 1040 # 3. Killing the gsutil process partway through and then restarting will |
| 1041 # always repeat the download and upload, because the temp file name is |
| 1042 # different for each incarnation. (If however you just leave the |
| 1043 # process running and failures happen along the way, they will |
| 1044 # continue to restart and make progress as long as not too many |
| 1045 # failures happen in a row with no progress.) |
| 1046 tmp = tempfile.NamedTemporaryFile() |
| 1047 if self._CheckFreeSpace(tempfile.tempdir) < src_key.size: |
| 1048 raise CommandException('Inadequate temp space available to perform the ' |
| 1049 'requested copy') |
| 1050 start_time = time.time() |
| 1051 file_uri = self.suri_builder.StorageUri('file://%s' % tmp.name) |
| 1052 try: |
| 1053 self._DownloadObjectToFile(src_key, src_uri, file_uri, headers, False) |
| 1054 self._UploadFileToObject(file_uri.get_key(), file_uri, dst_uri, headers, |
| 1055 False) |
| 1056 finally: |
| 1057 tmp.close() |
| 1058 end_time = time.time() |
| 1059 return (end_time - start_time, src_key.size) |
| 1060 |
| 1061 def _PerformCopy(self, src_uri, dst_uri): |
| 1062 """Performs copy from src_uri to dst_uri, handling various special cases. |
| 1063 |
| 1064 Args: |
| 1065 src_uri: Source StorageUri. |
| 1066 dst_uri: Destination StorageUri. |
| 1067 |
| 1068 Returns: |
| 1069 (elapsed_time, bytes_transferred) excluding overhead like initial HEAD. |
| 1070 |
| 1071 Raises: |
| 1072 CommandException: if errors encountered. |
| 1073 """ |
| 1074 # Make a copy of the input headers each time so we can set a different |
| 1075 # content type for each object. |
| 1076 if self.headers: |
| 1077 headers = self.headers.copy() |
| 1078 else: |
| 1079 headers = {} |
| 1080 |
| 1081 src_key = src_uri.get_key(False, headers) |
| 1082 if not src_key: |
| 1083 raise CommandException('"%s" does not exist.' % src_uri) |
| 1084 |
| 1085 if self.no_clobber: |
| 1086 # There are two checks to prevent clobbering: |
| 1087 # 1) The first check is to see if the item |
| 1088 # already exists at the destination and prevent the upload/download |
| 1089 # from happening. This is done by the exists() call. |
| 1090 # 2) The second check is only relevant if we are writing to gs. We can |
| 1091 # enforce that the server only writes the object if it doesn't exist |
| 1092 # by specifying the header below. This check only happens at the |
| 1093 # server after the complete file has been uploaded. We specify this |
| 1094 # header to prevent a race condition where a destination file may |
| 1095 # be created after the first check and before the file is fully |
| 1096 # uploaded. |
| 1097 # In order to save on unnecessary uploads/downloads we perform both |
| 1098 # checks. However, this may come at the cost of additional HTTP calls. |
| 1099 if dst_uri.exists(headers): |
| 1100 if not self.quiet: |
| 1101 self.THREADED_LOGGER.info('Skipping existing item: %s' % |
| 1102 dst_uri.uri) |
| 1103 return (0, 0) |
| 1104 if dst_uri.is_cloud_uri() and dst_uri.scheme == 'gs': |
| 1105 headers['x-goog-if-generation-match'] = '0' |
| 1106 |
| 1107 if src_uri.is_cloud_uri() and dst_uri.is_cloud_uri(): |
| 1108 if src_uri.scheme == dst_uri.scheme: |
| 1109 return self._CopyObjToObjSameProvider(src_key, src_uri, dst_uri, |
| 1110 headers) |
| 1111 else: |
| 1112 return self._CopyObjToObjDiffProvider(src_key, src_uri, dst_uri, |
| 1113 headers) |
| 1114 elif src_uri.is_file_uri() and dst_uri.is_cloud_uri(): |
| 1115 return self._UploadFileToObject(src_key, src_uri, dst_uri, headers) |
| 1116 elif src_uri.is_cloud_uri() and dst_uri.is_file_uri(): |
| 1117 return self._DownloadObjectToFile(src_key, src_uri, dst_uri, headers) |
| 1118 elif src_uri.is_file_uri() and dst_uri.is_file_uri(): |
| 1119 return self._CopyFileToFile(src_key, src_uri, dst_uri, headers) |
| 1120 else: |
| 1121 raise CommandException('Unexpected src/dest case') |
| 1122 |
| 1123 def _ExpandDstUri(self, dst_uri_str): |
| 1124 """ |
| 1125 Expands wildcard if present in dst_uri_str. |
| 1126 |
| 1127 Args: |
| 1128 dst_uri_str: String representation of requested dst_uri. |
| 1129 |
| 1130 Returns: |
| 1131 (exp_dst_uri, have_existing_dst_container) |
| 1132 where have_existing_dst_container is a bool indicating whether |
| 1133 exp_dst_uri names an existing directory, bucket, or bucket subdirectory. |
| 1134 |
| 1135 Raises: |
| 1136 CommandException: if dst_uri_str matched more than 1 URI. |
| 1137 """ |
| 1138 dst_uri = self.suri_builder.StorageUri(dst_uri_str) |
| 1139 |
| 1140 # Handle wildcarded dst_uri case. |
| 1141 if ContainsWildcard(dst_uri): |
| 1142 blr_expansion = list(self.WildcardIterator(dst_uri)) |
| 1143 if len(blr_expansion) != 1: |
| 1144 raise CommandException('Destination (%s) must match exactly 1 URI' % |
| 1145 dst_uri_str) |
| 1146 blr = blr_expansion[0] |
| 1147 uri = blr.GetUri() |
| 1148 if uri.is_cloud_uri(): |
| 1149 return (uri, uri.names_bucket() or blr.HasPrefix() |
| 1150 or blr.GetKey().endswith('/')) |
| 1151 else: |
| 1152 return (uri, uri.names_directory()) |
| 1153 |
| 1154 # Handle non-wildcarded dst_uri: |
| 1155 if dst_uri.is_file_uri(): |
| 1156 return (dst_uri, dst_uri.names_directory()) |
| 1157 if dst_uri.names_bucket(): |
| 1158 return (dst_uri, True) |
| 1159 # For object URIs check 3 cases: (a) if the name ends with '/' treat as a |
| 1160 # subdir; else, perform a wildcard expansion with dst_uri + "*" and then |
| 1161 # find if (b) there's a Prefix matching dst_uri, or (c) name is of form |
| 1162 # dir_$folder$ (and in both these cases also treat dir as a subdir). |
| 1163 if dst_uri.is_cloud_uri() and dst_uri_str.endswith('/'): |
| 1164 return (dst_uri, True) |
| 1165 blr_expansion = list(self.WildcardIterator( |
| 1166 '%s*' % dst_uri_str.rstrip(dst_uri.delim))) |
| 1167 for blr in blr_expansion: |
| 1168 if blr.GetRStrippedUriString().endswith('_$folder$'): |
| 1169 return (dst_uri, True) |
| 1170 if blr.GetRStrippedUriString() == dst_uri_str.rstrip(dst_uri.delim): |
| 1171 return (dst_uri, blr.HasPrefix()) |
| 1172 return (dst_uri, False) |
| 1173 |
| 1174 def _ConstructDstUri(self, src_uri, exp_src_uri, |
| 1175 src_uri_names_container, src_uri_expands_to_multi, |
| 1176 have_multiple_srcs, exp_dst_uri, |
| 1177 have_existing_dest_subdir): |
| 1178 """ |
| 1179 Constructs the destination URI for a given exp_src_uri/exp_dst_uri pair, |
| 1180 using context-dependent naming rules that mimic Unix cp and mv behavior. |
| 1181 |
| 1182 Args: |
| 1183 src_uri: src_uri to be copied. |
| 1184 exp_src_uri: Single StorageUri from wildcard expansion of src_uri. |
| 1185 src_uri_names_container: True if src_uri names a container (including the |
| 1186 case of a wildcard-named bucket subdir (like gs://bucket/abc, |
| 1187 where gs://bucket/abc/* matched some objects). Note that this is |
| 1188 additional semantics that src_uri.names_container() doesn't understand |
| 1189 because the latter only understands StorageUris, not wildcards. |
| 1190 src_uri_expands_to_multi: True if src_uri expanded to multiple URIs. |
| 1191 have_multiple_srcs: True if this is a multi-source request. This can be |
| 1192 true if src_uri wildcard-expanded to multiple URIs or if there were |
| 1193 multiple source URIs in the request. |
| 1194 exp_dst_uri: the expanded StorageUri requested for the cp destination. |
| 1195 Final written path is constructed from this plus a context-dependent |
| 1196 variant of src_uri. |
| 1197 have_existing_dest_subdir: bool indicator whether dest is an existing |
| 1198 subdirectory. |
| 1199 |
| 1200 Returns: |
| 1201 StorageUri to use for copy. |
| 1202 |
| 1203 Raises: |
| 1204 CommandException if destination object name not specified for |
| 1205 source and source is a stream. |
| 1206 """ |
| 1207 if self._ShouldTreatDstUriAsSingleton( |
| 1208 have_multiple_srcs, have_existing_dest_subdir, exp_dst_uri): |
| 1209 # We're copying one file or object to one file or object. |
| 1210 return exp_dst_uri |
| 1211 |
| 1212 if exp_src_uri.is_stream(): |
| 1213 if exp_dst_uri.names_container(): |
| 1214 raise CommandException('Destination object name needed when ' |
| 1215 'source is a stream') |
| 1216 return exp_dst_uri |
| 1217 |
| 1218 if not self.recursion_requested and not have_multiple_srcs: |
| 1219 # We're copying one file or object to a subdirectory. Append final comp |
| 1220 # of exp_src_uri to exp_dst_uri. |
| 1221 src_final_comp = exp_src_uri.object_name.rpartition(src_uri.delim)[-1] |
| 1222 return self.suri_builder.StorageUri('%s%s%s' % ( |
| 1223 exp_dst_uri.uri.rstrip(exp_dst_uri.delim), exp_dst_uri.delim, |
| 1224 src_final_comp)) |
| 1225 |
| 1226 # Else we're copying multiple sources to a directory, bucket, or a bucket |
| 1227 # "sub-directory". |
| 1228 |
| 1229 # Ensure exp_dst_uri ends in delim char if we're doing a multi-src copy or |
| 1230 # a copy to a directory. (The check for copying to a directory needs |
| 1231 # special-case handling so that the command: |
| 1232 # gsutil cp gs://bucket/obj dir |
| 1233 # will turn into file://dir/ instead of file://dir -- the latter would cause |
| 1234 # the file "dirobj" to be created.) |
| 1235 # Note: need to check have_multiple_srcs or src_uri.names_container() |
| 1236 # because src_uri could be a bucket containing a single object, named |
| 1237 # as gs://bucket. |
| 1238 if ((have_multiple_srcs or src_uri.names_container() |
| 1239 or os.path.isdir(exp_dst_uri.object_name)) |
| 1240 and not exp_dst_uri.uri.endswith(exp_dst_uri.delim)): |
| 1241 exp_dst_uri = exp_dst_uri.clone_replace_name( |
| 1242 '%s%s' % (exp_dst_uri.object_name, exp_dst_uri.delim) |
| 1243 ) |
| 1244 |
| 1245 # Making naming behavior match how things work with local Unix cp and mv |
| 1246 # operations depends on many factors, including whether the destination is a |
| 1247 # container, the plurality of the source(s), and whether the mv command is |
| 1248 # being used: |
| 1249 # 1. For the "mv" command that specifies a non-existent destination subdir, |
| 1250 # renaming should occur at the level of the src subdir, vs appending that |
| 1251 # subdir beneath the dst subdir like is done for copying. For example: |
| 1252 # gsutil rm -R gs://bucket |
| 1253 # gsutil cp -R cloudreader gs://bucket |
| 1254 # gsutil cp -R cloudauth gs://bucket/subdir1 |
| 1255 # gsutil mv gs://bucket/subdir1 gs://bucket/subdir2 |
| 1256 # would (if using cp naming behavior) end up with paths like: |
| 1257 # gs://bucket/subdir2/subdir1/cloudauth/.svn/all-wcprops |
| 1258 # whereas mv naming behavior should result in: |
| 1259 # gs://bucket/subdir2/cloudauth/.svn/all-wcprops |
| 1260 # 2. Copying from directories, buckets, or bucket subdirs should result in |
| 1261 # objects/files mirroring the source directory hierarchy. For example: |
| 1262 # gsutil cp dir1/dir2 gs://bucket |
| 1263 # should create the object gs://bucket/dir2/file2, assuming dir1/dir2 |
| 1264 # contains file2. |
| 1265 # To be consistent with Unix cp behavior, there's one more wrinkle when |
| 1266 # working with subdirs: The resulting object names depend on whether the |
| 1267 # destination subdirectory exists. For example, if gs://bucket/subdir |
| 1268 # exists, the command: |
| 1269 # gsutil cp -R dir1/dir2 gs://bucket/subdir |
| 1270 # should create objects named like gs://bucket/subdir/dir2/a/b/c. In |
| 1271 # contrast, if gs://bucket/subdir does not exist, this same command |
| 1272 # should create objects named like gs://bucket/subdir/a/b/c. |
| 1273 # 3. Copying individual files or objects to dirs, buckets or bucket subdirs |
| 1274 # should result in objects/files named by the final source file name |
| 1275 # component. Example: |
| 1276 # gsutil cp dir1/*.txt gs://bucket |
| 1277 # should create the objects gs://bucket/f1.txt and gs://bucket/f2.txt, |
| 1278 # assuming dir1 contains f1.txt and f2.txt. |
| 1279 |
| 1280 if (self.perform_mv and self.recursion_requested |
| 1281 and src_uri_expands_to_multi and not have_existing_dest_subdir): |
| 1282 # Case 1. Handle naming rules for bucket subdir mv. Here we want to |
| 1283 # line up the src_uri against its expansion, to find the base to build |
| 1284 # the new name. For example, running the command: |
| 1285 # gsutil mv gs://bucket/abcd gs://bucket/xyz |
| 1286 # when processing exp_src_uri=gs://bucket/abcd/123 |
| 1287 # exp_src_uri_tail should become /123 |
| 1288 # Note: mv.py code disallows wildcard specification of source URI. |
| 1289 exp_src_uri_tail = exp_src_uri.uri[len(src_uri.uri):] |
| 1290 dst_key_name = '%s/%s' % (exp_dst_uri.object_name.rstrip('/'), |
| 1291 exp_src_uri_tail.strip('/')) |
| 1292 return exp_dst_uri.clone_replace_name(dst_key_name) |
| 1293 |
| 1294 if src_uri_names_container and not exp_dst_uri.names_file(): |
| 1295 # Case 2. Build dst_key_name from subpath of exp_src_uri past |
| 1296 # where src_uri ends. For example, for src_uri=gs://bucket/ and |
| 1297 # exp_src_uri=gs://bucket/src_subdir/obj, dst_key_name should be |
| 1298 # src_subdir/obj. |
| 1299 src_uri_path_sans_final_dir = _GetPathBeforeFinalDir(src_uri) |
| 1300 dst_key_name = exp_src_uri.uri[ |
| 1301 len(src_uri_path_sans_final_dir):].lstrip(src_uri.delim) |
| 1302 # Handle case where dst_uri is a non-existent subdir. |
| 1303 if not have_existing_dest_subdir: |
| 1304 dst_key_name = dst_key_name.partition(exp_dst_uri.delim)[-1] |
| 1305 # Handle special case where src_uri was a directory named with '.' or |
| 1306 # './', so that running a command like: |
| 1307 # gsutil cp -r . gs://dest |
| 1308 # will produce obj names of the form gs://dest/abc instead of |
| 1309 # gs://dest/./abc. |
| 1310 if dst_key_name.startswith('./'): |
| 1311 dst_key_name = dst_key_name[2:] |
| 1312 |
| 1313 else: |
| 1314 # Case 3. |
| 1315 dst_key_name = exp_src_uri.object_name.rpartition(src_uri.delim)[-1] |
| 1316 |
| 1317 if (exp_dst_uri.is_file_uri() |
| 1318 or self._ShouldTreatDstUriAsBucketSubDir( |
| 1319 have_multiple_srcs, exp_dst_uri, have_existing_dest_subdir)): |
| 1320 if exp_dst_uri.object_name.endswith(exp_dst_uri.delim): |
| 1321 dst_key_name = '%s%s%s' % ( |
| 1322 exp_dst_uri.object_name.rstrip(exp_dst_uri.delim), |
| 1323 exp_dst_uri.delim, dst_key_name) |
| 1324 else: |
| 1325 dst_key_name = '%s%s' % (exp_dst_uri.object_name, dst_key_name) |
| 1326 |
| 1327 return exp_dst_uri.clone_replace_name(dst_key_name) |
| 1328 |
| 1329 def _FixWindowsNaming(self, src_uri, dst_uri): |
| 1330 """ |
| 1331 Rewrites the destination URI built by _ConstructDstUri() to translate |
| 1332 Windows pathnames to cloud pathnames if needed. |
| 1333 |
| 1334 Args: |
| 1335 src_uri: Source URI to be copied. |
| 1336 dst_uri: The destination URI built by _ConstructDstUri(). |
| 1337 |
| 1338 Returns: |
| 1339 StorageUri to use for copy. |
| 1340 """ |
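|          # Illustrative note (hypothetical names, not from the original source):
|          # on Windows, src_uri.delim is '\', so the dst_uri built by
|          # _ConstructDstUri() can look like gs://bucket/subdir\file.txt; the
|          # substitution below rewrites it to gs://bucket/subdir/file.txt.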
| 1341 if (src_uri.is_file_uri() and src_uri.delim == '\\' |
| 1342 and dst_uri.is_cloud_uri()): |
| 1343 trans_uri_str = re.sub(r'\\', '/', dst_uri.uri) |
| 1344 dst_uri = self.suri_builder.StorageUri(trans_uri_str) |
| 1345 return dst_uri |
| 1346 |
| 1347 # Command entry point. |
| 1348 def RunCommand(self): |
| 1349 |
| 1350 # Inner funcs. |
| 1351 def _CopyExceptionHandler(e): |
| 1352 """Simple exception handler to allow post-completion status.""" |
| 1353 self.THREADED_LOGGER.error(str(e)) |
| 1354 self.copy_failure_count += 1 |
| 1355 |
| 1356 def _CopyFunc(name_expansion_result): |
| 1357 """Worker function for performing the actual copy (and rm, for mv).""" |
| 1358 if self.perform_mv: |
| 1359 cmd_name = 'mv' |
| 1360 else: |
| 1361 cmd_name = self.command_name |
| 1362 src_uri = self.suri_builder.StorageUri( |
| 1363 name_expansion_result.GetSrcUriStr()) |
| 1364 exp_src_uri = self.suri_builder.StorageUri( |
| 1365 name_expansion_result.GetExpandedUriStr(), |
| 1366 parse_version=self.parse_versions) |
| 1367 src_uri_names_container = name_expansion_result.NamesContainer() |
| 1368 src_uri_expands_to_multi = name_expansion_result.NamesContainer() |
| 1369 have_multiple_srcs = name_expansion_result.IsMultiSrcRequest() |
| 1370 have_existing_dest_subdir = ( |
| 1371 name_expansion_result.HaveExistingDstContainer()) |
| 1372 |
| 1373 if src_uri.names_provider(): |
| 1374 raise CommandException( |
| 1375 'The %s command does not allow provider-only source URIs (%s)' % |
| 1376 (cmd_name, src_uri)) |
| 1377 if have_multiple_srcs: |
| 1378 self._InsistDstUriNamesContainer(exp_dst_uri, |
| 1379 have_existing_dst_container, |
| 1380 cmd_name) |
| 1381 |
| 1382 if self.perform_mv: |
| 1383 # Disallow files as source arguments to protect users from deleting |
| 1384 # data off the local disk. Note that we can't simply set FILE_URIS_OK |
| 1385 # to False in command_spec because we *do* allow a file URI for the dest |
| 1386 # URI. (We allow users to move data out of the cloud to the local disk, |
| 1387 # but we disallow commands that would delete data off the local disk, |
| 1388 # and instead require the user to delete data separately, using local |
| 1389 # commands/tools.) |
| 1390 if src_uri.is_file_uri(): |
| 1391 raise CommandException('The mv command disallows files as source ' |
| 1392 'arguments.\nDid you mean to use a gs:// URI? ' |
| 1393 'If you meant to use a file as a source, you\n' |
| 1394 'might consider using the "cp" command ' |
| 1395 'instead.') |
| 1396 if name_expansion_result.NamesContainer(): |
| 1397 # Use recursion_requested when performing name expansion for the |
| 1398 # directory mv case so we can determine if any of the source URIs are |
| 1399           # directories (and then use cp -R and rm -R to perform the move, to
| 1400           # match the behavior of Unix mv, which moves all of the contained
| 1401           # files when moving a directory).
| 1402 self.recursion_requested = True |
| 1403           # Disallow wildcard src URIs when moving directories, as supporting them
| 1404 # would make the name transformation too complex and would also be |
| 1405 # dangerous (e.g., someone could accidentally move many objects to the |
| 1406 # wrong name, or accidentally overwrite many objects). |
| 1407 if ContainsWildcard(src_uri): |
| 1408 raise CommandException('The mv command disallows naming source ' |
| 1409 'directories using wildcards') |
| 1410 |
| 1411 if (exp_dst_uri.is_file_uri() |
| 1412 and not os.path.exists(exp_dst_uri.object_name) |
| 1413 and have_multiple_srcs): |
| 1414 os.makedirs(exp_dst_uri.object_name) |
| 1415 |
| 1416 dst_uri = self._ConstructDstUri(src_uri, exp_src_uri, |
| 1417 src_uri_names_container, |
| 1418 src_uri_expands_to_multi, |
| 1419 have_multiple_srcs, exp_dst_uri, |
| 1420 have_existing_dest_subdir) |
| 1421 dst_uri = self._FixWindowsNaming(src_uri, dst_uri) |
| 1422 |
| 1423 self._CheckForDirFileConflict(exp_src_uri, dst_uri) |
| 1424 if self._SrcDstSame(exp_src_uri, dst_uri): |
| 1425 raise CommandException('%s: "%s" and "%s" are the same file - ' |
| 1426 'abort.' % (cmd_name, exp_src_uri, dst_uri)) |
| 1427 |
| 1428 elapsed_time = bytes_transferred = 0 |
| 1429 try: |
| 1430 (elapsed_time, bytes_transferred) = self._PerformCopy(exp_src_uri, |
| 1431 dst_uri) |
| 1432 except Exception, e: |
| 1433 if self._IsNoClobberServerException(e): |
| 1434 if not self.quiet: |
| 1435 self.THREADED_LOGGER.info('Rejected (noclobber): %s' % dst_uri.uri) |
| 1436 elif self.continue_on_error: |
| 1437 if not self.quiet: |
| 1438 self.THREADED_LOGGER.error('Error copying %s: %s' % (src_uri.uri, |
| 1439 str(e))) |
| 1440 self.copy_failure_count += 1 |
| 1441 else: |
| 1442 raise |
| 1443 |
| 1444 # TODO: If we ever use -n (noclobber) with -M (move) (not possible today |
| 1445 # since we call copy internally from move and don't specify the -n flag) |
| 1446 # we'll need to only remove the source when we have not skipped the |
| 1447 # destination. |
| 1448 if self.perform_mv: |
| 1449 if not self.quiet: |
| 1450 self.THREADED_LOGGER.info('Removing %s...', exp_src_uri) |
| 1451 exp_src_uri.delete_key(validate=False, headers=self.headers) |
| 1452 stats_lock.acquire() |
| 1453 self.total_elapsed_time += elapsed_time |
| 1454 self.total_bytes_transferred += bytes_transferred |
| 1455 stats_lock.release() |
| 1456 |
| 1457 # Start of RunCommand code. |
| 1458 self._ParseArgs() |
| 1459 |
| 1460 self.total_elapsed_time = self.total_bytes_transferred = 0 |
| 1461 if self.args[-1] == '-' or self.args[-1] == 'file://-': |
| 1462 self._HandleStreamingDownload() |
| 1463 return 0 |
| 1464 |
| 1465 if self.read_args_from_stdin: |
| 1466 if len(self.args) != 1: |
| 1467 raise CommandException('Source URIs cannot be specified with -I option') |
| 1468 uri_strs = self._StdinIterator() |
| 1469 else: |
| 1470 if len(self.args) < 2: |
| 1471 raise CommandException('Wrong number of arguments for "cp" command.') |
| 1472 uri_strs = self.args[0:len(self.args)-1] |
| 1473 |
| 1474 (exp_dst_uri, have_existing_dst_container) = self._ExpandDstUri( |
| 1475 self.args[-1]) |
| 1476 name_expansion_iterator = NameExpansionIterator( |
| 1477 self.command_name, self.proj_id_handler, self.headers, self.debug, |
| 1478 self.bucket_storage_uri_class, uri_strs, |
| 1479 self.recursion_requested or self.perform_mv, |
| 1480 have_existing_dst_container) |
| 1481 |
| 1482 # Use a lock to ensure accurate statistics in the face of |
| 1483 # multi-threading/multi-processing. |
| 1484 stats_lock = threading.Lock() |
| 1485 |
| 1486 # Tracks if any copies failed. |
| 1487 self.copy_failure_count = 0 |
| 1488 |
| 1489 # Start the clock. |
| 1490 start_time = time.time() |
| 1491 |
| 1492 # Tuple of attributes to share/manage across multiple processes in |
| 1493 # parallel (-m) mode. |
| 1494 shared_attrs = ('copy_failure_count', 'total_bytes_transferred') |
| 1495 |
| 1496 # Perform copy requests in parallel (-m) mode, if requested, using |
| 1497 # configured number of parallel processes and threads. Otherwise, |
| 1498 # perform requests with sequential function calls in current process. |
| 1499 self.Apply(_CopyFunc, name_expansion_iterator, _CopyExceptionHandler, |
| 1500 shared_attrs) |
| 1501 if self.debug: |
| 1502 print 'total_bytes_transferred:' + str(self.total_bytes_transferred) |
| 1503 |
| 1504 end_time = time.time() |
| 1505 self.total_elapsed_time = end_time - start_time |
| 1506 |
| 1507 if self.debug == 3: |
| 1508 # Note that this only counts the actual GET and PUT bytes for the copy |
| 1509 # - not any transfers for doing wildcard expansion, the initial HEAD |
| 1510 # request boto performs when doing a bucket.get_key() operation, etc. |
| 1511 if self.total_bytes_transferred != 0: |
| 1512 self.THREADED_LOGGER.info( |
| 1513 'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)', |
| 1514 self.total_bytes_transferred, self.total_elapsed_time, |
| 1515 MakeHumanReadable(float(self.total_bytes_transferred) / |
| 1516 float(self.total_elapsed_time))) |
| 1517 if self.copy_failure_count: |
| 1518 plural_str = '' |
| 1519 if self.copy_failure_count > 1: |
| 1520 plural_str = 's' |
| 1521 raise CommandException('%d file%s/object%s could not be transferred.' % ( |
| 1522 self.copy_failure_count, plural_str, plural_str)) |
| 1523 |
| 1524 return 0 |
| 1525 |
| 1526 # Test specification. See definition of test_steps in base class for |
| 1527 # details on how to populate these fields. |
| 1528 num_test_buckets = 3 |
| 1529 test_steps = [ |
| 1530 # (test name, cmd line, ret code, (result_file, expect_file)) |
| 1531 # |
| 1532 # Testing of no-clobber behavior. |
| 1533 ('upload', 'gsutil cp $F1 gs://$B1/$O1', 0, None), |
| 1534 ('setup noclobber', 'echo Skipping >noclob.ct', 0, None), |
| 1535 ('download', 'gsutil cp gs://$B1/$O1 $F9', 0, ('$F9', '$F1')), |
| 1536 ('noclobber upload', |
| 1537 'gsutil cp -n $F1 gs://$B1/$O1 2>&1 >/dev/null | cut -c1-8 >noclob.ct1', |
| 1538 0, ('noclob.ct', 'noclob.ct1')), |
| 1539 ('noclobber download', |
| 1540 'gsutil cp -n gs://$B1/$O1 $F9 2>&1 >/dev/null | cut -c1-8 >noclob.ct2', |
| 1541 0, ('noclob.ct', 'noclob.ct2')), |
| 1542 # |
| 1543 # Testing of copy-in-the-cloud: |
| 1544 ('noclobber setup copy-in-the-cloud', |
| 1545 'gsutil cp gs://$B1/$O1 gs://$B2', 0, None), |
| 1546 ('noclobber verify copy-in-the-cloud', |
| 1547 'gsutil cp -n gs://$B1/$O1 gs://$B2 2>&1 ' |
| 1548 '>/dev/null | cut -c1-8 > noclob.ct3', 0, ('noclob.ct', 'noclob.ct3')), |
| 1549 ('stream upload', 'cat $F1 | gsutil cp - gs://$B1/$O1', 0, None), |
| 1550 ('check stream upload', 'gsutil cp gs://$B1/$O1 $F9', 0, ('$F9', '$F1')), |
| 1551 # Clean up if we got interrupted. |
| 1552 ('remove test files', |
| 1553 'rm -f test.mp3 test_mp3.ct test.gif test_gif.ct test.foo noclob.ct*', |
| 1554 0, None), |
| 1555 # |
| 1556 # Testing of Content-Type detection: |
| 1557 ('setup mp3 file', 'cp $G/gslib/test_data/test.mp3 test.mp3', 0, None), |
| 1558 ('setup mp3 CT', 'echo audio/mpeg >test_mp3.ct', 0, None), |
| 1559 ('setup gif file', 'cp $G/gslib/test_data/test.gif test.gif', 0, None), |
| 1560 ('setup gif CT', 'echo image/gif >test_gif.ct', 0, None), |
| 1561 # TODO: we don't need test.app and test.bin anymore if |
| 1562 # USE_MAGICFILE=True. Implement a way to test both with and without using |
| 1563 # magic file. |
| 1564 #('setup app file', 'echo application/octet-stream >test.app', 0, None), |
| 1565 ('setup foo file', 'echo foo/bar >test.foo', 0, None), |
| 1566 ('upload mp3', 'gsutil cp test.mp3 gs://$B1/$O1', 0, None), |
| 1567 ('verify mp3', |
| 1568 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1569 0, ('$F1', 'test_mp3.ct')), |
| 1570 ('upload gif', 'gsutil cp test.gif gs://$B1/$O1', 0, None), |
| 1571 ('verify gif', |
| 1572 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1573 0, ('$F1', 'test_gif.ct')), |
| 1574 # TODO: The commented-out /noCT test below fails with USE_MAGICFILE=True. |
| 1575 ('upload mp3/noCT', |
| 1576 'gsutil -h "Content-Type:" cp test.mp3 gs://$B1/$O1', 0, None), |
| 1577 ('verify mp3/noCT', |
| 1578 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1579 0, ('$F1', 'test_mp3.ct')), |
| 1580 ('upload gif/noCT', |
| 1581 'gsutil -h "Content-Type:" cp test.gif gs://$B1/$O1', 0, None), |
| 1582 ('verify gif/noCT', |
| 1583 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1584 0, ('$F1', 'test_gif.ct')), |
| 1585 #('upload foo/noCT', 'gsutil -h "Content-Type:" cp test.foo gs://$B1/$O1', |
| 1586 # 0, None), |
| 1587 #('verify foo/noCT', |
| 1588 # 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1589 # 0, ('$F1', 'test_bin.ct')), |
| 1590 ('upload mp3/-h gif', |
| 1591 'gsutil -h "Content-Type:image/gif" cp test.mp3 gs://$B1/$O1', 0, None), |
| 1592 ('verify mp3/-h gif', |
| 1593 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1594 0, ('$F1', 'test_gif.ct')), |
| 1595 ('upload gif/-h gif', |
| 1596 'gsutil -h "Content-Type:image/gif" cp test.gif gs://$B1/$O1', 0, None), |
| 1597 ('verify gif/-h gif', |
| 1598 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1599 0, ('$F1', 'test_gif.ct')), |
| 1600 ('upload foo/-h gif', |
| 1601 'gsutil -h "Content-Type: image/gif" cp test.foo gs://$B1/$O1', 0, None), |
| 1602 ('verify foo/-h gif', |
| 1603 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1604 0, ('$F1', 'test_gif.ct')), |
| 1605 ('remove test files', |
| 1606 'rm -f test.mp3 test_mp3.ct test.gif test_gif.ct test.foo', 0, None), |
| 1607 # |
| 1608 # Testing of versioning: |
| 1609     # To make the following tests simpler to understand, note that the only
| 1610     # prior write to bucket $B2 was by the copy-in-the-cloud tests above.
| 1611 ('enable versioning', 'gsutil setversioning on gs://$B2', 0, None), |
| 1612 ('upload new version', 'echo \'data1\' | gsutil cp - gs://$B2/$O3', 0, |
| 1613 None), |
| 1614 ('cloud cp new version', 'gsutil cp gs://$B2/$O0 gs://$B2/$O3', 0, None), |
| 1615 ('upload new version', 'echo \'data2\' | gsutil cp - gs://$B2/$O3', 0, |
| 1616 None), |
| 1617 ('get first generation name', 'gsutil ls -a gs://$B2/$O3 | head -n 1 > $F9', |
| 1618 0, None), |
| 1619 ('setup data1 file', 'echo \'data1\' > $F1', 0, None), |
| 1620 ('setup data2 file', 'echo \'data2\' > $F2', 0, None), |
| 1621 ('download current version', 'gsutil cp gs://$B2/$O3 $F3', 0, |
| 1622 ('$F3', '$F2')), |
| 1623 ('download first version', 'gsutil cp -v `cat $F9` $F3', 0, |
| 1624 ('$F3', '$F1')), |
| 1625     ('cp first version to current', 'gsutil cp -v `cat $F9` gs://$B2/$O3', 0,
| 1626 None), |
| 1627 ('download current version', 'gsutil cp gs://$B2/$O3 $F3', 0, |
| 1628 ('$F3', '$F1')), |
| 1629 ('remove versioning test files', 'rm -f $F3 $F9', 0, None), |
| 1630 # |
| 1631 # Testing of taking args from stdin: |
| 1632 ('remove bucket B2 contents', 'gsutil -m rm -r gs://$B2', 0, None), |
| 1633 ('setup inputlist file, pt1', 'echo $F1 > $F3', 0, None), |
| 1634     ('setup inputlist file, pt2', 'echo $F2 >> $F3', 0, None),
| 1635 ('setup expected ls output, pt1', 'echo gs://$B2/$F1 > $F4', 0, None), |
| 1636 ('setup expected ls output, pt2', 'echo gs://$B2/$F2 >> $F4', 0, None), |
| 1637 ('perform inputlist copy', 'gsutil cp -I gs://$B2 < $F3', 0, None), |
| 1638 ('verify inputlist copy', 'gsutil ls gs://$B2 > $F5', 0, |
| 1639 ('$F5', '$F4')), |
| 1640 ] |
| 1641 |
| 1642 def _ParseArgs(self): |
| 1643 self.perform_mv = False |
| 1644 self.exclude_symlinks = False |
| 1645 self.quiet = False |
| 1646 self.no_clobber = False |
| 1647 self.continue_on_error = False |
| 1648 self.read_args_from_stdin = False |
| 1649 # self.recursion_requested initialized in command.py (so can be checked |
| 1650 # in parent class for all commands). |
| 1651 if self.sub_opts: |
| 1652 for o, unused_a in self.sub_opts: |
| 1653 if o == '-c': |
| 1654 self.continue_on_error = True |
| 1655 elif o == '-e': |
| 1656 self.exclude_symlinks = True |
| 1657 elif o == '-I': |
| 1658 self.read_args_from_stdin = True |
| 1659 elif o == '-M': |
| 1660 # Note that we signal to the cp command to perform a move (copy |
| 1661 # followed by remove) and use directory-move naming rules by passing |
| 1662 # the undocumented (for internal use) -M option when running the cp |
| 1663 # command from mv.py. |
| 1664 self.perform_mv = True |
| 1665 elif o == '-n': |
| 1666 self.no_clobber = True |
| 1667 elif o == '-q': |
| 1668 self.quiet = True |
| 1669 elif o == '-r' or o == '-R': |
| 1670 self.recursion_requested = True |
| 1671 elif o == '-v': |
| 1672 self.parse_versions = True |
| 1673 |
| 1674 def _HandleStreamingDownload(self): |
| 1675 # Destination is <STDOUT>. Manipulate sys.stdout so as to redirect all |
| 1676 # debug messages to <STDERR>. |
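|          # For example (an illustrative sketch, not from the original source), a
|          # command like `gsutil cp gs://bucket/*.txt -` streams each matched object
|          # to the process's original stdout in turn, while debug output goes to
|          # stderr via the swap below.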
| 1677 stdout_fp = sys.stdout |
| 1678 sys.stdout = sys.stderr |
| 1679 did_some_work = False |
| 1680 for uri_str in self.args[0:len(self.args)-1]: |
| 1681 for uri in self.WildcardIterator(uri_str).IterUris(): |
| 1682 if not uri.names_object(): |
| 1683           raise CommandException('Destination stream requires that '
| 1684                                  'source URI %s represent an object.' % uri)
| 1685 did_some_work = True |
| 1686 key = uri.get_key(False, self.headers) |
| 1687 (elapsed_time, bytes_transferred) = self._PerformDownloadToStream( |
| 1688 key, uri, stdout_fp, self.headers) |
| 1689 self.total_elapsed_time += elapsed_time |
| 1690 self.total_bytes_transferred += bytes_transferred |
| 1691 if not did_some_work: |
| 1692 raise CommandException('No URIs matched') |
| 1693 if self.debug == 3: |
| 1694 if self.total_bytes_transferred != 0: |
| 1695 self.THREADED_LOGGER.info( |
| 1696 'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)', |
| 1697 self.total_bytes_transferred, self.total_elapsed_time, |
| 1698 MakeHumanReadable(float(self.total_bytes_transferred) / |
| 1699 float(self.total_elapsed_time))) |
| 
| 1700   def _StdinIterator(self):
| 1701     """A generator that yields lines read from stdin."""
| 1702     for line in sys.stdin:
| 1703       # Strip trailing whitespace, including any CRLF.
| 1704 yield line.rstrip() |
| 1705 |
| 1706 def _SrcDstSame(self, src_uri, dst_uri): |
| 1707 """Checks if src_uri and dst_uri represent the same object or file. |
| 1708 |
| 1709     We make no attempt to detect hard or symbolic links.
| 1710 |
| 1711 Args: |
| 1712 src_uri: Source StorageUri. |
| 1713 dst_uri: Destination StorageUri. |
| 1714 |
| 1715 Returns: |
| 1716 Bool indicator. |
| 1717 """ |
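|          # Illustrative example (hypothetical paths, not from the original source):
|          # for `gsutil cp ./file.txt file.txt`, the normalization below reduces both
|          # paths to 'file.txt', so the command is rejected as a same-file copy.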
| 1718 if src_uri.is_file_uri() and dst_uri.is_file_uri(): |
| 1719 # Translate a/b/./c to a/b/c, so src=dst comparison below works. |
| 1720 new_src_path = re.sub('%s+\.%s+' % (os.sep, os.sep), os.sep, |
| 1721 src_uri.object_name) |
| 1722       new_src_path = re.sub('^\.%s+' % os.sep, '', new_src_path)
| 1723 new_dst_path = re.sub('%s+\.%s+' % (os.sep, os.sep), os.sep, |
| 1724 dst_uri.object_name) |
| 1725       new_dst_path = re.sub('^\.%s+' % os.sep, '', new_dst_path)
| 1726 return (src_uri.clone_replace_name(new_src_path).uri == |
| 1727 dst_uri.clone_replace_name(new_dst_path).uri) |
| 1728 else: |
| 1729 return (src_uri.uri == dst_uri.uri and |
| 1730 src_uri.generation == dst_uri.generation and |
| 1731 src_uri.version_id == dst_uri.version_id) |
| 1732 |
| 1733 def _ShouldTreatDstUriAsBucketSubDir(self, have_multiple_srcs, dst_uri, |
| 1734 have_existing_dest_subdir): |
| 1735 """ |
| 1736 Checks whether dst_uri should be treated as a bucket "sub-directory". The |
| 1737 decision about whether something constitutes a bucket "sub-directory" |
| 1738 depends on whether there are multiple sources in this request and whether |
| 1739 there is an existing bucket subdirectory. For example, when running the |
| 1740 command: |
| 1741 gsutil cp file gs://bucket/abc |
| 1742 if there's no existing gs://bucket/abc bucket subdirectory we should copy |
| 1743 file to the object gs://bucket/abc. In contrast, if |
| 1744 there's an existing gs://bucket/abc bucket subdirectory we should copy |
| 1745 file to gs://bucket/abc/file. And regardless of whether gs://bucket/abc |
| 1746 exists, when running the command: |
| 1747 gsutil cp file1 file2 gs://bucket/abc |
| 1748 we should copy file1 to gs://bucket/abc/file1 (and similarly for file2). |
| 1749 |
| 1750 Note that we don't disallow naming a bucket "sub-directory" where there's |
| 1751 already an object at that URI. For example it's legitimate (albeit |
| 1752 confusing) to have an object called gs://bucket/dir and |
| 1753 then run the command |
| 1754 gsutil cp file1 file2 gs://bucket/dir |
| 1755 Doing so will end up with objects gs://bucket/dir, gs://bucket/dir/file1, |
| 1756 and gs://bucket/dir/file2. |
| 1757 |
| 1758 Args: |
| 1759 have_multiple_srcs: Bool indicator of whether this is a multi-source |
| 1760 operation. |
| 1761 dst_uri: StorageUri to check. |
| 1762 have_existing_dest_subdir: bool indicator whether dest is an existing |
| 1763 subdirectory. |
| 1764 |
| 1765 Returns: |
| 1766 bool indicator. |
| 1767 """ |
| 1768 return ((have_multiple_srcs and dst_uri.is_cloud_uri()) |
| 1769 or (have_existing_dest_subdir)) |
| 1770 |
| 1771 def _ShouldTreatDstUriAsSingleton(self, have_multiple_srcs, |
| 1772 have_existing_dest_subdir, dst_uri): |
| 1773 """ |
| 1774 Checks that dst_uri names a singleton (file or object) after |
| 1775 dir/wildcard expansion. The decision is more nuanced than simply |
| 1776     dst_uri.names_singleton() because of the possibility that an object path
| 1777 might name a bucket sub-directory. |
| 1778 |
| 1779 Args: |
| 1780 have_multiple_srcs: Bool indicator of whether this is a multi-source |
| 1781 operation. |
| 1782 have_existing_dest_subdir: bool indicator whether dest is an existing |
| 1783 subdirectory. |
| 1784 dst_uri: StorageUri to check. |
| 1785 |
| 1786 Returns: |
| 1787 bool indicator. |
| 1788 """ |
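|          # Illustrative examples (hypothetical URIs, not from the original source):
|          # `gsutil cp f1 f2 gs://bucket/obj` does not treat gs://bucket/obj as a
|          # singleton (multiple sources, cloud destination), while
|          # `gsutil cp f1 gs://bucket/obj` does, provided gs://bucket/obj is not an
|          # existing "sub-directory".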
| 1789 if have_multiple_srcs: |
| 1790 # Only a file meets the criteria in this case. |
| 1791 return dst_uri.names_file() |
| 1792 return not have_existing_dest_subdir and dst_uri.names_singleton() |
| 1793 |
| 1794 def _IsNoClobberServerException(self, e): |
| 1795 """ |
| 1796     Checks whether a failed copy was rejected by the server (412 Precondition
| 1797     Failed) because we asked that an existing destination not be clobbered.
| 1798 
| 1799     Args:
| 1800       e: The exception raised by the failed copy operation.
| 1801 
| 1802     Returns:
| 1803       bool indicator - True means the server rejected the copy because it would
| 1804       have clobbered an existing object.
| 1805 """ |
| 1806 return self.no_clobber and ( |
| 1807 (isinstance(e, GSResponseError) and e.status==412) or |
| 1808 (isinstance(e, ResumableUploadException) and 'code 412' in e.message)) |
| 1809 |
| 1810 def _GetPathBeforeFinalDir(uri): |
| 1811 """ |
| 1812 Returns the part of the path before the final directory component for the |
| 1813   given URI, handling cases for file system directories, buckets, and bucket
| 1814   subdirectories. Example: for gs://bucket/dir/ we'll return 'gs://bucket',
| 1815   and for file://dir we'll return 'file://'.
| 1816 |
| 1817 Args: |
| 1818 uri: StorageUri. |
| 1819 |
| 1820 Returns: |
| 1821 String name of above-described path, sans final path separator. |
| 1822 """ |
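|        # Illustrative mapping of the branches below (hypothetical URIs, not from
|        # the original source):
|        #   file://a/b/dir        -> 'file://a/b'       (local directory)
|        #   gs://bucket           -> 'gs://'            (bucket)
|        #   gs://bucket/dir1/dir2 -> 'gs://bucket/dir1' (bucket sub-directory)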
| 1823 sep = uri.delim |
| 1824 assert not uri.names_file() |
| 1825 if uri.names_directory(): |
| 1826 past_scheme = uri.uri[len('file://'):] |
| 1827 if past_scheme.find(sep) == -1: |
| 1828 return 'file://' |
| 1829 else: |
| 1830 return 'file://%s' % past_scheme.rstrip(sep).rpartition(sep)[0] |
| 1831 if uri.names_bucket(): |
| 1832 return '%s://' % uri.scheme |
| 1833 # Else it names a bucket subdir. |
| 1834 return uri.uri.rstrip(sep).rpartition(sep)[0] |
| 1835 |
| 1836 def _hash_filename(filename): |
| 1837 """ |
| 1838 Apply a hash function (SHA1) to shorten the passed file name. The spec |
| 1839 for the hashed file name is as follows: |
| 1840 |
| 1841       TRACKER_<hash>.<trailing>
| 1842 |
| 1843 where hash is a SHA1 hash on the original file name and trailing is |
| 1844 the last 16 chars from the original file name. Max file name lengths |
| 1845 vary by operating system so the goal of this function is to ensure |
| 1846 the hashed version takes fewer than 100 characters. |
| 1847 |
| 1848 Args: |
| 1849 filename: file name to be hashed. |
| 1850 |
| 1851 Returns: |
| 1852 shorter, hashed version of passed file name |
| 1853 """ |
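|        # Illustrative example (hypothetical file name, not from the original source):
|        #   _hash_filename('/tmp/some_download.tracker')
|        # returns 'TRACKER_' + <40-char SHA1 hex digest> + '.' + 'download.tracker',
|        # i.e. 8 + 40 + 1 + 16 = 65 characters, well under the 100-character goal.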
| 1854 m = hashlib.sha1(filename.encode('utf-8')) |
| 1855 hashed_name = ("TRACKER_" + m.hexdigest() + '.' + filename[-16:]) |
| 1856 return hashed_name |