OLD | NEW |
(Empty) | |
| 1 # -*- coding: utf-8 -*- |
| 2 # Copyright 2011 Google Inc. All Rights Reserved. |
| 3 # Copyright 2011, Nexenta Systems Inc. |
| 4 # |
| 5 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 # you may not use this file except in compliance with the License. |
| 7 # You may obtain a copy of the License at |
| 8 # |
| 9 # http://www.apache.org/licenses/LICENSE-2.0 |
| 10 # |
| 11 # Unless required by applicable law or agreed to in writing, software |
| 12 # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 # See the License for the specific language governing permissions and |
| 15 # limitations under the License. |
| 16 """Implementation of Unix-like cp command for cloud storage providers.""" |
| 17 |
| 18 from __future__ import absolute_import |
| 19 |
| 20 import os |
| 21 import time |
| 22 import traceback |
| 23 |
| 24 from gslib import copy_helper |
| 25 from gslib.cat_helper import CatHelper |
| 26 from gslib.cloud_api import AccessDeniedException |
| 27 from gslib.cloud_api import NotFoundException |
| 28 from gslib.command import Command |
| 29 from gslib.command_argument import CommandArgument |
| 30 from gslib.commands.compose import MAX_COMPONENT_COUNT |
| 31 from gslib.copy_helper import CreateCopyHelperOpts |
| 32 from gslib.copy_helper import ItemExistsError |
| 33 from gslib.copy_helper import Manifest |
| 34 from gslib.copy_helper import PARALLEL_UPLOAD_TEMP_NAMESPACE |
| 35 from gslib.copy_helper import SkipUnsupportedObjectError |
| 36 from gslib.cs_api_map import ApiSelector |
| 37 from gslib.exception import CommandException |
| 38 from gslib.name_expansion import NameExpansionIterator |
| 39 from gslib.storage_url import ContainsWildcard |
| 40 from gslib.util import CreateLock |
| 41 from gslib.util import GetCloudApiInstance |
| 42 from gslib.util import IsCloudSubdirPlaceholder |
| 43 from gslib.util import MakeHumanReadable |
| 44 from gslib.util import NO_MAX |
| 45 from gslib.util import RemoveCRLFFromString |
| 46 from gslib.util import StdinIterator |
| 47 |
| 48 _SYNOPSIS = """ |
| 49 gsutil cp [OPTION]... src_url dst_url |
| 50 gsutil cp [OPTION]... src_url... dst_url |
| 51 gsutil cp [OPTION]... -I dst_url |
| 52 """ |
| 53 |
| 54 _SYNOPSIS_TEXT = """ |
| 55 <B>SYNOPSIS</B> |
| 56 """ + _SYNOPSIS |
| 57 |
| 58 _DESCRIPTION_TEXT = """ |
| 59 <B>DESCRIPTION</B> |
| 60 The gsutil cp command allows you to copy data between your local file |
| 61 system and the cloud, copy data within the cloud, and copy data between |
| 62 cloud storage providers. For example, to copy all text files from the |
| 63 local directory to a bucket you could do: |
| 64 |
| 65 gsutil cp *.txt gs://my_bucket |
| 66 |
| 67 Similarly, you can download text files from a bucket by doing: |
| 68 |
| 69 gsutil cp gs://my_bucket/*.txt . |
| 70 |
| 71 If you want to copy an entire directory tree you need to use the -r option: |
| 72 |
| 73 gsutil cp -r dir gs://my_bucket |
| 74 |
| 75 If you have a large number of files to upload you might want to use the |
| 76 gsutil -m option, to perform a parallel (multi-threaded/multi-processing) |
| 77 copy: |
| 78 |
| 79 gsutil -m cp -r dir gs://my_bucket |
| 80 |
| 81 You can pass a list of URLs (one per line) to copy on stdin instead of as |
| 82 command line arguments by using the -I option. This allows you to use gsutil |
| 83 in a pipeline to upload or download files / objects as generated by a program, |
| 84 such as: |
| 85 |
| 86 some_program | gsutil -m cp -I gs://my_bucket |
| 87 |
| 88 or: |
| 89 |
| 90 some_program | gsutil -m cp -I ./download_dir |
| 91 |
| 92 The contents of stdin can name files, cloud URLs, and wildcards of files |
| 93 and cloud URLs. |
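| |
| For example, if the URLs to copy are kept in a file (filelist.txt here is just |
| an illustrative name), you could run: |
| |
| cat filelist.txt | gsutil -m cp -I gs://my_bucket |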
| 94 """ |
| 95 |
| 96 _NAME_CONSTRUCTION_TEXT = """ |
| 97 <B>HOW NAMES ARE CONSTRUCTED</B> |
| 98 The gsutil cp command strives to name objects in a way consistent with how |
| 99 Linux cp works, which causes names to be constructed in varying ways depending |
| 100 on whether you're performing a recursive directory copy or copying |
| 101 individually named objects; and whether you're copying to an existing or |
| 102 non-existent directory. |
| 103 |
| 104 When performing recursive directory copies, object names are constructed |
| 105 that mirror the source directory structure starting at the point of |
| 106 recursive processing. For example, the command: |
| 107 |
| 108 gsutil cp -r dir1/dir2 gs://my_bucket |
| 109 |
| 110 will create objects named like gs://my_bucket/dir2/a/b/c, assuming |
| 111 dir1/dir2 contains the file a/b/c. |
| 112 |
| 113 In contrast, copying individually named files will result in objects named |
| 114 by the final path component of the source files. For example, the command: |
| 115 |
| 116 gsutil cp dir1/dir2/** gs://my_bucket |
| 117 |
| 118 will create objects named like gs://my_bucket/c. |
| 119 |
| 120 The same rules apply for downloads: recursive copies of buckets and |
| 121 bucket subdirectories produce a mirrored filename structure, while copying |
| 122 individually (or wildcard) named objects produces flatly named files. |
| 123 |
| 124 Note that in the above example the '**' wildcard matches all names |
| 125 anywhere under dir1/dir2. The wildcard '*' will match names just one level deep. For |
| 126 more details see 'gsutil help wildcards'. |
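| |
| For example, without the -r option the single-level wildcard |
| |
| gsutil cp dir1/dir2/* gs://my_bucket |
| |
| copies just the files directly under dir1/dir2 (skipping its subdirectories) |
| and creates objects named like gs://my_bucket/<filename>. |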
| 127 |
| 128 There's an additional wrinkle when working with subdirectories: the resulting |
| 129 names depend on whether the destination subdirectory exists. For example, |
| 130 if gs://my_bucket/subdir exists as a subdirectory, the command: |
| 131 |
| 132 gsutil cp -r dir1/dir2 gs://my_bucket/subdir |
| 133 |
| 134 will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast, |
| 135 if gs://my_bucket/subdir does not exist, this same gsutil cp command will |
| 136 create objects named like gs://my_bucket/subdir/a/b/c. |
| 137 |
| 138 Note: If you use the |
| 139 `Google Developers Console <https://console.developers.google.com>`_ |
| 140 to create folders, it does so by creating a "placeholder" object that ends |
| 141 with a "/" character. gsutil skips these objects when downloading from the |
| 142 cloud to the local file system, because attempting to create a file that |
| 143 ends with a "/" is not allowed on Linux and MacOS. Because of this, it is |
| 144 recommended that you not create objects that end with "/" (unless you don't |
| 145 need to be able to download such objects using gsutil). |
| 146 """ |
| 147 |
| 148 _SUBDIRECTORIES_TEXT = """ |
| 149 <B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B> |
| 150 You can use gsutil to copy to and from subdirectories by using a command |
| 151 like: |
| 152 |
| 153 gsutil cp -r dir gs://my_bucket/data |
| 154 |
| 155 This will cause dir and all of its files and nested subdirectories to be |
| 156 copied under the specified destination, resulting in objects with names like |
| 157 gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket |
| 158 subdirectories by using a command like: |
| 159 |
| 160 gsutil cp -r gs://my_bucket/data dir |
| 161 |
| 162 This will cause everything nested under gs://my_bucket/data to be downloaded |
| 163 into dir, resulting in files with names like dir/data/a/b/c. |
| 164 |
| 165 Copying subdirectories is useful if you want to add data to an existing |
| 166 bucket directory structure over time. It's also useful if you want |
| 167 to parallelize uploads and downloads across multiple machines (often |
| 168 reducing overall transfer time compared with simply running gsutil -m |
| 169 cp on one machine). For example, if your bucket contains this structure: |
| 170 |
| 171 gs://my_bucket/data/result_set_01/ |
| 172 gs://my_bucket/data/result_set_02/ |
| 173 ... |
| 174 gs://my_bucket/data/result_set_99/ |
| 175 |
| 176 you could perform concurrent downloads across 3 machines by running these |
| 177 commands on each machine, respectively: |
| 178 |
| 179 gsutil -m cp -r gs://my_bucket/data/result_set_[0-3]* dir |
| 180 gsutil -m cp -r gs://my_bucket/data/result_set_[4-6]* dir |
| 181 gsutil -m cp -r gs://my_bucket/data/result_set_[7-9]* dir |
| 182 |
| 183 Note that dir could be a local directory on each machine, or it could |
| 184 be a directory mounted off of a shared file server; whether the latter |
| 185 performs acceptably may depend on a number of things, so we recommend |
| 186 you experiment and find out what works best for you. |
| 187 """ |
| 188 |
| 189 _COPY_IN_CLOUD_TEXT = """ |
| 190 <B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B> |
| 191 If both the source and destination URL are cloud URLs from the same |
| 192 provider, gsutil copies data "in the cloud" (i.e., without downloading |
| 193 to and uploading from the machine where you run gsutil). In addition to |
| 194 the performance and cost advantages of doing this, copying in the cloud |
| 195 preserves metadata (like Content-Type and Cache-Control). In contrast, |
| 196 when you download data from the cloud it ends up in a file, which has |
| 197 no associated metadata. Thus, unless you have some way to hold on to |
| 198 or re-create that metadata, downloading to a file will not retain the |
| 199 metadata. |
| 200 |
| 201 Copies spanning locations and/or storage classes cause data to be rewritten |
| 202 in the cloud, which may take some time. Such operations can be resumed with |
| 203 the same command if they are interrupted, so long as the command parameters |
| 204 are identical. |
| 205 |
| 206 Note that by default, the gsutil cp command does not copy the object |
| 207 ACL to the new object, and instead will use the default bucket ACL (see |
| 208 "gsutil help defacl"). You can override this behavior with the -p |
| 209 option (see OPTIONS below). |
| 210 |
| 211 One additional note about copying in the cloud: If the destination bucket has |
| 212 versioning enabled, gsutil cp will copy all versions of the source object(s). |
| 213 For example: |
| 214 |
| 215 gsutil cp gs://bucket1/obj gs://bucket2 |
| 216 |
| 217 will cause all versions of gs://bucket1/obj to be copied to gs://bucket2. |
| 218 """ |
| 219 |
| 220 _CHECKSUM_VALIDATION_TEXT = """ |
| 221 <B>CHECKSUM VALIDATION</B> |
| 222 At the end of every upload or download the gsutil cp command validates that |
| 223 the checksum it computes for the source file/object matches the checksum |
| 224 the service computes. If the checksums do not match, gsutil will delete the |
| 225 corrupted object and print a warning message. This very rarely happens, but |
| 226 if it does, please contact gs-team@google.com. |
| 227 |
| 228 If you know the MD5 of a file before uploading you can specify it in the |
| 229 Content-MD5 header, which will cause the cloud storage service to reject the |
| 230 upload if the MD5 doesn't match the value computed by the service. For |
| 231 example: |
| 232 |
| 233 % gsutil hash obj |
| 234 Hashing obj: |
| 235 Hashes [base64] for obj: |
| 236 Hash (crc32c): lIMoIw== |
| 237 Hash (md5): VgyllJgiiaRAbyUUIqDMmw== |
| 238 |
| 239 % gsutil -h Content-MD5:VgyllJgiiaRAbyUUIqDMmw== cp obj gs://your-bucket/obj |
| 240 Copying file://obj [Content-Type=text/plain]... |
| 241 Uploading gs://your-bucket/obj: 182 b/182 B |
| 242 |
| 243 If the checksum didn't match, the service would instead reject the upload and |
| 244 gsutil would print a message like: |
| 245 |
| 246 BadRequestException: 400 Provided MD5 hash "VgyllJgiiaRAbyUUIqDMmw==" |
| 247 doesn't match calculated MD5 hash "7gyllJgiiaRAbyUUIqDMmw==". |
| 248 |
| 249 Even if you don't do this gsutil will delete the object if the computed |
| 250 checksum mismatches, but specifying the Content-MD5 header has three |
| 251 advantages: |
| 252 |
| 253 1. It prevents the corrupted object from becoming visible at all, whereas |
| 254 otherwise it would be visible for 1-3 seconds before gsutil deletes it. |
| 255 |
| 256 2. It will definitively prevent the corrupted object from being left in |
| 257 the cloud, whereas the gsutil approach of deleting after the upload |
| 258 completes could fail if (for example) the gsutil process gets ^C'd |
| 259 between upload and deletion request. |
| 260 |
| 261 3. It supports a customer-to-service integrity check handoff. For example, |
| 262 if you have a content production pipeline that generates data to be |
| 263 uploaded to the cloud along with checksums of that data, specifying the |
| 264 MD5 computed by your content pipeline when you run gsutil cp will ensure |
| 265 that the checksums match all the way through the process (e.g., detecting |
| 266 if data gets corrupted on your local disk between the time it was written |
| 267 by your content pipeline and the time it was uploaded to GCS). |
| 268 |
| 269 Note: The Content-MD5 header is ignored for composite objects, because such |
| 270 objects only have a CRC32C checksum. |
| 271 """ |
| 272 |
| 273 _RETRY_HANDLING_TEXT = """ |
| 274 <B>RETRY HANDLING</B> |
| 275 The cp command will retry when failures occur, but if enough failures happen |
| 276 during a particular copy or delete operation the command will skip that object |
| 277 and move on. At the end of the copy run if any failures were not successfully |
| 278 retried, the cp command will report the count of failures, and exit with |
| 279 non-zero status. |
| 280 |
| 281 Note that there are cases where retrying will never succeed, such as if you |
| 282 don't have write permission to the destination bucket or if the destination |
| 283 path for some objects is longer than the maximum allowed length. |
| 284 |
| 285 For more details about gsutil's retry handling, please see |
| 286 "gsutil help retries". |
| 287 """ |
| 288 |
| 289 _RESUMABLE_TRANSFERS_TEXT = """ |
| 290 <B>RESUMABLE TRANSFERS</B> |
| 291 gsutil automatically uses the Google Cloud Storage resumable upload feature |
| 292 whenever you use the cp command to upload an object that is larger than 2 |
| 293 MiB. You do not need to specify any special command line options to make this |
| 294 happen. If your upload is interrupted you can restart the upload by running |
| 295 the same cp command that you ran to start the upload. Until the upload |
| 296 has completed successfully, it will not be visible at the destination object |
| 297 and will not replace any existing object the upload is intended to overwrite. |
| 298 (However, see the section on PARALLEL COMPOSITE UPLOADS, which may leave |
| 299 temporary component objects in place during the upload process.) |
| 300 |
| 301 Similarly, gsutil automatically performs resumable downloads (using HTTP |
| 302 standard Range GET operations) whenever you use the cp command, unless the |
| 303 destination is a stream or null. In this case the partially downloaded file |
| 304 will be visible as soon as it starts being written. Thus, before you attempt |
| 305 to use any files downloaded by gsutil you should make sure the download |
| 306 completed successfully, by checking the exit status from the gsutil command. |
| 307 This can be done in a bash script, for example, by doing: |
| 308 |
| 309 gsutil cp gs://your-bucket/your-object ./local-file |
| 310 if [ "$?" -ne "0" ] ; then |
| 311 << Code that handles failures >> |
| 312 fi |
| 313 |
| 314 Resumable uploads and downloads store some state information in a file |
| 315 in ~/.gsutil named by the destination object or file. If you attempt to |
| 316 resume a transfer from a machine with a different directory, the transfer |
| 317 will start over from scratch. |
| 318 |
| 319 See also "gsutil help prod" for details on using resumable transfers |
| 320 in production. |
| 321 """ |
| 322 |
| 323 _STREAMING_TRANSFERS_TEXT = """ |
| 324 <B>STREAMING TRANSFERS</B> |
| 325 Use '-' in place of src_url or dst_url to perform a streaming |
| 326 transfer. For example: |
| 327 |
| 328 long_running_computation | gsutil cp - gs://my_bucket/obj |
| 329 |
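| Similarly, using '-' as the source URL streams a download to stdout, so you can |
| pipe an object into another program (reusing the some_program placeholder from |
| above): |
| |
| gsutil cp gs://my_bucket/obj - | some_program |
| |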
| 330 Streaming uploads using the JSON API (see "gsutil help apis") are buffered in |
| 331 memory and can retry in the event of network flakiness or service errors. |
| 332 |
| 333 Streaming transfers (other than uploads using the JSON API) do not support |
| 334 resumable uploads/downloads. If you have a large amount of data to upload |
| 335 (say, more than 100 MiB) it is recommended to write the data to a local file |
| 336 and then copy that file to the cloud rather than streaming it (and similarly |
| 337 for large downloads). |
| 338 |
| 339 WARNING: When performing streaming transfers gsutil does not compute a |
| 340 checksum of the uploaded or downloaded data. Therefore, we recommend that |
| 341 users either perform their own validation of the data or use non-streaming |
| 342 transfers (which perform integrity checking automatically). |
| 343 """ |
| 344 |
| 345 _PARALLEL_COMPOSITE_UPLOADS_TEXT = """ |
| 346 <B>PARALLEL COMPOSITE UPLOADS</B> |
| 347 gsutil can automatically use |
| 348 `object composition <https://developers.google.com/storage/docs/composite-objects>`_ |
| 349 to perform uploads in parallel for large, local files being uploaded to Google |
| 350 Cloud Storage. This means that, if enabled (see next paragraph), a large file |
| 351 will be split into component pieces that will be uploaded in parallel. Those |
| 352 components will then be composed in the cloud, and the temporary components in |
| 353 the cloud will be deleted after successful composition. No additional local |
| 354 disk space is required for this operation. |
| 355 |
| 356 Using parallel composite uploads presents a tradeoff between upload |
| 357 performance and download configuration: If you enable parallel composite |
| 358 uploads your uploads will run faster, but someone will need to install a |
| 359 compiled crcmod (see "gsutil help crcmod") on every machine where objects are |
| 360 downloaded by gsutil or other Python applications. For some distributions this |
| 361 is easy (e.g., it comes pre-installed on MacOS), but in some cases users have |
| 362 found it difficult. Because of this at present parallel composite uploads are |
| 363 disabled by default. Google is actively working with a number of the Linux |
| 364 distributions to get crcmod included with the stock distribution. Once that is |
| 365 done we will re-enable parallel composite uploads by default in gsutil. |
| 366 |
| 367 To try parallel composite uploads you can run the command: |
| 368 |
| 369 gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp bigfile gs://your-bucket |
| 370 |
| 371 where bigfile is larger than 150 MiB. When you do this notice that the upload |
| 372 progress indicator continuously updates for several different uploads at once |
| 373 (corresponding to each of the sections of the file being uploaded in |
| 374 parallel), until the parallel upload completes. If you then want to enable |
| 375 parallel composite uploads for all of your future uploads (notwithstanding the |
| 376 caveats mentioned earlier), you can uncomment and set the |
| 377 "parallel_composite_upload_threshold" config value in your .boto configuration |
| 378 file to this value. |
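| |
| For example, the corresponding .boto entry would look something like the |
| following (150M is just the threshold used above; choose a value that suits |
| your workload): |
| |
| [GSUtil] |
| parallel_composite_upload_threshold = 150M |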
| 379 |
| 380 Note that the crcmod problem only impacts downloads via Python applications |
| 381 (such as gsutil). If any users who need to download the data using gsutil or |
| 382 other Python applications can install crcmod, it makes sense to enable |
| 383 parallel composite uploads (see above). For example, if you use gsutil to |
| 384 upload video assets and those assets will only ever be served via a Java |
| 385 application (there are efficient crc32c implementations available in Java), it |
| 386 would make sense to enable parallel composite uploads on your machine. |
| 387 |
| 388 If a parallel composite upload fails prior to composition, re-running the |
| 389 gsutil command will take advantage of resumable uploads for those components |
| 390 that failed, and the component objects will be deleted after the first |
| 391 successful attempt. Any temporary objects that were uploaded successfully |
| 392 before gsutil failed will still exist until the upload is completed |
| 393 successfully. The temporary objects will be named in the following fashion: |
| 394 |
| 395 <random ID>%s<hash> |
| 396 |
| 397 where <random ID> is some numerical value, and <hash> is an MD5 hash (not |
| 398 related to the hash of the contents of the file or object). |
| 399 |
| 400 To avoid leaving temporary objects around, you should make sure to check the |
| 401 exit status from the gsutil command. This can be done in a bash script, for |
| 402 example, by doing: |
| 403 |
| 404 gsutil cp ./local-file gs://your-bucket/your-object |
| 405 if [ "$?" -ne "0" ] ; then |
| 406 << Code that handles failures >> |
| 407 fi |
| 408 |
| 409 Or, for copying a directory, use this instead: |
| 410 |
| 411 gsutil cp -c -L cp.log -r ./dir gs://bucket |
| 412 if [ "$?" -ne "0" ] ; then |
| 413 << Code that handles failures >> |
| 414 fi |
| 415 |
| 416 One important caveat is that files uploaded in this fashion are still subject |
| 417 to the maximum number of components limit. For example, if you upload a large |
| 418 file that gets split into %d components, and try to compose it with another |
| 419 object with %d components, the operation will fail because it exceeds the %d |
| 420 component limit. If you wish to compose an object later and the component |
| 421 limit is a concern, it is recommended that you disable parallel composite |
| 422 uploads for that transfer. |
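| |
| For example, you could disable the feature for just that one transfer by |
| overriding the threshold on the command line: |
| |
| gsutil -o GSUtil:parallel_composite_upload_threshold=0 cp bigfile gs://your-bucket |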
| 423 |
| 424 Also note that an object uploaded using this feature will have a CRC32C hash, |
| 425 but it will not have an MD5 hash (and because of that, requires users who |
| 426 download the object to have crcmod installed, as noted earlier). For details |
| 427 see 'gsutil help crc32c'. |
| 428 |
| 429 Note that this feature can be completely disabled by setting the |
| 430 "parallel_composite_upload_threshold" variable in the .boto config file to 0. |
| 431 """ % (PARALLEL_UPLOAD_TEMP_NAMESPACE, 10, MAX_COMPONENT_COUNT - 9, |
| 432 MAX_COMPONENT_COUNT) |
| 433 |
| 434 |
| 435 _CHANGING_TEMP_DIRECTORIES_TEXT = """ |
| 436 <B>CHANGING TEMP DIRECTORIES</B> |
| 437 gsutil writes data to a temporary directory in several cases: |
| 438 |
| 439 - when compressing data to be uploaded (see the -z option) |
| 440 - when decompressing data being downloaded (when the data has |
| 441 Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z) |
| 442 - when running integration tests (using the gsutil test command) |
| 443 |
| 444 In these cases it's possible the temp file location on your system that |
| 445 gsutil selects by default may not have enough space. If you find that |
| 446 gsutil runs out of space during one of these operations (e.g., raising |
| 447 "CommandException: Inadequate temp space available to compress <your file>" |
| 448 during a gsutil cp -z operation), you can change where it writes these |
| 449 temp files by setting the TMPDIR environment variable. On Linux and MacOS |
| 450 you can do this either by running gsutil this way: |
| 451 |
| 452 TMPDIR=/some/directory gsutil cp ... |
| 453 |
| 454 or by adding this line to your ~/.bashrc file and then restarting the shell |
| 455 before running gsutil: |
| 456 |
| 457 export TMPDIR=/some/directory |
| 458 |
| 459 On Windows 7 you can change the TMPDIR environment variable from Start -> |
| 460 Computer -> System -> Advanced System Settings -> Environment Variables. |
| 461 You need to reboot after making this change for it to take effect. (Rebooting |
| 462 is not necessary after running the export command on Linux and MacOS.) |
| 463 """ |
| 464 |
| 465 _OPTIONS_TEXT = """ |
| 466 <B>OPTIONS</B> |
| 467 -a canned_acl Sets the named canned_acl when uploaded objects are created. See |
| 468 'gsutil help acls' for further details. |
| 469 |
| 470 -c If an error occurs, continue to attempt to copy the remaining |
| 471 files. If any copies were unsuccessful, gsutil's exit status |
| 472 will be non-zero even if this flag is set. This option is |
| 473 implicitly set when running "gsutil -m cp...". Note: -c only |
| 474 applies to the actual copying operation. If an error occurs |
| 475 while iterating over the files in the local directory (e.g., |
| 476 invalid Unicode file name) gsutil will print an error message |
| 477 and abort. |
| 478 |
| 479 -D Copy in "daisy chain" mode, i.e., copying between two buckets |
| 480 by hooking a download to an upload, via the machine where |
| 481 gsutil is run. By default, data are copied between two buckets |
| 482 "in the cloud", i.e., without needing to copy via the machine |
| 483 where gsutil runs. |
| 484 |
| 485 By default, a "copy in the cloud" when the source is a |
| 486 composite object will retain the composite nature of the |
| 487 object. However, Daisy chain mode can be used to change a |
| 488 composite object into a non-composite object. For example: |
| 489 |
| 490 gsutil cp -D -p gs://bucket/obj gs://bucket/obj_tmp |
| 491 gsutil mv -p gs://bucket/obj_tmp gs://bucket/obj |
| 492 |
| 493 Note: Daisy chain mode is automatically used when copying |
| 494 between providers (e.g., to copy data from Google Cloud Storage |
| 495 to another provider). |
| 496 |
| 497 -e Exclude symlinks. When specified, symbolic links will not be |
| 498 copied. |
| 499 |
| 500 -I Causes gsutil to read the list of files or objects to copy from |
| 501 stdin. This allows you to run a program that generates the list |
| 502 of files to upload/download. |
| 503 |
| 504 -L <file> Outputs a manifest log file with detailed information about |
| 505 each item that was copied. This manifest contains the following |
| 506 information for each item: |
| 507 |
| 508 - Source path. |
| 509 - Destination path. |
| 510 - Source size. |
| 511 - Bytes transferred. |
| 512 - MD5 hash. |
| 513 - UTC date and time transfer was started in ISO 8601 format. |
| 514 - UTC date and time transfer was completed in ISO 8601 format. |
| 515 - Upload id, if a resumable upload was performed. |
| 516 - Final result of the attempted transfer, success or failure. |
| 517 - Failure details, if any. |
| 518 |
| 519 If the log file already exists, gsutil will use the file as an |
| 520 input to the copy process, and will also append log items to |
| 521 the existing file. Files/objects that are marked in the |
| 522 existing log file as having been successfully copied (or |
| 523 skipped) will be ignored. Files/objects without entries will be |
| 524 copied and ones previously marked as unsuccessful will be |
| 525 retried. This can be used in conjunction with the -c option to |
| 526 build a script that copies a large number of objects reliably, |
| 527 using a bash script like the following: |
| 528 |
| 529 until gsutil cp -c -L cp.log -r ./dir gs://bucket; do |
| 530 sleep 1 |
| 531 done |
| 532 |
| 533 The -c option will cause copying to continue after failures |
| 534 occur, and the -L option will allow gsutil to pick up where it |
| 535 left off without duplicating work. The loop will continue |
| 536 running as long as gsutil exits with a non-zero status (such a |
| 537 status indicates there was at least one failure during the |
| 538 gsutil run). |
| 539 |
| 540 Note: If you're trying to synchronize the contents of a |
| 541 directory and a bucket (or two buckets), see |
| 542 'gsutil help rsync'. |
| 543 |
| 544 -n No-clobber. When specified, existing files or objects at the |
| 545 destination will not be overwritten. Any items that are skipped |
| 546 by this option will be reported as being skipped. This option |
| 547 will perform an additional GET request to check if an item |
| 548 exists before attempting to upload the data. This will save |
| 549 retransmitting data, but the additional HTTP requests may make |
| 550 small object transfers slower and more expensive. |
| 551 |
| 552 -p Causes ACLs to be preserved when copying in the cloud. Note |
| 553 that this option has performance and cost implications when |
| 554 using the XML API, as it requires separate HTTP calls for |
| 555 interacting with ACLs. The performance issue can be mitigated |
| 556 to some degree by using gsutil -m cp to cause parallel copying. |
| 557 Also, this option only works if you have OWNER access to all of |
| 558 the objects that are copied. |
| 559 |
| 560 You can avoid the additional performance and cost of using |
| 561 cp -p if you want all objects in the destination bucket to end |
| 562 up with the same ACL by setting a default object ACL on that |
| 563 bucket instead of using cp -p. See "help gsutil defacl". |
| 564 |
| 565 Note that it's not valid to specify both the -a and -p options |
| 566 together. |
| 567 |
| 568 -R, -r Causes directories, buckets, and bucket subdirectories to be |
| 569 copied recursively. If you neglect to use this option for |
| 570 an upload, gsutil will copy any files it finds and skip any |
| 571 directories. Similarly, neglecting to specify -r for a download |
| 572 will cause gsutil to copy any objects at the current bucket |
| 573 directory level, and skip any subdirectories. |
| 574 |
| 575 -U Skip objects with unsupported object types instead of failing. |
| 576 Unsupported object types are Amazon S3 Glacier objects. |
| 577 |
| 578 -v Requests that the version-specific URL for each uploaded object |
| 579 be printed. Given this URL you can make future upload requests |
| 580 that are safe in the face of concurrent updates, because Google |
| 581 Cloud Storage will refuse to perform the update if the current |
| 582 object version doesn't match the version-specific URL. See |
| 583 'gsutil help versions' for more details. |
| 584 |
| 585 -z <ext,...> Applies gzip content-encoding to file uploads with the given |
| 586 extensions. This is useful when uploading files with |
| 587 compressible content (such as .js, .css, or .html files) |
| 588 because it saves network bandwidth and space in Google Cloud |
| 589 Storage, which in turn reduces storage costs. |
| 590 |
| 591 When you specify the -z option, the data from your files is |
| 592 compressed before it is uploaded, but your actual files are |
| 593 left uncompressed on the local disk. The uploaded objects |
| 594 retain the Content-Type and name of the original files but are |
| 595 given a Content-Encoding header with the value "gzip" to |
| 596 indicate that the object data stored are compressed on the |
| 597 Google Cloud Storage servers. |
| 598 |
| 599 For example, the following command: |
| 600 |
| 601 gsutil cp -z html -a public-read cattypes.html gs://mycats |
| 602 |
| 603 will do all of the following: |
| 604 |
| 605 - Upload as the object gs://mycats/cattypes.html (cp command) |
| 606 - Set the Content-Type to text/html (based on file extension) |
| 607 - Compress the data in the file cattypes.html (-z option) |
| 608 - Set the Content-Encoding to gzip (-z option) |
| 609 - Set the ACL to public-read (-a option) |
| 610 - If a user tries to view cattypes.html in a browser, the |
| 611 browser will know to uncompress the data based on the |
| 612 Content-Encoding header, and to render it as HTML based on |
| 613 the Content-Type header. |
| 614 |
| 615 Note that if you download an object with Content-Encoding:gzip |
| 616 gsutil will decompress the content before writing the local |
| 617 file. |
| 618 """ |
| 619 |
| 620 _DETAILED_HELP_TEXT = '\n\n'.join([_SYNOPSIS_TEXT, |
| 621 _DESCRIPTION_TEXT, |
| 622 _NAME_CONSTRUCTION_TEXT, |
| 623 _SUBDIRECTORIES_TEXT, |
| 624 _COPY_IN_CLOUD_TEXT, |
| 625 _CHECKSUM_VALIDATION_TEXT, |
| 626 _RETRY_HANDLING_TEXT, |
| 627 _RESUMABLE_TRANSFERS_TEXT, |
| 628 _STREAMING_TRANSFERS_TEXT, |
| 629 _PARALLEL_COMPOSITE_UPLOADS_TEXT, |
| 630 _CHANGING_TEMP_DIRECTORIES_TEXT, |
| 631 _OPTIONS_TEXT]) |
| 632 |
| 633 |
| 634 CP_SUB_ARGS = 'a:cDeIL:MNnprRtUvz:' |
| 635 |
| 636 |
| 637 def _CopyFuncWrapper(cls, args, thread_state=None): |
| 638 cls.CopyFunc(args, thread_state=thread_state) |
| 639 |
| 640 |
| 641 def _CopyExceptionHandler(cls, e): |
| 642 """Simple exception handler to allow post-completion status.""" |
| 643 cls.logger.error(str(e)) |
| 644 cls.op_failure_count += 1 |
| 645 cls.logger.debug('\n\nEncountered exception while copying:\n%s\n', |
| 646 traceback.format_exc()) |
| 647 |
| 648 |
| 649 def _RmExceptionHandler(cls, e): |
| 650 """Simple exception handler to allow post-completion status.""" |
| 651 cls.logger.error(str(e)) |
| 652 |
| 653 |
| 654 class CpCommand(Command): |
| 655 """Implementation of gsutil cp command. |
| 656 |
| 657 Note that CpCommand is run for both gsutil cp and gsutil mv. The latter |
| 658 happens by MvCommand calling CpCommand and passing the hidden (undocumented) |
| 659 -M option. This allows the copy and remove needed for each mv to run |
| 660 together (rather than first running all the cp's and then all the rm's, as |
| 661 we originally had implemented), which in turn avoids the following problem |
| 662 with removing the wrong objects: starting with a bucket containing only |
| 663 the object gs://bucket/obj, say the user does: |
| 664 gsutil mv gs://bucket/* gs://bucket/d.txt |
| 665 If we ran all the cp's and then all the rm's and we didn't expand the wildcard |
| 666 first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt, |
| 667 and the rm command would then remove that object. In the implementation |
| 668 prior to gsutil release 3.12 we avoided this by building a list of objects |
| 669 to process and then running the copies and then the removes; but building |
| 670 the list up front limits scalability (compared with the current approach |
| 671 of processing the bucket listing iterator on the fly). |
| 672 """ |
| 673 |
| 674 # Command specification. See base class for documentation. |
| 675 command_spec = Command.CreateCommandSpec( |
| 676 'cp', |
| 677 command_name_aliases=['copy'], |
| 678 usage_synopsis=_SYNOPSIS, |
| 679 min_args=1, |
| 680 max_args=NO_MAX, |
| 681 # -t is deprecated but leave intact for now to avoid breakage. |
| 682 supported_sub_args=CP_SUB_ARGS, |
| 683 file_url_ok=True, |
| 684 provider_url_ok=False, |
| 685 urls_start_arg=0, |
| 686 gs_api_support=[ApiSelector.XML, ApiSelector.JSON], |
| 687 gs_default_api=ApiSelector.JSON, |
| 688 supported_private_args=['testcallbackfile='], |
| 689 argparse_arguments=[ |
| 690 CommandArgument.MakeZeroOrMoreCloudOrFileURLsArgument() |
| 691 ] |
| 692 ) |
| 693 # Help specification. See help_provider.py for documentation. |
| 694 help_spec = Command.HelpSpec( |
| 695 help_name='cp', |
| 696 help_name_aliases=['copy'], |
| 697 help_type='command_help', |
| 698 help_one_line_summary='Copy files and objects', |
| 699 help_text=_DETAILED_HELP_TEXT, |
| 700 subcommand_help_text={}, |
| 701 ) |
| 702 |
| 703 # pylint: disable=too-many-statements |
| 704 def CopyFunc(self, name_expansion_result, thread_state=None): |
| 705 """Worker function for performing the actual copy (and rm, for mv).""" |
| 706 gsutil_api = GetCloudApiInstance(self, thread_state=thread_state) |
| 707 |
| 708 copy_helper_opts = copy_helper.GetCopyHelperOpts() |
| 709 if copy_helper_opts.perform_mv: |
| 710 cmd_name = 'mv' |
| 711 else: |
| 712 cmd_name = self.command_name |
| 713 src_url = name_expansion_result.source_storage_url |
| 714 exp_src_url = name_expansion_result.expanded_storage_url |
| 715 src_url_names_container = name_expansion_result.names_container |
| 716 have_multiple_srcs = name_expansion_result.is_multi_source_request |
| 717 |
| 718 if src_url.IsCloudUrl() and src_url.IsProvider(): |
| 719 raise CommandException( |
| 720 'The %s command does not allow provider-only source URLs (%s)' % |
| 721 (cmd_name, src_url)) |
| 722 if have_multiple_srcs: |
| 723 copy_helper.InsistDstUrlNamesContainer( |
| 724 self.exp_dst_url, self.have_existing_dst_container, cmd_name) |
| 725 |
| 726 # Various GUI tools (like the GCS web console) create placeholder objects |
| 727 # ending with '/' when the user creates an empty directory. Normally these |
| 728 # tools should delete those placeholders once objects have been written |
| 729 # "under" the directory, but sometimes the placeholders are left around. We |
| 730 # need to filter them out here, otherwise if the user tries to rsync from |
| 731 # GCS to a local directory it will result in a directory/file conflict |
| 732 # (e.g., trying to download an object called "mydata/" where the local |
| 733 # directory "mydata" exists). |
| 734 if IsCloudSubdirPlaceholder(exp_src_url): |
| 735 self.logger.info('Skipping cloud sub-directory placeholder object (%s) ' |
| 736 'because such objects aren\'t needed in (and would ' |
| 737 'interfere with) directories in the local file system', |
| 738 exp_src_url) |
| 739 return |
| 740 |
| 741 if copy_helper_opts.use_manifest and self.manifest.WasSuccessful( |
| 742 exp_src_url.url_string): |
| 743 return |
| 744 |
| 745 if copy_helper_opts.perform_mv: |
| 746 if name_expansion_result.names_container: |
| 747 # Use recursion_requested when performing name expansion for the |
| 748 # directory mv case so we can determine if any of the source URLs are |
| 749 # directories (and then use cp -r and rm -r to perform the move, to |
| 750 # match the behavior of Linux mv (which when moving a directory moves |
| 751 all the contained files)). |
| 752 self.recursion_requested = True |
| 753 # Disallow wildcard src URLs when moving directories, as supporting it |
| 754 # would make the name transformation too complex and would also be |
| 755 # dangerous (e.g., someone could accidentally move many objects to the |
| 756 # wrong name, or accidentally overwrite many objects). |
| 757 if ContainsWildcard(src_url.url_string): |
| 758 raise CommandException('The mv command disallows naming source ' |
| 759 'directories using wildcards') |
| 760 |
| 761 if (self.exp_dst_url.IsFileUrl() |
| 762 and not os.path.exists(self.exp_dst_url.object_name) |
| 763 and have_multiple_srcs): |
| 764 os.makedirs(self.exp_dst_url.object_name) |
| 765 |
| 766 dst_url = copy_helper.ConstructDstUrl( |
| 767 src_url, exp_src_url, src_url_names_container, have_multiple_srcs, |
| 768 self.exp_dst_url, self.have_existing_dst_container, |
| 769 self.recursion_requested) |
| 770 dst_url = copy_helper.FixWindowsNaming(src_url, dst_url) |
| 771 |
| 772 copy_helper.CheckForDirFileConflict(exp_src_url, dst_url) |
| 773 if copy_helper.SrcDstSame(exp_src_url, dst_url): |
| 774 raise CommandException('%s: "%s" and "%s" are the same file - ' |
| 775 'abort.' % (cmd_name, exp_src_url, dst_url)) |
| 776 |
| 777 if dst_url.IsCloudUrl() and dst_url.HasGeneration(): |
| 778 raise CommandException('%s: a version-specific URL\n(%s)\ncannot be ' |
| 779 'the destination for gsutil cp - abort.' |
| 780 % (cmd_name, dst_url)) |
| 781 |
| 782 elapsed_time = bytes_transferred = 0 |
| 783 try: |
| 784 if copy_helper_opts.use_manifest: |
| 785 self.manifest.Initialize( |
| 786 exp_src_url.url_string, dst_url.url_string) |
| 787 (elapsed_time, bytes_transferred, result_url, md5) = ( |
| 788 copy_helper.PerformCopy( |
| 789 self.logger, exp_src_url, dst_url, gsutil_api, |
| 790 self, _CopyExceptionHandler, allow_splitting=True, |
| 791 headers=self.headers, manifest=self.manifest, |
| 792 gzip_exts=self.gzip_exts, test_method=self.test_method)) |
| 793 if copy_helper_opts.use_manifest: |
| 794 if md5: |
| 795 self.manifest.Set(exp_src_url.url_string, 'md5', md5) |
| 796 self.manifest.SetResult( |
| 797 exp_src_url.url_string, bytes_transferred, 'OK') |
| 798 if copy_helper_opts.print_ver: |
| 799 # Some cases don't return a version-specific URL (e.g., if destination |
| 800 # is a file). |
| 801 self.logger.info('Created: %s', result_url) |
| 802 except ItemExistsError: |
| 803 message = 'Skipping existing item: %s' % dst_url |
| 804 self.logger.info(message) |
| 805 if copy_helper_opts.use_manifest: |
| 806 self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message) |
| 807 except SkipUnsupportedObjectError, e: |
| 808 message = ('Skipping item %s with unsupported object type %s' % |
| 809 (exp_src_url.url_string, e.unsupported_type)) |
| 810 self.logger.info(message) |
| 811 if copy_helper_opts.use_manifest: |
| 812 self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message) |
| 813 except copy_helper.FileConcurrencySkipError, e: |
| 814 self.logger.warn('Skipping copy of source URL %s because destination URL ' |
| 815 '%s is already being copied by another gsutil process ' |
| 816 'or thread (did you specify the same source URL twice?) ' |
| 817 % (src_url, dst_url)) |
| 818 except Exception, e: |
| 819 if (copy_helper_opts.no_clobber and |
| 820 copy_helper.IsNoClobberServerException(e)): |
| 821 message = 'Rejected (noclobber): %s' % dst_url |
| 822 self.logger.info(message) |
| 823 if copy_helper_opts.use_manifest: |
| 824 self.manifest.SetResult( |
| 825 exp_src_url.url_string, 0, 'skip', message) |
| 826 elif self.continue_on_error: |
| 827 message = 'Error copying %s: %s' % (src_url, str(e)) |
| 828 self.op_failure_count += 1 |
| 829 self.logger.error(message) |
| 830 if copy_helper_opts.use_manifest: |
| 831 self.manifest.SetResult( |
| 832 exp_src_url.url_string, 0, 'error', |
| 833 RemoveCRLFFromString(message)) |
| 834 else: |
| 835 if copy_helper_opts.use_manifest: |
| 836 self.manifest.SetResult( |
| 837 exp_src_url.url_string, 0, 'error', str(e)) |
| 838 raise |
| 839 else: |
| 840 if copy_helper_opts.perform_mv: |
| 841 self.logger.info('Removing %s...', exp_src_url) |
| 842 if exp_src_url.IsCloudUrl(): |
| 843 gsutil_api.DeleteObject(exp_src_url.bucket_name, |
| 844 exp_src_url.object_name, |
| 845 generation=exp_src_url.generation, |
| 846 provider=exp_src_url.scheme) |
| 847 else: |
| 848 os.unlink(exp_src_url.object_name) |
| 849 |
| 850 with self.stats_lock: |
| 851 self.total_elapsed_time += elapsed_time |
| 852 self.total_bytes_transferred += bytes_transferred |
| 853 |
| 854 # Command entry point. |
| 855 def RunCommand(self): |
| 856 copy_helper_opts = self._ParseOpts() |
| 857 |
| 858 self.total_elapsed_time = self.total_bytes_transferred = 0 |
| 859 if self.args[-1] == '-' or self.args[-1] == 'file://-': |
| 860 return CatHelper(self).CatUrlStrings(self.args[:-1]) |
| 861 |
| 862 if copy_helper_opts.read_args_from_stdin: |
| 863 if len(self.args) != 1: |
| 864 raise CommandException('Source URLs cannot be specified with -I option') |
| 865 url_strs = StdinIterator() |
| 866 else: |
| 867 if len(self.args) < 2: |
| 868 raise CommandException('Wrong number of arguments for "cp" command.') |
| 869 url_strs = self.args[:-1] |
| 870 |
| 871 (self.exp_dst_url, self.have_existing_dst_container) = ( |
| 872 copy_helper.ExpandUrlToSingleBlr(self.args[-1], self.gsutil_api, |
| 873 self.debug, self.project_id)) |
| 874 |
| 875 # If the destination bucket has versioning enabled iterate with |
| 876 # all_versions=True. That way we'll copy all versions if the source bucket |
| 877 # is versioned; and by leaving all_versions=False if the destination bucket |
| 878 # has versioning disabled we will avoid copying old versions all to the same |
| 879 # un-versioned destination object. |
| 880 all_versions = False |
| 881 try: |
| 882 bucket = self._GetBucketWithVersioningConfig(self.exp_dst_url) |
| 883 if bucket and bucket.versioning and bucket.versioning.enabled: |
| 884 all_versions = True |
| 885 except AccessDeniedException: |
| 886 # This happens (in the XML API only) if the user doesn't have OWNER access |
| 887 # on the bucket (needed to check if versioning is enabled). In this case |
| 888 # fall back to copying all versions (which can be inefficient for the |
| 889 # reason noted in the comment above). We don't try to warn the user |
| 890 # because that would result in false positive warnings (since we can't |
| 891 # check if versioning is enabled on the destination bucket). |
| 892 # |
| 893 # For JSON, we will silently not return versioning if we don't have |
| 894 # access. |
| 895 all_versions = True |
| 896 |
| 897 name_expansion_iterator = NameExpansionIterator( |
| 898 self.command_name, self.debug, |
| 899 self.logger, self.gsutil_api, url_strs, |
| 900 self.recursion_requested or copy_helper_opts.perform_mv, |
| 901 project_id=self.project_id, all_versions=all_versions, |
| 902 continue_on_error=self.continue_on_error or self.parallel_operations) |
| 903 |
| 904 # Use a lock to ensure accurate statistics in the face of |
| 905 # multi-threading/multi-processing. |
| 906 self.stats_lock = CreateLock() |
| 907 |
| 908 # Tracks if any copies failed. |
| 909 self.op_failure_count = 0 |
| 910 |
| 911 # Start the clock. |
| 912 start_time = time.time() |
| 913 |
| 914 # Tuple of attributes to share/manage across multiple processes in |
| 915 # parallel (-m) mode. |
| 916 shared_attrs = ('op_failure_count', 'total_bytes_transferred') |
| 917 |
| 918 # Perform copy requests in parallel (-m) mode, if requested, using |
| 919 # configured number of parallel processes and threads. Otherwise, |
| 920 # perform requests with sequential function calls in current process. |
| 921 self.Apply(_CopyFuncWrapper, name_expansion_iterator, |
| 922 _CopyExceptionHandler, shared_attrs, |
| 923 fail_on_error=(not self.continue_on_error)) |
| 924 self.logger.debug( |
| 925 'total_bytes_transferred: %d', self.total_bytes_transferred) |
| 926 |
| 927 end_time = time.time() |
| 928 self.total_elapsed_time = end_time - start_time |
| 929 |
| 930 # Sometimes, particularly when running unit tests, the total elapsed time |
| 931 # is really small. On Windows, the timer resolution is too small and |
| 932 # causes total_elapsed_time to be zero. |
| 933 try: |
| 934 float(self.total_bytes_transferred) / float(self.total_elapsed_time) |
| 935 except ZeroDivisionError: |
| 936 self.total_elapsed_time = 0.01 |
| 937 |
| 938 self.total_bytes_per_second = (float(self.total_bytes_transferred) / |
| 939 float(self.total_elapsed_time)) |
| 940 |
| 941 if self.debug == 3: |
| 942 # Note that this only counts the actual GET and PUT bytes for the copy |
| 943 # - not any transfers for doing wildcard expansion, the initial |
| 944 # HEAD/GET request performed to get the object metadata, etc. |
| 945 if self.total_bytes_transferred != 0: |
| 946 self.logger.info( |
| 947 'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)', |
| 948 self.total_bytes_transferred, self.total_elapsed_time, |
| 949 MakeHumanReadable(self.total_bytes_per_second)) |
| 950 if self.op_failure_count: |
| 951 plural_str = 's' if self.op_failure_count > 1 else '' |
| 952 raise CommandException('%d file%s/object%s could not be transferred.' % ( |
| 953 self.op_failure_count, plural_str, plural_str)) |
| 954 |
| 955 return 0 |
| 956 |
| 957 def _ParseOpts(self): |
| 958 perform_mv = False |
| 959 # exclude_symlinks is handled by Command parent class, so save in Command |
| 960 # state rather than CopyHelperOpts. |
| 961 self.exclude_symlinks = False |
| 962 no_clobber = False |
| 963 # continue_on_error is handled by Command parent class, so save in Command |
| 964 # state rather than CopyHelperOpts. |
| 965 self.continue_on_error = False |
| 966 daisy_chain = False |
| 967 read_args_from_stdin = False |
| 968 print_ver = False |
| 969 use_manifest = False |
| 970 preserve_acl = False |
| 971 canned_acl = None |
| 972 # canned_acl is handled by a helper function in parent |
| 973 # Command class, so save in Command state rather than CopyHelperOpts. |
| 974 self.canned = None |
| 975 |
| 976 self.skip_unsupported_objects = False |
| 977 |
| 978 # Files matching these extensions should be gzipped before uploading. |
| 979 self.gzip_exts = [] |
| 980 |
| 981 test_callback_file = None |
| 982 |
| 983 # self.recursion_requested initialized in command.py (so can be checked |
| 984 # in parent class for all commands). |
| 985 self.manifest = None |
| 986 if self.sub_opts: |
| 987 for o, a in self.sub_opts: |
| 988 if o == '-a': |
| 989 canned_acl = a |
| 990 self.canned = True |
| 991 elif o == '-c': |
| 992 self.continue_on_error = True |
| 993 elif o == '-D': |
| 994 daisy_chain = True |
| 995 elif o == '-e': |
| 996 self.exclude_symlinks = True |
| 997 elif o == '--testcallbackfile': |
| 998 # File path of a pickled class that implements ProgressCallback.call. |
| 999 # Used for testing transfer interruptions and resumes. |
| 1000 test_callback_file = a |
| 1001 elif o == '-I': |
| 1002 read_args_from_stdin = True |
| 1003 elif o == '-L': |
| 1004 use_manifest = True |
| 1005 self.manifest = Manifest(a) |
| 1006 elif o == '-M': |
| 1007 # Note that we signal to the cp command to perform a move (copy |
| 1008 # followed by remove) and use directory-move naming rules by passing |
| 1009 # the undocumented (for internal use) -M option when running the cp |
| 1010 # command from mv.py. |
| 1011 perform_mv = True |
| 1012 elif o == '-n': |
| 1013 no_clobber = True |
| 1014 elif o == '-p': |
| 1015 preserve_acl = True |
| 1016 elif o == '-r' or o == '-R': |
| 1017 self.recursion_requested = True |
| 1018 elif o == '-U': |
| 1019 self.skip_unsupported_objects = True |
| 1020 elif o == '-v': |
| 1021 print_ver = True |
| 1022 elif o == '-z': |
| 1023 self.gzip_exts = [x.strip() for x in a.split(',')] |
| 1024 if preserve_acl and canned_acl: |
| 1025 raise CommandException( |
| 1026 'Specifying both the -p and -a options together is invalid.') |
| 1027 return CreateCopyHelperOpts( |
| 1028 perform_mv=perform_mv, |
| 1029 no_clobber=no_clobber, |
| 1030 daisy_chain=daisy_chain, |
| 1031 read_args_from_stdin=read_args_from_stdin, |
| 1032 print_ver=print_ver, |
| 1033 use_manifest=use_manifest, |
| 1034 preserve_acl=preserve_acl, |
| 1035 canned_acl=canned_acl, |
| 1036 skip_unsupported_objects=self.skip_unsupported_objects, |
| 1037 test_callback_file=test_callback_file) |
| 1038 |
| 1039 def _GetBucketWithVersioningConfig(self, exp_dst_url): |
| 1040 """Gets versioning config for a bucket and ensures that it exists. |
| 1041 |
| 1042 Args: |
| 1043 exp_dst_url: Wildcard-expanded destination StorageUrl. |
| 1044 |
| 1045 Raises: |
| 1046 AccessDeniedException: if there was a permissions problem accessing the |
| 1047 bucket or its versioning config. |
| 1048 CommandException: if URL refers to a cloud bucket that does not exist. |
| 1049 |
| 1050 Returns: |
| 1051 apitools Bucket with versioning configuration. |
| 1052 """ |
| 1053 bucket = None |
| 1054 if exp_dst_url.IsCloudUrl() and exp_dst_url.IsBucket(): |
| 1055 try: |
| 1056 bucket = self.gsutil_api.GetBucket( |
| 1057 exp_dst_url.bucket_name, provider=exp_dst_url.scheme, |
| 1058 fields=['versioning']) |
| 1059 except AccessDeniedException, e: |
| 1060 raise |
| 1061 except NotFoundException, e: |
| 1062 raise CommandException('Destination bucket %s does not exist.' % |
| 1063 exp_dst_url) |
| 1064 except Exception, e: |
| 1065 raise CommandException('Error retrieving destination bucket %s: %s' % |
| 1066 (exp_dst_url, e.message)) |
| 1067 return bucket |