| Index: tools/telemetry/third_party/gsutilz/gslib/commands/cp.py
|
| diff --git a/tools/telemetry/third_party/gsutilz/gslib/commands/cp.py b/tools/telemetry/third_party/gsutilz/gslib/commands/cp.py
|
| deleted file mode 100644
|
| index 34636dc47d5df460909c9a9e65b38f139aed9a10..0000000000000000000000000000000000000000
|
| --- a/tools/telemetry/third_party/gsutilz/gslib/commands/cp.py
|
| +++ /dev/null
|
| @@ -1,1067 +0,0 @@
|
| -# -*- coding: utf-8 -*-
|
| -# Copyright 2011 Google Inc. All Rights Reserved.
|
| -# Copyright 2011, Nexenta Systems Inc.
|
| -#
|
| -# Licensed under the Apache License, Version 2.0 (the "License");
|
| -# you may not use this file except in compliance with the License.
|
| -# You may obtain a copy of the License at
|
| -#
|
| -# http://www.apache.org/licenses/LICENSE-2.0
|
| -#
|
| -# Unless required by applicable law or agreed to in writing, software
|
| -# distributed under the License is distributed on an "AS IS" BASIS,
|
| -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| -# See the License for the specific language governing permissions and
|
| -# limitations under the License.
|
| -"""Implementation of Unix-like cp command for cloud storage providers."""
|
| -
|
| -from __future__ import absolute_import
|
| -
|
| -import os
|
| -import time
|
| -import traceback
|
| -
|
| -from gslib import copy_helper
|
| -from gslib.cat_helper import CatHelper
|
| -from gslib.cloud_api import AccessDeniedException
|
| -from gslib.cloud_api import NotFoundException
|
| -from gslib.command import Command
|
| -from gslib.command_argument import CommandArgument
|
| -from gslib.commands.compose import MAX_COMPONENT_COUNT
|
| -from gslib.copy_helper import CreateCopyHelperOpts
|
| -from gslib.copy_helper import ItemExistsError
|
| -from gslib.copy_helper import Manifest
|
| -from gslib.copy_helper import PARALLEL_UPLOAD_TEMP_NAMESPACE
|
| -from gslib.copy_helper import SkipUnsupportedObjectError
|
| -from gslib.cs_api_map import ApiSelector
|
| -from gslib.exception import CommandException
|
| -from gslib.name_expansion import NameExpansionIterator
|
| -from gslib.storage_url import ContainsWildcard
|
| -from gslib.util import CreateLock
|
| -from gslib.util import GetCloudApiInstance
|
| -from gslib.util import IsCloudSubdirPlaceholder
|
| -from gslib.util import MakeHumanReadable
|
| -from gslib.util import NO_MAX
|
| -from gslib.util import RemoveCRLFFromString
|
| -from gslib.util import StdinIterator
|
| -
|
| -_SYNOPSIS = """
|
| - gsutil cp [OPTION]... src_url dst_url
|
| - gsutil cp [OPTION]... src_url... dst_url
|
| - gsutil cp [OPTION]... -I dst_url
|
| -"""
|
| -
|
| -_SYNOPSIS_TEXT = """
|
| -<B>SYNOPSIS</B>
|
| -""" + _SYNOPSIS
|
| -
|
| -_DESCRIPTION_TEXT = """
|
| -<B>DESCRIPTION</B>
|
| - The gsutil cp command allows you to copy data between your local file
|
| - system and the cloud, copy data within the cloud, and copy data between
|
| - cloud storage providers. For example, to copy all text files from the
|
| - local directory to a bucket you could do:
|
| -
|
| - gsutil cp *.txt gs://my_bucket
|
| -
|
| - Similarly, you can download text files from a bucket by doing:
|
| -
|
| - gsutil cp gs://my_bucket/*.txt .
|
| -
|
| - If you want to copy an entire directory tree you need to use the -r option:
|
| -
|
| - gsutil cp -r dir gs://my_bucket
|
| -
|
| - If you have a large number of files to upload you might want to use the
|
| - gsutil -m option, to perform a parallel (multi-threaded/multi-processing)
|
| - copy:
|
| -
|
| - gsutil -m cp -r dir gs://my_bucket
|
| -
|
| - You can pass a list of URLs (one per line) to copy on stdin instead of as
|
| - command line arguments by using the -I option. This allows you to use gsutil
|
| - in a pipeline to upload or download files / objects as generated by a program,
|
| - such as:
|
| -
|
| - some_program | gsutil -m cp -I gs://my_bucket
|
| -
|
| - or:
|
| -
|
| - some_program | gsutil -m cp -I ./download_dir
|
| -
|
| - The contents of stdin can name files, cloud URLs, and wildcards of files
|
| - and cloud URLs.
|
| -"""
|
| -
|
| -_NAME_CONSTRUCTION_TEXT = """
|
| -<B>HOW NAMES ARE CONSTRUCTED</B>
|
| - The gsutil cp command strives to name objects in a way consistent with how
|
| - Linux cp works, which causes names to be constructed in varying ways depending
|
| - on whether you're performing a recursive directory copy or copying
|
| - individually named objects; and whether you're copying to an existing or
|
| - non-existent directory.
|
| -
|
| - When performing recursive directory copies, object names are constructed
|
| - that mirror the source directory structure starting at the point of
|
| - recursive processing. For example, the command:
|
| -
|
| - gsutil cp -r dir1/dir2 gs://my_bucket
|
| -
|
| - will create objects named like gs://my_bucket/dir2/a/b/c, assuming
|
| - dir1/dir2 contains the file a/b/c.
|
| -
|
| - In contrast, copying individually named files will result in objects named
|
| - by the final path component of the source files. For example, the command:
|
| -
|
| - gsutil cp dir1/dir2/** gs://my_bucket
|
| -
|
| - will create objects named like gs://my_bucket/c.
|
| -
|
| - The same rules apply for downloads: recursive copies of buckets and
|
| - bucket subdirectories produce a mirrored filename structure, while copying
|
| - individually (or wildcard) named objects produces flatly named files.
|
| -
|
| - Note that in the above example the '**' wildcard matches all names
|
| - anywhere under dir1/dir2. The wildcard '*' will match names just one level deep. For
|
| - more details see 'gsutil help wildcards'.
|
| -
|
| - There's an additional wrinkle when working with subdirectories: the resulting
|
| - names depend on whether the destination subdirectory exists. For example,
|
| - if gs://my_bucket/subdir exists as a subdirectory, the command:
|
| -
|
| - gsutil cp -r dir1/dir2 gs://my_bucket/subdir
|
| -
|
| - will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast,
|
| - if gs://my_bucket/subdir does not exist, this same gsutil cp command will
|
| - create objects named like gs://my_bucket/subdir/a/b/c.
|
| -
|
| - Note: If you use the
|
| - `Google Developers Console <https://console.developers.google.com>`_
|
| - to create folders, it does so by creating a "placeholder" object that ends
|
| - with a "/" character. gsutil skips these objects when downloading from the
|
| - cloud to the local file system, because attempting to create a file that
|
| - ends with a "/" is not allowed on Linux and MacOS. Because of this, it is
|
| - recommended that you not create objects that end with "/" (unless you don't
|
| - need to be able to download such objects using gsutil).
|
| -"""
|
| -
|
| -_SUBDIRECTORIES_TEXT = """
|
| -<B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B>
|
| - You can use gsutil to copy to and from subdirectories by using a command
|
| - like:
|
| -
|
| - gsutil cp -r dir gs://my_bucket/data
|
| -
|
| - This will cause dir and all of its files and nested subdirectories to be
|
| - copied under the specified destination, resulting in objects with names like
|
| - gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket
|
| - subdirectories by using a command like:
|
| -
|
| - gsutil cp -r gs://my_bucket/data dir
|
| -
|
| - This will cause everything nested under gs://my_bucket/data to be downloaded
|
| - into dir, resulting in files with names like dir/data/a/b/c.
|
| -
|
| - Copying subdirectories is useful if you want to add data to an existing
|
| - bucket directory structure over time. It's also useful if you want
|
| - to parallelize uploads and downloads across multiple machines (often
|
| - reducing overall transfer time compared with simply running gsutil -m
|
| - cp on one machine). For example, if your bucket contains this structure:
|
| -
|
| - gs://my_bucket/data/result_set_01/
|
| - gs://my_bucket/data/result_set_02/
|
| - ...
|
| - gs://my_bucket/data/result_set_99/
|
| -
|
| - you could perform concurrent downloads across 3 machines by running these
|
| - commands on each machine, respectively:
|
| -
|
| - gsutil -m cp -r gs://my_bucket/data/result_set_[0-3]* dir
|
| - gsutil -m cp -r gs://my_bucket/data/result_set_[4-6]* dir
|
| - gsutil -m cp -r gs://my_bucket/data/result_set_[7-9]* dir
|
| -
|
| - Note that dir could be a local directory on each machine, or it could
|
| - be a directory mounted off of a shared file server; whether the latter
|
| - performs acceptably may depend on a number of things, so we recommend
|
| - you experiment and find out what works best for you.
|
| -"""
|
| -
|
| -_COPY_IN_CLOUD_TEXT = """
|
| -<B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B>
|
| - If both the source and destination URL are cloud URLs from the same
|
| - provider, gsutil copies data "in the cloud" (i.e., without downloading
|
| - to and uploading from the machine where you run gsutil). In addition to
|
| - the performance and cost advantages of doing this, copying in the cloud
|
| - preserves metadata (like Content-Type and Cache-Control). In contrast,
|
| - when you download data from the cloud it ends up in a file, which has
|
| - no associated metadata. Thus, unless you have some way to hold on to
|
| - or re-create that metadata, downloading to a file will not retain the
|
| - metadata.
|
| -
|
| - Copies spanning locations and/or storage classes cause data to be rewritten
|
| - in the cloud, which may take some time. Such operations can be resumed with
|
| - the same command if they are interrupted, so long as the command parameters
|
| - are identical.
|
| -
|
| - Note that by default, the gsutil cp command does not copy the object
|
| - ACL to the new object, and instead will use the default bucket ACL (see
|
| - "gsutil help defacl"). You can override this behavior with the -p
|
| - option (see OPTIONS below).
|
| -
|
| - One additional note about copying in the cloud: If the destination bucket has
|
| - versioning enabled, gsutil cp will copy all versions of the source object(s).
|
| - For example:
|
| -
|
| - gsutil cp gs://bucket1/obj gs://bucket2
|
| -
|
| - will cause all versions of gs://bucket1/obj to be copied to gs://bucket2.
|
| -"""
|
| -
|
| -_CHECKSUM_VALIDATION_TEXT = """
|
| -<B>CHECKSUM VALIDATION</B>
|
| - At the end of every upload or download the gsutil cp command validates that
|
| - the checksum it computes for the source file/object matches the checksum
|
| - the service computes. If the checksums do not match, gsutil will delete the
|
| - corrupted object and print a warning message. This very rarely happens, but
|
| - if it does, please contact gs-team@google.com.
|
| -
|
| - If you know the MD5 of a file before uploading you can specify it in the
|
| - Content-MD5 header, which will cause the cloud storage service to reject the
|
| - upload if the MD5 doesn't match the value computed by the service. For
|
| - example:
|
| -
|
| - % gsutil hash obj
|
| - Hashing obj:
|
| - Hashes [base64] for obj:
|
| - Hash (crc32c): lIMoIw==
|
| - Hash (md5): VgyllJgiiaRAbyUUIqDMmw==
|
| -
|
| - % gsutil -h Content-MD5:VgyllJgiiaRAbyUUIqDMmw== cp obj gs://your-bucket/obj
|
| - Copying file://obj [Content-Type=text/plain]...
|
| - Uploading gs://your-bucket/obj: 182 b/182 B
|
| -
|
| - If the checksum didn't match the service would instead reject the upload and
|
| - gsutil would print a message like:
|
| -
|
| - BadRequestException: 400 Provided MD5 hash "VgyllJgiiaRAbyUUIqDMmw=="
|
| - doesn't match calculated MD5 hash "7gyllJgiiaRAbyUUIqDMmw==".
|
| -
|
| - Even if you don't do this, gsutil will delete the object if the computed
|
| - checksum doesn't match, but specifying the Content-MD5 header has three
|
| - advantages:
|
| -
|
| - 1. It prevents the corrupted object from becoming visible at all, whereas
|
| - otherwise it would be visible for 1-3 seconds before gsutil deletes it.
|
| -
|
| - 2. It will definitively prevent the corrupted object from being left in
|
| - the cloud, whereas the gsutil approach of deleting after the upload
|
| - completes could fail if (for example) the gsutil process gets ^C'd
|
| - between upload and deletion request.
|
| -
|
| - 3. It supports a customer-to-service integrity check handoff. For example,
|
| - if you have a content production pipeline that generates data to be
|
| - uploaded to the cloud along with checksums of that data, specifying the
|
| - MD5 computed by your content pipeline when you run gsutil cp will ensure
|
| - that the checksums match all the way through the process (e.g., detecting
|
| - if data gets corrupted on your local disk between the time it was written
|
| - by your content pipeline and the time it was uploaded to GCS).
|
| -
|
| - Note: The Content-MD5 header is ignored for composite objects, because such
|
| - objects only have a CRC32C checksum.
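| -
| -  If you want to compute the value to pass in the Content-MD5 header
| -  yourself, a minimal sketch in Python (the helper name here is
| -  illustrative, not a gsutil function) might look like this:
| -
| -      import base64
| -      import hashlib
| -
| -      def content_md5(path, chunk_size=8 * 1024 * 1024):
| -        # Return the base64-encoded MD5 digest, which is the form the
| -        # Content-MD5 header (and the gsutil hash output above) uses.
| -        md5 = hashlib.md5()
| -        with open(path, 'rb') as fp:
| -          for chunk in iter(lambda: fp.read(chunk_size), b''):
| -            md5.update(chunk)
| -        return base64.b64encode(md5.digest())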
|
| -"""
|
| -
|
| -_RETRY_HANDLING_TEXT = """
|
| -<B>RETRY HANDLING</B>
|
| - The cp command will retry when failures occur, but if enough failures happen
|
| - during a particular copy or delete operation the command will skip that object
|
| - and move on. At the end of the copy run if any failures were not successfully
|
| - retried, the cp command will report the count of failures, and exit with
|
| - non-zero status.
|
| -
|
| - Note that there are cases where retrying will never succeed, such as if you
|
| - don't have write permission to the destination bucket or if the destination
|
| - path for some objects is longer than the maximum allowed length.
|
| -
|
| - For more details about gsutil's retry handling, please see
|
| - "gsutil help retries".
|
| -"""
|
| -
|
| -_RESUMABLE_TRANSFERS_TEXT = """
|
| -<B>RESUMABLE TRANSFERS</B>
|
| - gsutil automatically uses the Google Cloud Storage resumable upload feature
|
| - whenever you use the cp command to upload an object that is larger than 2
|
| - MiB. You do not need to specify any special command line options to make this
|
| - happen. If your upload is interrupted you can restart the upload by running
|
| - the same cp command that you ran to start the upload. Until the upload
|
| - has completed successfully, it will not be visible at the destination object
|
| - and will not replace any existing object the upload is intended to overwrite.
|
| - (However, see the section on PARALLEL COMPOSITE UPLOADS, which may leave
|
| - temporary component objects in place during the upload process.)
|
| -
|
| - Similarly, gsutil automatically performs resumable downloads (using HTTP
|
| - standard Range GET operations) whenever you use the cp command, unless the
|
| - destination is a stream or null. In this case the partially downloaded file
|
| - will be visible as soon as it starts being written. Thus, before you attempt
|
| - to use any files downloaded by gsutil you should make sure the download
|
| - completed successfully, by checking the exit status from the gsutil command.
|
| - This can be done in a bash script, for example, by doing:
|
| -
|
| - gsutil cp gs://your-bucket/your-object ./local-file
|
| - if [ "$status" -ne "0" ] ; then
|
| - << Code that handles failures >>
|
| - fi
|
| -
|
| - Resumable uploads and downloads store some state information in a file
|
| - in ~/.gsutil named by the destination object or file. If you attempt to
|
| - resume a transfer from a machine with a different directory, the transfer
|
| - will start over from scratch.
|
| -
|
| - See also "gsutil help prod" for details on using resumable transfers
|
| - in production.
|
| -"""
|
| -
|
| -_STREAMING_TRANSFERS_TEXT = """
|
| -<B>STREAMING TRANSFERS</B>
|
| - Use '-' in place of src_url or dst_url to perform a streaming
|
| - transfer. For example:
|
| -
|
| - long_running_computation | gsutil cp - gs://my_bucket/obj
|
| -
|
| - Streaming uploads using the JSON API (see "gsutil help apis") are buffered in
|
| - memory and can retry in the event of network flakiness or service errors.
|
| -
|
| - Streaming transfers (other than uploads using the JSON API) do not support
|
| - resumable uploads/downloads. If you have a large amount of data to upload
|
| - (say, more than 100 MiB) it is recommended to write the data to a local file
|
| - and then copy that file to the cloud rather than streaming it (and similarly
|
| - for large downloads).
|
| -
|
| - WARNING: When performing streaming transfers gsutil does not compute a
|
| - checksum of the uploaded or downloaded data. Therefore, we recommend that
|
| - users either perform their own validation of the data or use non-streaming
|
| - transfers (which perform integrity checking automatically).
|
| -"""
|
| -
|
| -_PARALLEL_COMPOSITE_UPLOADS_TEXT = """
|
| -<B>PARALLEL COMPOSITE UPLOADS</B>
|
| - gsutil can automatically use
|
| - `object composition <https://developers.google.com/storage/docs/composite-objects>`_
|
| - to perform uploads in parallel for large, local files being uploaded to Google
|
| - Cloud Storage. This means that, if enabled (see next paragraph), a large file
|
| - will be split into component pieces that will be uploaded in parallel. Those
|
| - components will then be composed in the cloud, and the temporary components in
|
| - the cloud will be deleted after successful composition. No additional local
|
| - disk space is required for this operation.
|
| -
|
| - Using parallel composite uploads presents a tradeoff between upload
|
| - performance and download configuration: If you enable parallel composite
|
| - uploads your uploads will run faster, but someone will need to install a
|
| - compiled crcmod (see "gsutil help crcmod") on every machine where objects are
|
| - downloaded by gsutil or other Python applications. For some distributions this
|
| - is easy (e.g., it comes pre-installed on MacOS), but in some cases users have
|
| - found it difficult. Because of this at present parallel composite uploads are
|
| - disabled by default. Google is actively working with a number of the Linux
|
| - distributions to get crcmod included with the stock distribution. Once that is
|
| - done we will re-enable parallel composite uploads by default in gsutil.
|
| -
|
| - To try parallel composite uploads you can run the command:
|
| -
|
| - gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp bigfile gs://your-bucket
|
| -
|
| - where bigfile is larger than 150 MiB. When you do this notice that the upload
|
| - progress indicator continuously updates for several different uploads at once
|
| - (corresponding to each of the sections of the file being uploaded in
|
| - parallel), until the parallel upload completes. If you then want to enable
|
| - parallel composite uploads for all of your future uploads (notwithstanding the
|
| - caveats mentioned earlier), you can uncomment and set the
|
| - "parallel_composite_upload_threshold" config value in your .boto configuration
|
| - file to this value.
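| -
| -  As a sketch, the .boto entry being referred to lives under the [GSUtil]
| -  section and, once uncommented and set, looks like:
| -
| -      [GSUtil]
| -      parallel_composite_upload_threshold = 150M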
|
| -
|
| - Note that the crcmod problem only impacts downloads via Python applications
|
| - (such as gsutil). If any users who need to download the data using gsutil or
|
| - other Python applications can install crcmod, it makes sense to enable
|
| - parallel composite uploads (see above). For example, if you use gsutil to
|
| - upload video assets and those assets will only ever be served via a Java
|
| - application (there are efficient crc32c implementations available in Java), it
|
| - would make sense to enable parallel composite uploads on your machine.
|
| -
|
| - If a parallel composite upload fails prior to composition, re-running the
|
| - gsutil command will take advantage of resumable uploads for those components
|
| - that failed, and the component objects will be deleted after the first
|
| - successful attempt. Any temporary objects that were uploaded successfully
|
| - before gsutil failed will still exist until the upload is completed
|
| - successfully. The temporary objects will be named in the following fashion:
|
| -
|
| - <random ID>%s<hash>
|
| -
|
| - where <random ID> is some numerical value, and <hash> is an MD5 hash (not
|
| - related to the hash of the contents of the file or object).
|
| -
|
| - To avoid leaving temporary objects around, you should make sure to check the
|
| - exit status from the gsutil command. This can be done in a bash script, for
|
| - example, by doing:
|
| -
|
| - gsutil cp ./local-file gs://your-bucket/your-object
|
| - if [ "$status" -ne "0" ] ; then
|
| - << Code that handles failures >>
|
| - fi
|
| -
|
| - Or, for copying a directory, use this instead:
|
| -
|
| - gsutil cp -c -L cp.log -r ./dir gs://bucket
|
| - if [ "$status" -ne "0" ] ; then
|
| - << Code that handles failures >>
|
| - fi
|
| -
|
| - One important caveat is that files uploaded in this fashion are still subject
|
| - to the maximum number of components limit. For example, if you upload a large
|
| - file that gets split into %d components, and try to compose it with another
|
| - object with %d components, the operation will fail because it exceeds the %d
|
| - component limit. If you wish to compose an object later and the component
|
| - limit is a concern, it is recommended that you disable parallel composite
|
| - uploads for that transfer.
|
| -
|
| - Also note that an object uploaded using this feature will have a CRC32C hash,
|
| - but it will not have an MD5 hash (and because of that, requires users who
|
| - download the object to have crcmod installed, as noted earlier). For details
|
| - see 'gsutil help crc32c'.
|
| -
|
| - Note that this feature can be completely disabled by setting the
|
| - "parallel_composite_upload_threshold" variable in the .boto config file to 0.
|
| -""" % (PARALLEL_UPLOAD_TEMP_NAMESPACE, 10, MAX_COMPONENT_COUNT - 9,
|
| - MAX_COMPONENT_COUNT)
|
| -
|
| -
|
| -_CHANGING_TEMP_DIRECTORIES_TEXT = """
|
| -<B>CHANGING TEMP DIRECTORIES</B>
|
| - gsutil writes data to a temporary directory in several cases:
|
| -
|
| - - when compressing data to be uploaded (see the -z option)
|
| - - when decompressing data being downloaded (when the data has
|
| - Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z)
|
| - - when running integration tests (using the gsutil test command)
|
| -
|
| - In these cases it's possible the temp file location on your system that
|
| - gsutil selects by default may not have enough space. If you find that
|
| - gsutil runs out of space during one of these operations (e.g., raising
|
| - "CommandException: Inadequate temp space available to compress <your file>"
|
| - during a gsutil cp -z operation), you can change where it writes these
|
| - temp files by setting the TMPDIR environment variable. On Linux and MacOS
|
| - you can do this either by running gsutil this way:
|
| -
|
| - TMPDIR=/some/directory gsutil cp ...
|
| -
|
| - or by adding this line to your ~/.bashrc file and then restarting the shell
|
| - before running gsutil:
|
| -
|
| - export TMPDIR=/some/directory
|
| -
|
| - On Windows 7 you can change the TMPDIR environment variable from Start ->
|
| - Computer -> System -> Advanced System Settings -> Environment Variables.
|
| - You need to reboot after making this change for it to take effect. (Rebooting
|
| - is not necessary after running the export command on Linux and MacOS.)
|
| -"""
|
| -
|
| -_OPTIONS_TEXT = """
|
| -<B>OPTIONS</B>
|
| - -a canned_acl Sets the named canned_acl when objects are uploaded. See
|
| - 'gsutil help acls' for further details.
|
| -
|
| - -c If an error occurs, continue to attempt to copy the remaining
|
| - files. If any copies were unsuccessful, gsutil's exit status
|
| - will be non-zero even if this flag is set. This option is
|
| - implicitly set when running "gsutil -m cp...". Note: -c only
|
| - applies to the actual copying operation. If an error occurs
|
| - while iterating over the files in the local directory (e.g.,
|
| - invalid Unicode file name) gsutil will print an error message
|
| - and abort.
|
| -
|
| - -D Copy in "daisy chain" mode, i.e., copying between two buckets
|
| - by hooking a download to an upload, via the machine where
|
| - gsutil is run. By default, data are copied between two buckets
|
| - "in the cloud", i.e., without needing to copy via the machine
|
| - where gsutil runs.
|
| -
|
| - By default, a "copy in the cloud" when the source is a
|
| - composite object will retain the composite nature of the
|
| - object. However, daisy chain mode can be used to change a
|
| - composite object into a non-composite object. For example:
|
| -
|
| - gsutil cp -D -p gs://bucket/obj gs://bucket/obj_tmp
|
| - gsutil mv -p gs://bucket/obj_tmp gs://bucket/obj
|
| -
|
| - Note: Daisy chain mode is automatically used when copying
|
| - between providers (e.g., to copy data from Google Cloud Storage
|
| - to another provider).
|
| -
|
| - -e Exclude symlinks. When specified, symbolic links will not be
|
| - copied.
|
| -
|
| - -I Causes gsutil to read the list of files or objects to copy from
|
| - stdin. This allows you to run a program that generates the list
|
| - of files to upload/download.
|
| -
|
| - -L <file> Outputs a manifest log file with detailed information about
|
| - each item that was copied. This manifest contains the following
|
| - information for each item:
|
| -
|
| - - Source path.
|
| - - Destination path.
|
| - - Source size.
|
| - - Bytes transferred.
|
| - - MD5 hash.
|
| - - UTC date and time transfer was started in ISO 8601 format.
|
| - - UTC date and time transfer was completed in ISO 8601 format.
|
| - - Upload id, if a resumable upload was performed.
|
| - - Final result of the attempted transfer, success or failure.
|
| - - Failure details, if any.
|
| -
|
| - If the log file already exists, gsutil will use the file as an
|
| - input to the copy process, and will also append log items to
|
| - the existing file. Files/objects that are marked in the
|
| - existing log file as having been successfully copied (or
|
| - skipped) will be ignored. Files/objects without entries will be
|
| - copied and ones previously marked as unsuccessful will be
|
| - retried. This can be used in conjunction with the -c option to
|
| - build a script that copies a large number of objects reliably,
|
| - using a bash script like the following:
|
| -
|
| - until gsutil cp -c -L cp.log -r ./dir gs://bucket; do
|
| - sleep 1
|
| - done
|
| -
|
| - The -c option will cause copying to continue after failures
|
| - occur, and the -L option will allow gsutil to pick up where it
|
| - left off without duplicating work. The loop will continue
|
| - running as long as gsutil exits with a non-zero status (such a
|
| - status indicates there was at least one failure during the
|
| - gsutil run).
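| - A short sketch for inspecting such a manifest appears after this list of options.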
|
| -
|
| - Note: If you're trying to synchronize the contents of a
|
| - directory and a bucket (or two buckets), see
|
| - 'gsutil help rsync'.
|
| -
|
| - -n No-clobber. When specified, existing files or objects at the
|
| - destination will not be overwritten. Any items that are skipped
|
| - by this option will be reported as being skipped. This option
|
| - will perform an additional GET request to check if an item
|
| - exists before attempting to upload the data. This will save
|
| - retransmitting data, but the additional HTTP requests may make
|
| - small object transfers slower and more expensive.
|
| -
|
| - -p Causes ACLs to be preserved when copying in the cloud. Note
|
| - that this option has performance and cost implications when
|
| - using the XML API, as it requires separate HTTP calls for
|
| - interacting with ACLs. The performance issue can be mitigated
|
| - to some degree by using gsutil -m cp to cause parallel copying.
|
| - Also, this option only works if you have OWNER access to all of
|
| - the objects that are copied.
|
| -
|
| - You can avoid the additional performance and cost of using
|
| - cp -p if you want all objects in the destination bucket to end
|
| - up with the same ACL by setting a default object ACL on that
|
| - bucket instead of using cp -p. See "gsutil help defacl".
|
| -
|
| - Note that it's not valid to specify both the -a and -p options
|
| - together.
|
| -
|
| - -R, -r Causes directories, buckets, and bucket subdirectories to be
|
| - copied recursively. If you neglect to use this option for
|
| - an upload, gsutil will copy any files it finds and skip any
|
| - directories. Similarly, neglecting to specify -r for a download
|
| - will cause gsutil to copy any objects at the current bucket
|
| - directory level, and skip any subdirectories.
|
| -
|
| - -U Skip objects with unsupported object types instead of failing.
|
| - Unsupported object types are S3 Glacier objects.
|
| -
|
| - -v Requests that the version-specific URL for each uploaded object
|
| - be printed. Given this URL you can make future upload requests
|
| - that are safe in the face of concurrent updates, because Google
|
| - Cloud Storage will refuse to perform the update if the current
|
| - object version doesn't match the version-specific URL. See
|
| - 'gsutil help versions' for more details.
|
| -
|
| - -z <ext,...> Applies gzip content-encoding to file uploads with the given
|
| - extensions. This is useful when uploading files with
|
| - compressible content (such as .js, .css, or .html files)
|
| - because it saves network bandwidth and space in Google Cloud
|
| - Storage, which in turn reduces storage costs.
|
| -
|
| - When you specify the -z option, the data from your files is
|
| - compressed before it is uploaded, but your actual files are
|
| - left uncompressed on the local disk. The uploaded objects
|
| - retain the Content-Type and name of the original files but are
|
| - given a Content-Encoding header with the value "gzip" to
|
| - indicate that the object data are stored compressed on the
|
| - Google Cloud Storage servers.
|
| -
|
| - For example, the following command:
|
| -
|
| - gsutil cp -z html -a public-read cattypes.html gs://mycats
|
| -
|
| - will do all of the following:
|
| -
|
| - - Upload as the object gs://mycats/cattypes.html (cp command)
|
| - - Set the Content-Type to text/html (based on file extension)
|
| - - Compress the data in the file cattypes.html (-z option)
|
| - - Set the Content-Encoding to gzip (-z option)
|
| - - Set the ACL to public-read (-a option)
|
| - - If a user tries to view cattypes.html in a browser, the
|
| - browser will know to uncompress the data based on the
|
| - Content-Encoding header, and to render it as HTML based on
|
| - the Content-Type header.
|
| -
|
| - Note that if you download an object with Content-Encoding:gzip
|
| - gsutil will decompress the content before writing the local
|
| - file.
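| -
| -  The manifest produced by the -L option is a CSV file. As a minimal
| -  sketch of how it might be inspected from Python (the column names
| -  'Source' and 'Result' are assumed here; check the header row of your
| -  own manifest), you could collect the entries that did not complete:
| -
| -      import csv
| -
| -      def failed_sources(manifest_path):
| -        # Return source URLs whose result was neither 'OK' nor 'skip'
| -        # (i.e., errors), so they can be inspected or retried.
| -        failures = []
| -        with open(manifest_path) as fp:
| -          for row in csv.DictReader(fp):
| -            if row.get('Result') not in ('OK', 'skip'):
| -              failures.append(row.get('Source'))
| -        return failures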
|
| -"""
|
| -
|
| -_DETAILED_HELP_TEXT = '\n\n'.join([_SYNOPSIS_TEXT,
|
| - _DESCRIPTION_TEXT,
|
| - _NAME_CONSTRUCTION_TEXT,
|
| - _SUBDIRECTORIES_TEXT,
|
| - _COPY_IN_CLOUD_TEXT,
|
| - _CHECKSUM_VALIDATION_TEXT,
|
| - _RETRY_HANDLING_TEXT,
|
| - _RESUMABLE_TRANSFERS_TEXT,
|
| - _STREAMING_TRANSFERS_TEXT,
|
| - _PARALLEL_COMPOSITE_UPLOADS_TEXT,
|
| - _CHANGING_TEMP_DIRECTORIES_TEXT,
|
| - _OPTIONS_TEXT])
|
| -
|
| -
|
| -CP_SUB_ARGS = 'a:cDeIL:MNnprRtUvz:'
|
| -
|
| -
|
| -def _CopyFuncWrapper(cls, args, thread_state=None):
|
| - cls.CopyFunc(args, thread_state=thread_state)
|
| -
|
| -
|
| -def _CopyExceptionHandler(cls, e):
|
| - """Simple exception handler to allow post-completion status."""
|
| - cls.logger.error(str(e))
|
| - cls.op_failure_count += 1
|
| - cls.logger.debug('\n\nEncountered exception while copying:\n%s\n',
|
| - traceback.format_exc())
|
| -
|
| -
|
| -def _RmExceptionHandler(cls, e):
|
| - """Simple exception handler to allow post-completion status."""
|
| - cls.logger.error(str(e))
|
| -
|
| -
|
| -class CpCommand(Command):
|
| - """Implementation of gsutil cp command.
|
| -
|
| - Note that CpCommand is run for both gsutil cp and gsutil mv. The latter
|
| - happens by MvCommand calling CpCommand and passing the hidden (undocumented)
|
| - -M option. This allows the copy and remove needed for each mv to run
|
| - together (rather than first running all the cp's and then all the rm's, as
|
| - we originally had implemented), which in turn avoids the following problem
|
| - with removing the wrong objects: starting with a bucket containing only
|
| - the object gs://bucket/obj, say the user does:
|
| - gsutil mv gs://bucket/* gs://bucket/d.txt
|
| - If we ran all the cp's and then all the rm's and we didn't expand the wildcard
|
| - first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt,
|
| - and the rm command would then remove that object. In the implementation
|
| - prior to gsutil release 3.12 we avoided this by building a list of objects
|
| - to process and then running the copies and then the removes; but building
|
| - the list up front limits scalability (compared with the current approach
|
| - of processing the bucket listing iterator on the fly).
|
| - """
|
| -
|
| - # Command specification. See base class for documentation.
|
| - command_spec = Command.CreateCommandSpec(
|
| - 'cp',
|
| - command_name_aliases=['copy'],
|
| - usage_synopsis=_SYNOPSIS,
|
| - min_args=1,
|
| - max_args=NO_MAX,
|
| - # -t is deprecated but leave intact for now to avoid breakage.
|
| - supported_sub_args=CP_SUB_ARGS,
|
| - file_url_ok=True,
|
| - provider_url_ok=False,
|
| - urls_start_arg=0,
|
| - gs_api_support=[ApiSelector.XML, ApiSelector.JSON],
|
| - gs_default_api=ApiSelector.JSON,
|
| - supported_private_args=['testcallbackfile='],
|
| - argparse_arguments=[
|
| - CommandArgument.MakeZeroOrMoreCloudOrFileURLsArgument()
|
| - ]
|
| - )
|
| - # Help specification. See help_provider.py for documentation.
|
| - help_spec = Command.HelpSpec(
|
| - help_name='cp',
|
| - help_name_aliases=['copy'],
|
| - help_type='command_help',
|
| - help_one_line_summary='Copy files and objects',
|
| - help_text=_DETAILED_HELP_TEXT,
|
| - subcommand_help_text={},
|
| - )
|
| -
|
| - # pylint: disable=too-many-statements
|
| - def CopyFunc(self, name_expansion_result, thread_state=None):
|
| - """Worker function for performing the actual copy (and rm, for mv)."""
|
| - gsutil_api = GetCloudApiInstance(self, thread_state=thread_state)
|
| -
|
| - copy_helper_opts = copy_helper.GetCopyHelperOpts()
|
| - if copy_helper_opts.perform_mv:
|
| - cmd_name = 'mv'
|
| - else:
|
| - cmd_name = self.command_name
|
| - src_url = name_expansion_result.source_storage_url
|
| - exp_src_url = name_expansion_result.expanded_storage_url
|
| - src_url_names_container = name_expansion_result.names_container
|
| - have_multiple_srcs = name_expansion_result.is_multi_source_request
|
| -
|
| - if src_url.IsCloudUrl() and src_url.IsProvider():
|
| - raise CommandException(
|
| - 'The %s command does not allow provider-only source URLs (%s)' %
|
| - (cmd_name, src_url))
|
| - if have_multiple_srcs:
|
| - copy_helper.InsistDstUrlNamesContainer(
|
| - self.exp_dst_url, self.have_existing_dst_container, cmd_name)
|
| -
|
| - # Various GUI tools (like the GCS web console) create placeholder objects
|
| - # ending with '/' when the user creates an empty directory. Normally these
|
| - # tools should delete those placeholders once objects have been written
|
| - # "under" the directory, but sometimes the placeholders are left around. We
|
| - # need to filter them out here, otherwise if the user tries to rsync from
|
| - # GCS to a local directory it will result in a directory/file conflict
|
| - # (e.g., trying to download an object called "mydata/" where the local
|
| - # directory "mydata" exists).
|
| - if IsCloudSubdirPlaceholder(exp_src_url):
|
| - self.logger.info('Skipping cloud sub-directory placeholder object (%s) '
|
| - 'because such objects aren\'t needed in (and would '
|
| - 'interfere with) directories in the local file system',
|
| - exp_src_url)
|
| - return
|
| -
|
| - if copy_helper_opts.use_manifest and self.manifest.WasSuccessful(
|
| - exp_src_url.url_string):
|
| - return
|
| -
|
| - if copy_helper_opts.perform_mv:
|
| - if name_expansion_result.names_container:
|
| - # Use recursion_requested when performing name expansion for the
|
| - # directory mv case so we can determine if any of the source URLs are
|
| - # directories (and then use cp -r and rm -r to perform the move, to
|
| - # match the behavior of Linux mv (which when moving a directory moves
|
| - # all the contained files)).
|
| - self.recursion_requested = True
|
| - # Disallow wildcard src URLs when moving directories, as supporting it
|
| - # would make the name transformation too complex and would also be
|
| - # dangerous (e.g., someone could accidentally move many objects to the
|
| - # wrong name, or accidentally overwrite many objects).
|
| - if ContainsWildcard(src_url.url_string):
|
| - raise CommandException('The mv command disallows naming source '
|
| - 'directories using wildcards')
|
| -
|
| - if (self.exp_dst_url.IsFileUrl()
|
| - and not os.path.exists(self.exp_dst_url.object_name)
|
| - and have_multiple_srcs):
|
| - os.makedirs(self.exp_dst_url.object_name)
|
| -
|
| - dst_url = copy_helper.ConstructDstUrl(
|
| - src_url, exp_src_url, src_url_names_container, have_multiple_srcs,
|
| - self.exp_dst_url, self.have_existing_dst_container,
|
| - self.recursion_requested)
|
| - dst_url = copy_helper.FixWindowsNaming(src_url, dst_url)
|
| -
|
| - copy_helper.CheckForDirFileConflict(exp_src_url, dst_url)
|
| - if copy_helper.SrcDstSame(exp_src_url, dst_url):
|
| - raise CommandException('%s: "%s" and "%s" are the same file - '
|
| - 'abort.' % (cmd_name, exp_src_url, dst_url))
|
| -
|
| - if dst_url.IsCloudUrl() and dst_url.HasGeneration():
|
| - raise CommandException('%s: a version-specific URL\n(%s)\ncannot be '
|
| - 'the destination for gsutil cp - abort.'
|
| - % (cmd_name, dst_url))
|
| -
|
| - elapsed_time = bytes_transferred = 0
|
| - try:
|
| - if copy_helper_opts.use_manifest:
|
| - self.manifest.Initialize(
|
| - exp_src_url.url_string, dst_url.url_string)
|
| - (elapsed_time, bytes_transferred, result_url, md5) = (
|
| - copy_helper.PerformCopy(
|
| - self.logger, exp_src_url, dst_url, gsutil_api,
|
| - self, _CopyExceptionHandler, allow_splitting=True,
|
| - headers=self.headers, manifest=self.manifest,
|
| - gzip_exts=self.gzip_exts, test_method=self.test_method))
|
| - if copy_helper_opts.use_manifest:
|
| - if md5:
|
| - self.manifest.Set(exp_src_url.url_string, 'md5', md5)
|
| - self.manifest.SetResult(
|
| - exp_src_url.url_string, bytes_transferred, 'OK')
|
| - if copy_helper_opts.print_ver:
|
| - # Some cases don't return a version-specific URL (e.g., if destination
|
| - # is a file).
|
| - self.logger.info('Created: %s', result_url)
|
| - except ItemExistsError:
|
| - message = 'Skipping existing item: %s' % dst_url
|
| - self.logger.info(message)
|
| - if copy_helper_opts.use_manifest:
|
| - self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
|
| - except SkipUnsupportedObjectError, e:
|
| - message = ('Skipping item %s with unsupported object type %s' %
|
| - (exp_src_url.url_string, e.unsupported_type))
|
| - self.logger.info(message)
|
| - if copy_helper_opts.use_manifest:
|
| - self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message)
|
| - except copy_helper.FileConcurrencySkipError, e:
|
| - self.logger.warn('Skipping copy of source URL %s because destination URL '
|
| - '%s is already being copied by another gsutil process '
|
| - 'or thread (did you specify the same source URL twice?) '
|
| - % (src_url, dst_url))
|
| - except Exception, e:
|
| - if (copy_helper_opts.no_clobber and
|
| - copy_helper.IsNoClobberServerException(e)):
|
| - message = 'Rejected (noclobber): %s' % dst_url
|
| - self.logger.info(message)
|
| - if copy_helper_opts.use_manifest:
|
| - self.manifest.SetResult(
|
| - exp_src_url.url_string, 0, 'skip', message)
|
| - elif self.continue_on_error:
|
| - message = 'Error copying %s: %s' % (src_url, str(e))
|
| - self.op_failure_count += 1
|
| - self.logger.error(message)
|
| - if copy_helper_opts.use_manifest:
|
| - self.manifest.SetResult(
|
| - exp_src_url.url_string, 0, 'error',
|
| - RemoveCRLFFromString(message))
|
| - else:
|
| - if copy_helper_opts.use_manifest:
|
| - self.manifest.SetResult(
|
| - exp_src_url.url_string, 0, 'error', str(e))
|
| - raise
|
| - else:
|
| - if copy_helper_opts.perform_mv:
|
| - self.logger.info('Removing %s...', exp_src_url)
|
| - if exp_src_url.IsCloudUrl():
|
| - gsutil_api.DeleteObject(exp_src_url.bucket_name,
|
| - exp_src_url.object_name,
|
| - generation=exp_src_url.generation,
|
| - provider=exp_src_url.scheme)
|
| - else:
|
| - os.unlink(exp_src_url.object_name)
|
| -
|
| - with self.stats_lock:
|
| - self.total_elapsed_time += elapsed_time
|
| - self.total_bytes_transferred += bytes_transferred
|
| -
|
| - # Command entry point.
|
| - def RunCommand(self):
|
| - copy_helper_opts = self._ParseOpts()
|
| -
|
| - self.total_elapsed_time = self.total_bytes_transferred = 0
|
| - if self.args[-1] == '-' or self.args[-1] == 'file://-':
|
| - return CatHelper(self).CatUrlStrings(self.args[:-1])
|
| -
|
| - if copy_helper_opts.read_args_from_stdin:
|
| - if len(self.args) != 1:
|
| - raise CommandException('Source URLs cannot be specified with -I option')
|
| - url_strs = StdinIterator()
|
| - else:
|
| - if len(self.args) < 2:
|
| - raise CommandException('Wrong number of arguments for "cp" command.')
|
| - url_strs = self.args[:-1]
|
| -
|
| - (self.exp_dst_url, self.have_existing_dst_container) = (
|
| - copy_helper.ExpandUrlToSingleBlr(self.args[-1], self.gsutil_api,
|
| - self.debug, self.project_id))
|
| -
|
| - # If the destination bucket has versioning enabled iterate with
|
| - # all_versions=True. That way we'll copy all versions if the source bucket
|
| - # is versioned; and by leaving all_versions=False if the destination bucket
|
| - # has versioning disabled we will avoid copying old versions all to the same
|
| - # un-versioned destination object.
|
| - all_versions = False
|
| - try:
|
| - bucket = self._GetBucketWithVersioningConfig(self.exp_dst_url)
|
| - if bucket and bucket.versioning and bucket.versioning.enabled:
|
| - all_versions = True
|
| - except AccessDeniedException:
|
| - # This happens (in the XML API only) if the user doesn't have OWNER access
|
| - # on the bucket (needed to check if versioning is enabled). In this case
|
| - # fall back to copying all versions (which can be inefficient for the
|
| - # reason noted in the comment above). We don't try to warn the user
|
| - # because that would result in false positive warnings (since we can't
|
| - # check if versioning is enabled on the destination bucket).
|
| - #
|
| - # For JSON, we will silently not return versioning if we don't have
|
| - # access.
|
| - all_versions = True
|
| -
|
| - name_expansion_iterator = NameExpansionIterator(
|
| - self.command_name, self.debug,
|
| - self.logger, self.gsutil_api, url_strs,
|
| - self.recursion_requested or copy_helper_opts.perform_mv,
|
| - project_id=self.project_id, all_versions=all_versions,
|
| - continue_on_error=self.continue_on_error or self.parallel_operations)
|
| -
|
| - # Use a lock to ensure accurate statistics in the face of
|
| - # multi-threading/multi-processing.
|
| - self.stats_lock = CreateLock()
|
| -
|
| - # Tracks if any copies failed.
|
| - self.op_failure_count = 0
|
| -
|
| - # Start the clock.
|
| - start_time = time.time()
|
| -
|
| - # Tuple of attributes to share/manage across multiple processes in
|
| - # parallel (-m) mode.
|
| - shared_attrs = ('op_failure_count', 'total_bytes_transferred')
|
| -
|
| - # Perform copy requests in parallel (-m) mode, if requested, using
|
| - # configured number of parallel processes and threads. Otherwise,
|
| - # perform requests with sequential function calls in current process.
|
| - self.Apply(_CopyFuncWrapper, name_expansion_iterator,
|
| - _CopyExceptionHandler, shared_attrs,
|
| - fail_on_error=(not self.continue_on_error))
|
| - self.logger.debug(
|
| - 'total_bytes_transferred: %d', self.total_bytes_transferred)
|
| -
|
| - end_time = time.time()
|
| - self.total_elapsed_time = end_time - start_time
|
| -
|
| - # Sometimes, particularly when running unit tests, the total elapsed time
|
| - # is really small. On Windows, the timer resolution is too small and
|
| - # causes total_elapsed_time to be zero.
|
| - try:
|
| - float(self.total_bytes_transferred) / float(self.total_elapsed_time)
|
| - except ZeroDivisionError:
|
| - self.total_elapsed_time = 0.01
|
| -
|
| - self.total_bytes_per_second = (float(self.total_bytes_transferred) /
|
| - float(self.total_elapsed_time))
|
| -
|
| - if self.debug == 3:
|
| - # Note that this only counts the actual GET and PUT bytes for the copy
|
| - # - not any transfers for doing wildcard expansion, the initial
|
| - # HEAD/GET request performed to get the object metadata, etc.
|
| - if self.total_bytes_transferred != 0:
|
| - self.logger.info(
|
| - 'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)',
|
| - self.total_bytes_transferred, self.total_elapsed_time,
|
| - MakeHumanReadable(self.total_bytes_per_second))
|
| - if self.op_failure_count:
|
| - plural_str = 's' if self.op_failure_count > 1 else ''
|
| - raise CommandException('%d file%s/object%s could not be transferred.' % (
|
| - self.op_failure_count, plural_str, plural_str))
|
| -
|
| - return 0
|
| -
|
| - def _ParseOpts(self):
|
| - perform_mv = False
|
| - # exclude_symlinks is handled by Command parent class, so save in Command
|
| - # state rather than CopyHelperOpts.
|
| - self.exclude_symlinks = False
|
| - no_clobber = False
|
| - # continue_on_error is handled by Command parent class, so save in Command
|
| - # state rather than CopyHelperOpts.
|
| - self.continue_on_error = False
|
| - daisy_chain = False
|
| - read_args_from_stdin = False
|
| - print_ver = False
|
| - use_manifest = False
|
| - preserve_acl = False
|
| - canned_acl = None
|
| - # canned_acl is handled by a helper function in parent
|
| - # Command class, so save in Command state rather than CopyHelperOpts.
|
| - self.canned = None
|
| -
|
| - self.skip_unsupported_objects = False
|
| -
|
| - # Files matching these extensions should be gzipped before uploading.
|
| - self.gzip_exts = []
|
| -
|
| - test_callback_file = None
|
| -
|
| - # self.recursion_requested initialized in command.py (so can be checked
|
| - # in parent class for all commands).
|
| - self.manifest = None
|
| - if self.sub_opts:
|
| - for o, a in self.sub_opts:
|
| - if o == '-a':
|
| - canned_acl = a
|
| - self.canned = True
|
| - if o == '-c':
|
| - self.continue_on_error = True
|
| - elif o == '-D':
|
| - daisy_chain = True
|
| - elif o == '-e':
|
| - self.exclude_symlinks = True
|
| - elif o == '--testcallbackfile':
|
| - # File path of a pickled class that implements ProgressCallback.call.
|
| - # Used for testing transfer interruptions and resumes.
|
| - test_callback_file = a
|
| - elif o == '-I':
|
| - read_args_from_stdin = True
|
| - elif o == '-L':
|
| - use_manifest = True
|
| - self.manifest = Manifest(a)
|
| - elif o == '-M':
|
| - # Note that we signal to the cp command to perform a move (copy
|
| - # followed by remove) and use directory-move naming rules by passing
|
| - # the undocumented (for internal use) -M option when running the cp
|
| - # command from mv.py.
|
| - perform_mv = True
|
| - elif o == '-n':
|
| - no_clobber = True
|
| - elif o == '-p':
|
| - preserve_acl = True
|
| - elif o == '-r' or o == '-R':
|
| - self.recursion_requested = True
|
| - elif o == '-U':
|
| - self.skip_unsupported_objects = True
|
| - elif o == '-v':
|
| - print_ver = True
|
| - elif o == '-z':
|
| - self.gzip_exts = [x.strip() for x in a.split(',')]
|
| - if preserve_acl and canned_acl:
|
| - raise CommandException(
|
| - 'Specifying both the -p and -a options together is invalid.')
|
| - return CreateCopyHelperOpts(
|
| - perform_mv=perform_mv,
|
| - no_clobber=no_clobber,
|
| - daisy_chain=daisy_chain,
|
| - read_args_from_stdin=read_args_from_stdin,
|
| - print_ver=print_ver,
|
| - use_manifest=use_manifest,
|
| - preserve_acl=preserve_acl,
|
| - canned_acl=canned_acl,
|
| - skip_unsupported_objects=self.skip_unsupported_objects,
|
| - test_callback_file=test_callback_file)
|
| -
|
| - def _GetBucketWithVersioningConfig(self, exp_dst_url):
|
| - """Gets versioning config for a bucket and ensures that it exists.
|
| -
|
| - Args:
|
| - exp_dst_url: Wildcard-expanded destination StorageUrl.
|
| -
|
| - Raises:
|
| - AccessDeniedException: if there was a permissions problem accessing the
|
| - bucket or its versioning config.
|
| - CommandException: if URL refers to a cloud bucket that does not exist.
|
| -
|
| - Returns:
|
| - apitools Bucket with versioning configuration.
|
| - """
|
| - bucket = None
|
| - if exp_dst_url.IsCloudUrl() and exp_dst_url.IsBucket():
|
| - try:
|
| - bucket = self.gsutil_api.GetBucket(
|
| - exp_dst_url.bucket_name, provider=exp_dst_url.scheme,
|
| - fields=['versioning'])
|
| - except AccessDeniedException, e:
|
| - raise
|
| - except NotFoundException, e:
|
| - raise CommandException('Destination bucket %s does not exist.' %
|
| - exp_dst_url)
|
| - except Exception, e:
|
| - raise CommandException('Error retrieving destination bucket %s: %s' %
|
| - (exp_dst_url, e.message))
|
| - return bucket
|
|