Index: tools/telemetry/third_party/gsutil/gslib/commands/cp.py |
diff --git a/tools/telemetry/third_party/gsutil/gslib/commands/cp.py b/tools/telemetry/third_party/gsutil/gslib/commands/cp.py |
deleted file mode 100644 |
index 34636dc47d5df460909c9a9e65b38f139aed9a10..0000000000000000000000000000000000000000 |
--- a/tools/telemetry/third_party/gsutil/gslib/commands/cp.py |
+++ /dev/null |
@@ -1,1067 +0,0 @@ |
-# -*- coding: utf-8 -*- |
-# Copyright 2011 Google Inc. All Rights Reserved. |
-# Copyright 2011, Nexenta Systems Inc. |
-# |
-# Licensed under the Apache License, Version 2.0 (the "License"); |
-# you may not use this file except in compliance with the License. |
-# You may obtain a copy of the License at |
-# |
-# http://www.apache.org/licenses/LICENSE-2.0 |
-# |
-# Unless required by applicable law or agreed to in writing, software |
-# distributed under the License is distributed on an "AS IS" BASIS, |
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
-# See the License for the specific language governing permissions and |
-# limitations under the License. |
-"""Implementation of Unix-like cp command for cloud storage providers.""" |
- |
-from __future__ import absolute_import |
- |
-import os |
-import time |
-import traceback |
- |
-from gslib import copy_helper |
-from gslib.cat_helper import CatHelper |
-from gslib.cloud_api import AccessDeniedException |
-from gslib.cloud_api import NotFoundException |
-from gslib.command import Command |
-from gslib.command_argument import CommandArgument |
-from gslib.commands.compose import MAX_COMPONENT_COUNT |
-from gslib.copy_helper import CreateCopyHelperOpts |
-from gslib.copy_helper import ItemExistsError |
-from gslib.copy_helper import Manifest |
-from gslib.copy_helper import PARALLEL_UPLOAD_TEMP_NAMESPACE |
-from gslib.copy_helper import SkipUnsupportedObjectError |
-from gslib.cs_api_map import ApiSelector |
-from gslib.exception import CommandException |
-from gslib.name_expansion import NameExpansionIterator |
-from gslib.storage_url import ContainsWildcard |
-from gslib.util import CreateLock |
-from gslib.util import GetCloudApiInstance |
-from gslib.util import IsCloudSubdirPlaceholder |
-from gslib.util import MakeHumanReadable |
-from gslib.util import NO_MAX |
-from gslib.util import RemoveCRLFFromString |
-from gslib.util import StdinIterator |
- |
-_SYNOPSIS = """ |
- gsutil cp [OPTION]... src_url dst_url |
- gsutil cp [OPTION]... src_url... dst_url |
- gsutil cp [OPTION]... -I dst_url |
-""" |
- |
-_SYNOPSIS_TEXT = """ |
-<B>SYNOPSIS</B> |
-""" + _SYNOPSIS |
- |
-_DESCRIPTION_TEXT = """ |
-<B>DESCRIPTION</B> |
- The gsutil cp command allows you to copy data between your local file |
- system and the cloud, copy data within the cloud, and copy data between |
- cloud storage providers. For example, to copy all text files from the |
- local directory to a bucket you could do: |
- |
- gsutil cp *.txt gs://my_bucket |
- |
- Similarly, you can download text files from a bucket by doing: |
- |
- gsutil cp gs://my_bucket/*.txt . |
- |
- If you want to copy an entire directory tree you need to use the -r option: |
- |
- gsutil cp -r dir gs://my_bucket |
- |
- If you have a large number of files to upload you might want to use the |
- gsutil -m option, to perform a parallel (multi-threaded/multi-processing) |
- copy: |
- |
- gsutil -m cp -r dir gs://my_bucket |
- |
- You can pass a list of URLs (one per line) to copy on stdin instead of as |
- command line arguments by using the -I option. This allows you to use gsutil |
- in a pipeline to upload or download files / objects as generated by a program, |
- such as: |
- |
- some_program | gsutil -m cp -I gs://my_bucket |
- |
- or: |
- |
- some_program | gsutil -m cp -I ./download_dir |
- |
- The contents of stdin can name files, cloud URLs, and wildcards of files |
- and cloud URLs. |
-""" |
- |
-_NAME_CONSTRUCTION_TEXT = """ |
-<B>HOW NAMES ARE CONSTRUCTED</B> |
- The gsutil cp command strives to name objects in a way consistent with how |
- Linux cp works, which causes names to be constructed in varying ways depending |
- on whether you're performing a recursive directory copy or copying |
- individually named objects; and whether you're copying to an existing or |
- non-existent directory. |
- |
- When performing recursive directory copies, object names are constructed |
- that mirror the source directory structure starting at the point of |
- recursive processing. For example, the command: |
- |
- gsutil cp -r dir1/dir2 gs://my_bucket |
- |
- will create objects named like gs://my_bucket/dir2/a/b/c, assuming |
- dir1/dir2 contains the file a/b/c. |
- |
- In contrast, copying individually named files will result in objects named |
- by the final path component of the source files. For example, the command: |
- |
- gsutil cp dir1/dir2/** gs://my_bucket |
- |
- will create objects named like gs://my_bucket/c. |
- |
- The same rules apply for downloads: recursive copies of buckets and |
- bucket subdirectories produce a mirrored filename structure, while copying |
- individually (or wildcard) named objects produces flatly named files.
- |
- Note that in the above example the '**' wildcard matches all names
- anywhere under dir1/dir2. The wildcard '*' matches names just one level
- deep. For more details see 'gsutil help wildcards'.
- |
- There's an additional wrinkle when working with subdirectories: the resulting |
- names depend on whether the destination subdirectory exists. For example, |
- if gs://my_bucket/subdir exists as a subdirectory, the command: |
- |
- gsutil cp -r dir1/dir2 gs://my_bucket/subdir |
- |
- will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast, |
- if gs://my_bucket/subdir does not exist, this same gsutil cp command will |
- create objects named like gs://my_bucket/subdir/a/b/c. |
- |
- Note: If you use the |
- `Google Developers Console <https://console.developers.google.com>`_ |
- to create folders, it does so by creating a "placeholder" object that ends |
- with a "/" character. gsutil skips these objects when downloading from the |
- cloud to the local file system, because attempting to create a file that |
- ends with a "/" is not allowed on Linux and MacOS. Because of this, it is |
- recommended that you not create objects that end with "/" (unless you don't |
- need to be able to download such objects using gsutil). |
-""" |
- |
-_SUBDIRECTORIES_TEXT = """ |
-<B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B> |
- You can use gsutil to copy to and from subdirectories by using a command |
- like: |
- |
- gsutil cp -r dir gs://my_bucket/data |
- |
- This will cause dir and all of its files and nested subdirectories to be |
- copied under the specified destination, resulting in objects with names like |
- gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket |
- subdirectories by using a command like: |
- |
- gsutil cp -r gs://my_bucket/data dir |
- |
- This will cause everything nested under gs://my_bucket/data to be downloaded |
- into dir, resulting in files with names like dir/data/a/b/c. |
- |
- Copying subdirectories is useful if you want to add data to an existing |
- bucket directory structure over time. It's also useful if you want |
- to parallelize uploads and downloads across multiple machines (often |
- reducing overall transfer time compared with simply running gsutil -m |
- cp on one machine). For example, if your bucket contains this structure: |
- |
- gs://my_bucket/data/result_set_01/ |
- gs://my_bucket/data/result_set_02/ |
- ... |
- gs://my_bucket/data/result_set_99/ |
- |
- you could perform concurrent downloads across 3 machines by running these |
- commands on each machine, respectively: |
- |
- gsutil -m cp -r gs://my_bucket/data/result_set_[0-3]* dir |
- gsutil -m cp -r gs://my_bucket/data/result_set_[4-6]* dir |
- gsutil -m cp -r gs://my_bucket/data/result_set_[7-9]* dir |
- |
- Note that dir could be a local directory on each machine, or it could |
- be a directory mounted off of a shared file server; whether the latter |
- performs acceptably may depend on a number of things, so we recommend |
- you experiment and find out what works best for you. |
-""" |
- |
-_COPY_IN_CLOUD_TEXT = """ |
-<B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B> |
- If both the source and destination URL are cloud URLs from the same |
- provider, gsutil copies data "in the cloud" (i.e., without downloading |
- to and uploading from the machine where you run gsutil). In addition to |
- the performance and cost advantages of doing this, copying in the cloud |
- preserves metadata (like Content-Type and Cache-Control). In contrast, |
- when you download data from the cloud it ends up in a file, which has |
- no associated metadata. Thus, unless you have some way to hold on to |
- or re-create that metadata, downloading to a file will not retain the |
- metadata. |
- |
- Copies spanning locations and/or storage classes cause data to be rewritten |
- in the cloud, which may take some time. Such operations can be resumed with |
- the same command if they are interrupted, so long as the command parameters |
- are identical. |
- |
- Note that by default, the gsutil cp command does not copy the object |
- ACL to the new object, and instead will use the default bucket ACL (see |
- "gsutil help defacl"). You can override this behavior with the -p |
- option (see OPTIONS below). |
- |
- One additional note about copying in the cloud: If the destination bucket has |
- versioning enabled, gsutil cp will copy all versions of the source object(s). |
- For example: |
- |
- gsutil cp gs://bucket1/obj gs://bucket2 |
- |
- will cause all versions of gs://bucket1/obj to be copied to gs://bucket2. |
-""" |
- |
-_CHECKSUM_VALIDATION_TEXT = """ |
-<B>CHECKSUM VALIDATION</B> |
- At the end of every upload or download the gsutil cp command validates that |
- the checksum it computes for the source file/object matches the checksum |
- the service computes. If the checksums do not match, gsutil will delete the |
- corrupted object and print a warning message. This very rarely happens, but |
- if it does, please contact gs-team@google.com. |
- |
- If you know the MD5 of a file before uploading you can specify it in the |
- Content-MD5 header, which will cause the cloud storage service to reject the |
- upload if the MD5 doesn't match the value computed by the service. For |
- example: |
- |
- % gsutil hash obj |
- Hashing obj: |
- Hashes [base64] for obj: |
- Hash (crc32c): lIMoIw== |
- Hash (md5): VgyllJgiiaRAbyUUIqDMmw== |
- |
- % gsutil -h Content-MD5:VgyllJgiiaRAbyUUIqDMmw== cp obj gs://your-bucket/obj |
- Copying file://obj [Content-Type=text/plain]... |
- Uploading gs://your-bucket/obj: 182 b/182 B |
- |
- If the checksum didn't match, the service would instead reject the upload and
- gsutil would print a message like: |
- |
- BadRequestException: 400 Provided MD5 hash "VgyllJgiiaRAbyUUIqDMmw==" |
- doesn't match calculated MD5 hash "7gyllJgiiaRAbyUUIqDMmw==". |
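-
- If you compute the MD5 yourself rather than with gsutil hash, the value to
- pass in the Content-MD5 header is the base64 encoding of the binary MD5
- digest. A minimal Python sketch using only the standard library (the file
- name is just an example) looks like:
-
-   import base64
-   import hashlib
-
-   # Reads the whole file into memory; fine for a short illustration.
-   with open('obj', 'rb') as f:
-     md5_b64 = base64.b64encode(hashlib.md5(f.read()).digest())
-   # md5_b64 now matches the "Hash (md5)" value printed by gsutil hash.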
- |
- Even if you don't do this, gsutil will delete the object if the computed
- checksum doesn't match, but specifying the Content-MD5 header has three
- advantages:
- |
- 1. It prevents the corrupted object from becoming visible at all, whereas |
- otherwise it would be visible for 1-3 seconds before gsutil deletes it. |
- |
- 2. It will definitively prevent the corrupted object from being left in |
- the cloud, whereas the gsutil approach of deleting after the upload |
- completes could fail if (for example) the gsutil process gets ^C'd |
- between upload and deletion request. |
- |
- 3. It supports a customer-to-service integrity check handoff. For example, |
- if you have a content production pipeline that generates data to be |
- uploaded to the cloud along with checksums of that data, specifying the |
- MD5 computed by your content pipeline when you run gsutil cp will ensure |
- that the checksums match all the way through the process (e.g., detecting |
- if data gets corrupted on your local disk between the time it was written |
- by your content pipeline and the time it was uploaded to GCS). |
- |
- Note: The Content-MD5 header is ignored for composite objects, because such |
- objects only have a CRC32C checksum. |
-""" |
- |
-_RETRY_HANDLING_TEXT = """ |
-<B>RETRY HANDLING</B> |
- The cp command will retry when failures occur, but if enough failures happen |
- during a particular copy or delete operation the command will skip that object |
- and move on. At the end of the copy run if any failures were not successfully |
- retried, the cp command will report the count of failures, and exit with |
- non-zero status. |
- |
- Note that there are cases where retrying will never succeed, such as if you |
- don't have write permission to the destination bucket or if the destination |
- path for some objects is longer than the maximum allowed length. |
- |
- For more details about gsutil's retry handling, please see |
- "gsutil help retries". |
-""" |
- |
-_RESUMABLE_TRANSFERS_TEXT = """ |
-<B>RESUMABLE TRANSFERS</B> |
- gsutil automatically uses the Google Cloud Storage resumable upload feature |
- whenever you use the cp command to upload an object that is larger than 2 |
- MiB. You do not need to specify any special command line options to make this |
- happen. If your upload is interrupted you can restart the upload by running |
- the same cp command that you ran to start the upload. Until the upload |
- has completed successfully, it will not be visible at the destination object |
- and will not replace any existing object the upload is intended to overwrite. |
- (However, see the section on PARALLEL COMPOSITE UPLOADS, which may leave |
- temporary component objects in place during the upload process.) |
- |
- Similarly, gsutil automatically performs resumable downloads (using HTTP |
- standard Range GET operations) whenever you use the cp command, unless the |
- destination is a stream or null. In this case the partially downloaded file |
- will be visible as soon as it starts being written. Thus, before you attempt |
- to use any files downloaded by gsutil you should make sure the download |
- completed successfully, by checking the exit status from the gsutil command. |
- This can be done in a bash script, for example, by doing: |
- |
- gsutil cp gs://your-bucket/your-object ./local-file |
- if [ "$status" -ne "0" ] ; then |
- << Code that handles failures >> |
- fi |
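-
- If you drive gsutil from another program instead of bash, the same check
- can be sketched in Python with the standard subprocess module (the URLs
- and paths below are placeholders):
-
-   import subprocess
-
-   ret = subprocess.call(['gsutil', 'cp', 'gs://your-bucket/your-object',
-                          './local-file'])
-   if ret != 0:
-     # Handle the failed download here (for example, retry or alert).
-     pass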
- |
- Resumable uploads and downloads store some state information in a file |
- in ~/.gsutil named by the destination object or file. If you attempt to |
- resume a transfer using a different ~/.gsutil directory (for example, from
- a different machine), the transfer will start over from scratch.
- |
- See also "gsutil help prod" for details on using resumable transfers |
- in production. |
-""" |
- |
-_STREAMING_TRANSFERS_TEXT = """ |
-<B>STREAMING TRANSFERS</B> |
- Use '-' in place of src_url or dst_url to perform a streaming |
- transfer. For example: |
- |
- long_running_computation | gsutil cp - gs://my_bucket/obj |
- |
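- As an illustrative sketch (not gsutil functionality per se), a streaming
- upload like the one above can also be driven from Python with the standard
- subprocess module; the bucket and object names are placeholders:
-
-   import subprocess
-
-   proc = subprocess.Popen(['gsutil', 'cp', '-', 'gs://my_bucket/obj'],
-                           stdin=subprocess.PIPE)
-   proc.communicate(b'data produced by a long-running computation\n')
-   if proc.returncode != 0:
-     # Handle the failed streaming upload here.
-     pass
-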
- Streaming uploads using the JSON API (see "gsutil help apis") are buffered in |
- memory and can retry in the event of network flakiness or service errors. |
- |
- Streaming transfers (other than uploads using the JSON API) do not support |
- resumable uploads/downloads. If you have a large amount of data to upload |
- (say, more than 100 MiB) it is recommended to write the data to a local file |
- and then copy that file to the cloud rather than streaming it (and similarly |
- for large downloads). |
- |
- WARNING: When performing streaming transfers gsutil does not compute a |
- checksum of the uploaded or downloaded data. Therefore, we recommend that |
- users either perform their own validation of the data or use non-streaming |
- transfers (which perform integrity checking automatically). |
-""" |
- |
-_PARALLEL_COMPOSITE_UPLOADS_TEXT = """ |
-<B>PARALLEL COMPOSITE UPLOADS</B> |
- gsutil can automatically use |
- `object composition <https://developers.google.com/storage/docs/composite-objects>`_ |
- to perform uploads in parallel for large, local files being uploaded to Google |
- Cloud Storage. This means that, if enabled (see next paragraph), a large file |
- will be split into component pieces that will be uploaded in parallel. Those |
- components will then be composed in the cloud, and the temporary components in |
- the cloud will be deleted after successful composition. No additional local |
- disk space is required for this operation. |
- |
- Using parallel composite uploads presents a tradeoff between upload |
- performance and download configuration: If you enable parallel composite |
- uploads your uploads will run faster, but someone will need to install a |
- compiled crcmod (see "gsutil help crcmod") on every machine where objects are |
- downloaded by gsutil or other Python applications. For some distributions this |
- is easy (e.g., it comes pre-installed on MacOS), but in some cases users have |
- found it difficult. Because of this, parallel composite uploads are currently
- disabled by default. Google is actively working with a number of the Linux |
- distributions to get crcmod included with the stock distribution. Once that is |
- done we will re-enable parallel composite uploads by default in gsutil. |
- |
- To try parallel composite uploads you can run the command: |
- |
- gsutil -o GSUtil:parallel_composite_upload_threshold=150M cp bigfile gs://your-bucket |
- |
- where bigfile is larger than 150 MiB. When you do this notice that the upload |
- progress indicator continuously updates for several different uploads at once |
- (corresponding to each of the sections of the file being uploaded in |
- parallel), until the parallel upload completes. If you then want to enable |
- parallel composite uploads for all of your future uploads (notwithstanding the |
- caveats mentioned earlier), you can uncomment and set the |
- "parallel_composite_upload_threshold" config value in your .boto configuration |
- file to this value. |
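-
- For example, the corresponding entry in the [GSUtil] section of your .boto
- file would look like this sketch:
-
-   [GSUtil]
-   parallel_composite_upload_threshold = 150M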
- |
- Note that the crcmod problem only impacts downloads via Python applications |
- (such as gsutil). If any users who need to download the data using gsutil or |
- other Python applications can install crcmod, it makes sense to enable |
- parallel composite uploads (see above). For example, if you use gsutil to |
- upload video assets and those assets will only ever be served via a Java |
- application (there are efficient crc32c implementations available in Java), it |
- would make sense to enable parallel composite uploads on your machine. |
- |
- If a parallel composite upload fails prior to composition, re-running the |
- gsutil command will take advantage of resumable uploads for those components |
- that failed, and the component objects will be deleted after the first |
- successful attempt. Any temporary objects that were uploaded successfully |
- before gsutil failed will still exist until the upload is completed |
- successfully. The temporary objects will be named in the following fashion: |
- |
- <random ID>%s<hash> |
- |
- where <random ID> is some numerical value, and <hash> is an MD5 hash (not |
- related to the hash of the contents of the file or object). |
- |
- To avoid leaving temporary objects around, you should make sure to check the |
- exit status from the gsutil command. This can be done in a bash script, for |
- example, by doing: |
- |
- gsutil cp ./local-file gs://your-bucket/your-object |
- if [ "$status" -ne "0" ] ; then |
- << Code that handles failures >> |
- fi |
- |
- Or, for copying a directory, use this instead: |
- |
- gsutil cp -c -L cp.log -r ./dir gs://bucket |
- if [ "$status" -ne "0" ] ; then |
- << Code that handles failures >> |
- fi |
- |
- One important caveat is that files uploaded in this fashion are still subject |
- to the maximum number of components limit. For example, if you upload a large |
- file that gets split into %d components, and try to compose it with another |
- object with %d components, the operation will fail because it exceeds the %d |
- component limit. If you wish to compose an object later and the component |
- limit is a concern, it is recommended that you disable parallel composite |
- uploads for that transfer. |
- |
- Also note that an object uploaded using this feature will have a CRC32C hash, |
- but it will not have an MD5 hash (and because of that, requires users who |
- download the object to have crcmod installed, as noted earlier). For details |
- see 'gsutil help crc32c'. |
- |
- Note that this feature can be completely disabled by setting the |
- "parallel_composite_upload_threshold" variable in the .boto config file to 0. |
-""" % (PARALLEL_UPLOAD_TEMP_NAMESPACE, 10, MAX_COMPONENT_COUNT - 9, |
- MAX_COMPONENT_COUNT) |
- |
- |
-_CHANGING_TEMP_DIRECTORIES_TEXT = """ |
-<B>CHANGING TEMP DIRECTORIES</B> |
- gsutil writes data to a temporary directory in several cases: |
- |
- - when compressing data to be uploaded (see the -z option) |
- - when decompressing data being downloaded (when the data has |
- Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z) |
- - when running integration tests (using the gsutil test command) |
- |
- In these cases it's possible the temp file location on your system that |
- gsutil selects by default may not have enough space. If you find that |
- gsutil runs out of space during one of these operations (e.g., raising |
- "CommandException: Inadequate temp space available to compress <your file>" |
- during a gsutil cp -z operation), you can change where it writes these |
- temp files by setting the TMPDIR environment variable. On Linux and MacOS |
- you can do this either by running gsutil this way: |
- |
- TMPDIR=/some/directory gsutil cp ... |
- |
- or by adding this line to your ~/.bashrc file and then restarting the shell |
- before running gsutil: |
- |
- export TMPDIR=/some/directory |
- |
- On Windows 7 you can change the TMPDIR environment variable from Start -> |
- Computer -> System -> Advanced System Settings -> Environment Variables. |
- You need to reboot after making this change for it to take effect. (Rebooting |
- is not necessary after running the export command on Linux and MacOS.) |
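-
- If you launch gsutil from another program, you can instead set TMPDIR just
- for the child process. A minimal Python sketch using the standard library
- (the directory, file, and bucket names are placeholders):
-
-   import os
-   import subprocess
-
-   env = dict(os.environ, TMPDIR='/some/directory')
-   subprocess.call(['gsutil', 'cp', '-z', 'html', 'page.html',
-                    'gs://my_bucket'], env=env)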
-""" |
- |
-_OPTIONS_TEXT = """ |
-<B>OPTIONS</B> |
- -a canned_acl Sets the named canned_acl when uploaded objects are created. See
- 'gsutil help acls' for further details. |
- |
- -c If an error occurs, continue to attempt to copy the remaining |
- files. If any copies were unsuccessful, gsutil's exit status |
- will be non-zero even if this flag is set. This option is |
- implicitly set when running "gsutil -m cp...". Note: -c only |
- applies to the actual copying operation. If an error occurs |
- while iterating over the files in the local directory (e.g., |
- invalid Unicode file name) gsutil will print an error message |
- and abort. |
- |
- -D Copy in "daisy chain" mode, i.e., copying between two buckets |
- by hooking a download to an upload, via the machine where |
- gsutil is run. By default, data are copied between two buckets |
- "in the cloud", i.e., without needing to copy via the machine |
- where gsutil runs. |
- |
- By default, a "copy in the cloud" when the source is a |
- composite object will retain the composite nature of the |
- object. However, Daisy chain mode can be used to change a |
- composite object into a non-composite object. For example: |
- |
- gsutil cp -D -p gs://bucket/obj gs://bucket/obj_tmp |
- gsutil mv -p gs://bucket/obj_tmp gs://bucket/obj |
- |
- Note: Daisy chain mode is automatically used when copying |
- between providers (e.g., to copy data from Google Cloud Storage |
- to another provider). |
- |
- -e Exclude symlinks. When specified, symbolic links will not be |
- copied. |
- |
- -I Causes gsutil to read the list of files or objects to copy from |
- stdin. This allows you to run a program that generates the list |
- of files to upload/download. |
- |
- -L <file> Outputs a manifest log file with detailed information about |
- each item that was copied. This manifest contains the following |
- information for each item: |
- |
- - Source path. |
- - Destination path. |
- - Source size. |
- - Bytes transferred. |
- - MD5 hash. |
- - UTC date and time transfer was started in ISO 8601 format. |
- - UTC date and time transfer was completed in ISO 8601 format. |
- - Upload id, if a resumable upload was performed. |
- - Final result of the attempted transfer, success or failure. |
- - Failure details, if any. |
- |
- If the log file already exists, gsutil will use the file as an |
- input to the copy process, and will also append log items to |
- the existing file. Files/objects that are marked in the |
- existing log file as having been successfully copied (or |
- skipped) will be ignored. Files/objects without entries will be |
- copied and ones previously marked as unsuccessful will be |
- retried. This can be used in conjunction with the -c option to |
- build a script that copies a large number of objects reliably, |
- using a bash script like the following: |
- |
- until gsutil cp -c -L cp.log -r ./dir gs://bucket; do |
- sleep 1 |
- done |
- |
- The -c option will cause copying to continue after failures |
- occur, and the -L option will allow gsutil to pick up where it |
- left off without duplicating work. The loop will continue |
- running as long as gsutil exits with a non-zero status (such a |
- status indicates there was at least one failure during the |
- gsutil run). |
- |
- Note: If you're trying to synchronize the contents of a |
- directory and a bucket (or two buckets), see |
- 'gsutil help rsync'. |
- |
- -n No-clobber. When specified, existing files or objects at the |
- destination will not be overwritten. Any items that are skipped |
- by this option will be reported as being skipped. This option |
- will perform an additional GET request to check if an item |
- exists before attempting to upload the data. This will save |
- retransmitting data, but the additional HTTP requests may make |
- small object transfers slower and more expensive. |
- |
- -p Causes ACLs to be preserved when copying in the cloud. Note |
- that this option has performance and cost implications when |
- using the XML API, as it requires separate HTTP calls for |
- interacting with ACLs. The performance issue can be mitigated |
- to some degree by using gsutil -m cp to cause parallel copying. |
- Also, this option only works if you have OWNER access to all of |
- the objects that are copied. |
- |
- You can avoid the additional performance and cost of using |
- cp -p if you want all objects in the destination bucket to end |
- up with the same ACL by setting a default object ACL on that |
- bucket instead of using cp -p. See "gsutil help defacl".
- |
- Note that it's not valid to specify both the -a and -p options |
- together. |
- |
- -R, -r Causes directories, buckets, and bucket subdirectories to be |
- copied recursively. If you neglect to use this option for |
- an upload, gsutil will copy any files it finds and skip any |
- directories. Similarly, neglecting to specify -r for a download |
- will cause gsutil to copy any objects at the current bucket |
- directory level, and skip any subdirectories. |
- |
- -U Skip objects with unsupported object types instead of failing. |
- Unsupported object types are S3 Glacier objects.
- |
- -v Requests that the version-specific URL for each uploaded object |
- be printed. Given this URL you can make future upload requests |
- that are safe in the face of concurrent updates, because Google |
- Cloud Storage will refuse to perform the update if the current |
- object version doesn't match the version-specific URL. See |
- 'gsutil help versions' for more details. |
- |
- -z <ext,...> Applies gzip content-encoding to file uploads with the given |
- extensions. This is useful when uploading files with |
- compressible content (such as .js, .css, or .html files) |
- because it saves network bandwidth and space in Google Cloud |
- Storage, which in turn reduces storage costs. |
- |
- When you specify the -z option, the data from your files is |
- compressed before it is uploaded, but your actual files are |
- left uncompressed on the local disk. The uploaded objects |
- retain the Content-Type and name of the original files but are |
- given a Content-Encoding header with the value "gzip" to |
- indicate that the object data are stored in compressed form
- on the Google Cloud Storage servers.
- |
- For example, the following command: |
- |
- gsutil cp -z html -a public-read cattypes.html gs://mycats |
- |
- will do all of the following: |
- |
- - Upload as the object gs://mycats/cattypes.html (cp command) |
- - Set the Content-Type to text/html (based on file extension) |
- - Compress the data in the file cattypes.html (-z option) |
- - Set the Content-Encoding to gzip (-z option) |
- - Set the ACL to public-read (-a option) |
- - If a user tries to view cattypes.html in a browser, the |
- browser will know to uncompress the data based on the |
- Content-Encoding header, and to render it as HTML based on |
- the Content-Type header. |
- |
- Note that if you download an object with Content-Encoding:gzip |
- gsutil will decompress the content before writing the local |
- file. |
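-
- Conceptually, the compression step that -z performs is equivalent to the
- following Python sketch (gsutil actually compresses to a temporary file
- and sets the Content-Encoding header on the upload for you):
-
-   import gzip
-   import shutil
-
-   with open('cattypes.html', 'rb') as f_in:
-     with gzip.open('cattypes.html.gz', 'wb') as f_out:
-       shutil.copyfileobj(f_in, f_out)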
-""" |
- |
-_DETAILED_HELP_TEXT = '\n\n'.join([_SYNOPSIS_TEXT, |
- _DESCRIPTION_TEXT, |
- _NAME_CONSTRUCTION_TEXT, |
- _SUBDIRECTORIES_TEXT, |
- _COPY_IN_CLOUD_TEXT, |
- _CHECKSUM_VALIDATION_TEXT, |
- _RETRY_HANDLING_TEXT, |
- _RESUMABLE_TRANSFERS_TEXT, |
- _STREAMING_TRANSFERS_TEXT, |
- _PARALLEL_COMPOSITE_UPLOADS_TEXT, |
- _CHANGING_TEMP_DIRECTORIES_TEXT, |
- _OPTIONS_TEXT]) |
- |
- |
-CP_SUB_ARGS = 'a:cDeIL:MNnprRtUvz:' |
- |
- |
-def _CopyFuncWrapper(cls, args, thread_state=None): |
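- """Calls CopyFunc on the passed-in CpCommand instance (used via self.Apply)."""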
- cls.CopyFunc(args, thread_state=thread_state) |
- |
- |
-def _CopyExceptionHandler(cls, e): |
- """Simple exception handler to allow post-completion status.""" |
- cls.logger.error(str(e)) |
- cls.op_failure_count += 1 |
- cls.logger.debug('\n\nEncountered exception while copying:\n%s\n', |
- traceback.format_exc()) |
- |
- |
-def _RmExceptionHandler(cls, e): |
- """Simple exception handler to allow post-completion status.""" |
- cls.logger.error(str(e)) |
- |
- |
-class CpCommand(Command): |
- """Implementation of gsutil cp command. |
- |
- Note that CpCommand is run for both gsutil cp and gsutil mv. The latter |
- happens by MvCommand calling CpCommand and passing the hidden (undocumented) |
- -M option. This allows the copy and remove needed for each mv to run |
- together (rather than first running all the cp's and then all the rm's, as |
- we originally had implemented), which in turn avoids the following problem |
- with removing the wrong objects: starting with a bucket containing only |
- the object gs://bucket/obj, say the user does: |
- gsutil mv gs://bucket/* gs://bucket/d.txt |
- If we ran all the cp's and then all the rm's and we didn't expand the wildcard |
- first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt, |
- and the rm command would then remove that object. In the implementation |
- prior to gsutil release 3.12 we avoided this by building a list of objects |
- to process and then running the copies and then the removes; but building |
- the list up front limits scalability (compared with the current approach |
- of processing the bucket listing iterator on the fly). |
- """ |
- |
- # Command specification. See base class for documentation. |
- command_spec = Command.CreateCommandSpec( |
- 'cp', |
- command_name_aliases=['copy'], |
- usage_synopsis=_SYNOPSIS, |
- min_args=1, |
- max_args=NO_MAX, |
- # -t is deprecated but leave intact for now to avoid breakage. |
- supported_sub_args=CP_SUB_ARGS, |
- file_url_ok=True, |
- provider_url_ok=False, |
- urls_start_arg=0, |
- gs_api_support=[ApiSelector.XML, ApiSelector.JSON], |
- gs_default_api=ApiSelector.JSON, |
- supported_private_args=['testcallbackfile='], |
- argparse_arguments=[ |
- CommandArgument.MakeZeroOrMoreCloudOrFileURLsArgument() |
- ] |
- ) |
- # Help specification. See help_provider.py for documentation. |
- help_spec = Command.HelpSpec( |
- help_name='cp', |
- help_name_aliases=['copy'], |
- help_type='command_help', |
- help_one_line_summary='Copy files and objects', |
- help_text=_DETAILED_HELP_TEXT, |
- subcommand_help_text={}, |
- ) |
- |
- # pylint: disable=too-many-statements |
- def CopyFunc(self, name_expansion_result, thread_state=None): |
- """Worker function for performing the actual copy (and rm, for mv).""" |
- gsutil_api = GetCloudApiInstance(self, thread_state=thread_state) |
- |
- copy_helper_opts = copy_helper.GetCopyHelperOpts() |
- if copy_helper_opts.perform_mv: |
- cmd_name = 'mv' |
- else: |
- cmd_name = self.command_name |
- src_url = name_expansion_result.source_storage_url |
- exp_src_url = name_expansion_result.expanded_storage_url |
- src_url_names_container = name_expansion_result.names_container |
- have_multiple_srcs = name_expansion_result.is_multi_source_request |
- |
- if src_url.IsCloudUrl() and src_url.IsProvider(): |
- raise CommandException( |
- 'The %s command does not allow provider-only source URLs (%s)' % |
- (cmd_name, src_url)) |
- if have_multiple_srcs: |
- copy_helper.InsistDstUrlNamesContainer( |
- self.exp_dst_url, self.have_existing_dst_container, cmd_name) |
- |
- # Various GUI tools (like the GCS web console) create placeholder objects |
- # ending with '/' when the user creates an empty directory. Normally these |
- # tools should delete those placeholders once objects have been written |
- # "under" the directory, but sometimes the placeholders are left around. We |
- # need to filter them out here, otherwise if the user tries to rsync from |
- # GCS to a local directory it will result in a directory/file conflict |
- # (e.g., trying to download an object called "mydata/" where the local |
- # directory "mydata" exists). |
- if IsCloudSubdirPlaceholder(exp_src_url): |
- self.logger.info('Skipping cloud sub-directory placeholder object (%s) ' |
- 'because such objects aren\'t needed in (and would ' |
- 'interfere with) directories in the local file system', |
- exp_src_url) |
- return |
- |
- if copy_helper_opts.use_manifest and self.manifest.WasSuccessful( |
- exp_src_url.url_string): |
- return |
- |
- if copy_helper_opts.perform_mv: |
- if name_expansion_result.names_container: |
- # Use recursion_requested when performing name expansion for the |
- # directory mv case so we can determine if any of the source URLs are |
- # directories (and then use cp -r and rm -r to perform the move, to |
- # match the behavior of Linux mv, which when moving a directory moves
- # all the contained files).
- self.recursion_requested = True |
- # Disallow wildcard src URLs when moving directories, as supporting it |
- # would make the name transformation too complex and would also be |
- # dangerous (e.g., someone could accidentally move many objects to the |
- # wrong name, or accidentally overwrite many objects). |
- if ContainsWildcard(src_url.url_string): |
- raise CommandException('The mv command disallows naming source ' |
- 'directories using wildcards') |
- |
- if (self.exp_dst_url.IsFileUrl() |
- and not os.path.exists(self.exp_dst_url.object_name) |
- and have_multiple_srcs): |
- os.makedirs(self.exp_dst_url.object_name) |
- |
- dst_url = copy_helper.ConstructDstUrl( |
- src_url, exp_src_url, src_url_names_container, have_multiple_srcs, |
- self.exp_dst_url, self.have_existing_dst_container, |
- self.recursion_requested) |
- dst_url = copy_helper.FixWindowsNaming(src_url, dst_url) |
- |
- copy_helper.CheckForDirFileConflict(exp_src_url, dst_url) |
- if copy_helper.SrcDstSame(exp_src_url, dst_url): |
- raise CommandException('%s: "%s" and "%s" are the same file - ' |
- 'abort.' % (cmd_name, exp_src_url, dst_url)) |
- |
- if dst_url.IsCloudUrl() and dst_url.HasGeneration(): |
- raise CommandException('%s: a version-specific URL\n(%s)\ncannot be ' |
- 'the destination for gsutil cp - abort.' |
- % (cmd_name, dst_url)) |
- |
- elapsed_time = bytes_transferred = 0 |
- try: |
- if copy_helper_opts.use_manifest: |
- self.manifest.Initialize( |
- exp_src_url.url_string, dst_url.url_string) |
- (elapsed_time, bytes_transferred, result_url, md5) = ( |
- copy_helper.PerformCopy( |
- self.logger, exp_src_url, dst_url, gsutil_api, |
- self, _CopyExceptionHandler, allow_splitting=True, |
- headers=self.headers, manifest=self.manifest, |
- gzip_exts=self.gzip_exts, test_method=self.test_method)) |
- if copy_helper_opts.use_manifest: |
- if md5: |
- self.manifest.Set(exp_src_url.url_string, 'md5', md5) |
- self.manifest.SetResult( |
- exp_src_url.url_string, bytes_transferred, 'OK') |
- if copy_helper_opts.print_ver: |
- # Some cases don't return a version-specific URL (e.g., if destination |
- # is a file). |
- self.logger.info('Created: %s', result_url) |
- except ItemExistsError: |
- message = 'Skipping existing item: %s' % dst_url |
- self.logger.info(message) |
- if copy_helper_opts.use_manifest: |
- self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message) |
- except SkipUnsupportedObjectError, e: |
- message = ('Skipping item %s with unsupported object type %s' % |
- (exp_src_url.url_string, e.unsupported_type)) |
- self.logger.info(message) |
- if copy_helper_opts.use_manifest: |
- self.manifest.SetResult(exp_src_url.url_string, 0, 'skip', message) |
- except copy_helper.FileConcurrencySkipError, e: |
- self.logger.warn('Skipping copy of source URL %s because destination URL ' |
- '%s is already being copied by another gsutil process ' |
- 'or thread (did you specify the same source URL twice?) ' |
- % (src_url, dst_url)) |
- except Exception, e: |
- if (copy_helper_opts.no_clobber and |
- copy_helper.IsNoClobberServerException(e)): |
- message = 'Rejected (noclobber): %s' % dst_url |
- self.logger.info(message) |
- if copy_helper_opts.use_manifest: |
- self.manifest.SetResult( |
- exp_src_url.url_string, 0, 'skip', message) |
- elif self.continue_on_error: |
- message = 'Error copying %s: %s' % (src_url, str(e)) |
- self.op_failure_count += 1 |
- self.logger.error(message) |
- if copy_helper_opts.use_manifest: |
- self.manifest.SetResult( |
- exp_src_url.url_string, 0, 'error', |
- RemoveCRLFFromString(message)) |
- else: |
- if copy_helper_opts.use_manifest: |
- self.manifest.SetResult( |
- exp_src_url.url_string, 0, 'error', str(e)) |
- raise |
- else: |
- if copy_helper_opts.perform_mv: |
- self.logger.info('Removing %s...', exp_src_url) |
- if exp_src_url.IsCloudUrl(): |
- gsutil_api.DeleteObject(exp_src_url.bucket_name, |
- exp_src_url.object_name, |
- generation=exp_src_url.generation, |
- provider=exp_src_url.scheme) |
- else: |
- os.unlink(exp_src_url.object_name) |
- |
- with self.stats_lock: |
- self.total_elapsed_time += elapsed_time |
- self.total_bytes_transferred += bytes_transferred |
- |
- # Command entry point. |
- def RunCommand(self): |
- copy_helper_opts = self._ParseOpts() |
- |
- self.total_elapsed_time = self.total_bytes_transferred = 0 |
- if self.args[-1] == '-' or self.args[-1] == 'file://-': |
- return CatHelper(self).CatUrlStrings(self.args[:-1]) |
- |
- if copy_helper_opts.read_args_from_stdin: |
- if len(self.args) != 1: |
- raise CommandException('Source URLs cannot be specified with -I option') |
- url_strs = StdinIterator() |
- else: |
- if len(self.args) < 2: |
- raise CommandException('Wrong number of arguments for "cp" command.') |
- url_strs = self.args[:-1] |
- |
- (self.exp_dst_url, self.have_existing_dst_container) = ( |
- copy_helper.ExpandUrlToSingleBlr(self.args[-1], self.gsutil_api, |
- self.debug, self.project_id)) |
- |
- # If the destination bucket has versioning enabled iterate with |
- # all_versions=True. That way we'll copy all versions if the source bucket |
- # is versioned; and by leaving all_versions=False if the destination bucket |
- # has versioning disabled we will avoid copying old versions all to the same |
- # un-versioned destination object. |
- all_versions = False |
- try: |
- bucket = self._GetBucketWithVersioningConfig(self.exp_dst_url) |
- if bucket and bucket.versioning and bucket.versioning.enabled: |
- all_versions = True |
- except AccessDeniedException: |
- # This happens (in the XML API only) if the user doesn't have OWNER access |
- # on the bucket (needed to check if versioning is enabled). In this case |
- # fall back to copying all versions (which can be inefficient for the |
- # reason noted in the comment above). We don't try to warn the user |
- # because that would result in false positive warnings (since we can't |
- # check if versioning is enabled on the destination bucket). |
- # |
- # For JSON, we will silently not return versioning if we don't have |
- # access. |
- all_versions = True |
- |
- name_expansion_iterator = NameExpansionIterator( |
- self.command_name, self.debug, |
- self.logger, self.gsutil_api, url_strs, |
- self.recursion_requested or copy_helper_opts.perform_mv, |
- project_id=self.project_id, all_versions=all_versions, |
- continue_on_error=self.continue_on_error or self.parallel_operations) |
- |
- # Use a lock to ensure accurate statistics in the face of |
- # multi-threading/multi-processing. |
- self.stats_lock = CreateLock() |
- |
- # Tracks if any copies failed. |
- self.op_failure_count = 0 |
- |
- # Start the clock. |
- start_time = time.time() |
- |
- # Tuple of attributes to share/manage across multiple processes in |
- # parallel (-m) mode. |
- shared_attrs = ('op_failure_count', 'total_bytes_transferred') |
- |
- # Perform copy requests in parallel (-m) mode, if requested, using |
- # configured number of parallel processes and threads. Otherwise, |
- # perform requests with sequential function calls in current process. |
- self.Apply(_CopyFuncWrapper, name_expansion_iterator, |
- _CopyExceptionHandler, shared_attrs, |
- fail_on_error=(not self.continue_on_error)) |
- self.logger.debug( |
- 'total_bytes_transferred: %d', self.total_bytes_transferred) |
- |
- end_time = time.time() |
- self.total_elapsed_time = end_time - start_time |
- |
- # Sometimes, particularly when running unit tests, the total elapsed time |
- # is really small. On Windows, the timer resolution is too small and |
- # causes total_elapsed_time to be zero. |
- try: |
- float(self.total_bytes_transferred) / float(self.total_elapsed_time) |
- except ZeroDivisionError: |
- self.total_elapsed_time = 0.01 |
- |
- self.total_bytes_per_second = (float(self.total_bytes_transferred) / |
- float(self.total_elapsed_time)) |
- |
- if self.debug == 3: |
- # Note that this only counts the actual GET and PUT bytes for the copy |
- # - not any transfers for doing wildcard expansion, the initial |
- # HEAD/GET request performed to get the object metadata, etc. |
- if self.total_bytes_transferred != 0: |
- self.logger.info( |
- 'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)', |
- self.total_bytes_transferred, self.total_elapsed_time, |
- MakeHumanReadable(self.total_bytes_per_second)) |
- if self.op_failure_count: |
- plural_str = 's' if self.op_failure_count > 1 else ''
- raise CommandException('%d file%s/object%s could not be transferred.' % ( |
- self.op_failure_count, plural_str, plural_str)) |
- |
- return 0 |
- |
- def _ParseOpts(self): |
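- """Parses cp sub-options and returns the object built by CreateCopyHelperOpts."""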
- perform_mv = False |
- # exclude_symlinks is handled by Command parent class, so save in Command |
- # state rather than CopyHelperOpts. |
- self.exclude_symlinks = False |
- no_clobber = False |
- # continue_on_error is handled by Command parent class, so save in Command |
- # state rather than CopyHelperOpts. |
- self.continue_on_error = False |
- daisy_chain = False |
- read_args_from_stdin = False |
- print_ver = False |
- use_manifest = False |
- preserve_acl = False |
- canned_acl = None |
- # canned_acl is handled by a helper function in parent |
- # Command class, so save in Command state rather than CopyHelperOpts. |
- self.canned = None |
- |
- self.skip_unsupported_objects = False |
- |
- # Files matching these extensions should be gzipped before uploading. |
- self.gzip_exts = [] |
- |
- test_callback_file = None |
- |
- # self.recursion_requested initialized in command.py (so can be checked |
- # in parent class for all commands). |
- self.manifest = None |
- if self.sub_opts: |
- for o, a in self.sub_opts: |
- if o == '-a': |
- canned_acl = a |
- self.canned = True |
- if o == '-c': |
- self.continue_on_error = True |
- elif o == '-D': |
- daisy_chain = True |
- elif o == '-e': |
- self.exclude_symlinks = True |
- elif o == '--testcallbackfile': |
- # File path of a pickled class that implements ProgressCallback.call. |
- # Used for testing transfer interruptions and resumes. |
- test_callback_file = a |
- elif o == '-I': |
- read_args_from_stdin = True |
- elif o == '-L': |
- use_manifest = True |
- self.manifest = Manifest(a) |
- elif o == '-M': |
- # Note that we signal to the cp command to perform a move (copy |
- # followed by remove) and use directory-move naming rules by passing |
- # the undocumented (for internal use) -M option when running the cp |
- # command from mv.py. |
- perform_mv = True |
- elif o == '-n': |
- no_clobber = True |
- elif o == '-p': |
- preserve_acl = True |
- elif o == '-r' or o == '-R': |
- self.recursion_requested = True |
- elif o == '-U': |
- self.skip_unsupported_objects = True |
- elif o == '-v': |
- print_ver = True |
- elif o == '-z': |
- self.gzip_exts = [x.strip() for x in a.split(',')] |
- if preserve_acl and canned_acl: |
- raise CommandException( |
- 'Specifying both the -p and -a options together is invalid.') |
- return CreateCopyHelperOpts( |
- perform_mv=perform_mv, |
- no_clobber=no_clobber, |
- daisy_chain=daisy_chain, |
- read_args_from_stdin=read_args_from_stdin, |
- print_ver=print_ver, |
- use_manifest=use_manifest, |
- preserve_acl=preserve_acl, |
- canned_acl=canned_acl, |
- skip_unsupported_objects=self.skip_unsupported_objects, |
- test_callback_file=test_callback_file) |
- |
- def _GetBucketWithVersioningConfig(self, exp_dst_url): |
- """Gets versioning config for a bucket and ensures that it exists. |
- |
- Args: |
- exp_dst_url: Wildcard-expanded destination StorageUrl. |
- |
- Raises: |
- AccessDeniedException: if there was a permissions problem accessing the |
- bucket or its versioning config. |
- CommandException: if URL refers to a cloud bucket that does not exist. |
- |
- Returns: |
- apitools Bucket with versioning configuration. |
- """ |
- bucket = None |
- if exp_dst_url.IsCloudUrl() and exp_dst_url.IsBucket(): |
- try: |
- bucket = self.gsutil_api.GetBucket( |
- exp_dst_url.bucket_name, provider=exp_dst_url.scheme, |
- fields=['versioning']) |
- except AccessDeniedException, e: |
- raise |
- except NotFoundException, e: |
- raise CommandException('Destination bucket %s does not exist.' % |
- exp_dst_url) |
- except Exception, e: |
- raise CommandException('Error retrieving destination bucket %s: %s' % |
- (exp_dst_url, e.message)) |
- return bucket |