OLD | NEW |
(Empty) | |
| 1 # Copyright 2011 Google Inc. |
| 2 # Copyright 2011, Nexenta Systems Inc. |
| 3 # |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 # you may not use this file except in compliance with the License. |
| 6 # You may obtain a copy of the License at |
| 7 # |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 # |
| 10 # Unless required by applicable law or agreed to in writing, software |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 # See the License for the specific language governing permissions and |
| 14 # limitations under the License. |
| 15 |
| 16 import boto |
| 17 import errno |
| 18 import gzip |
| 19 import hashlib |
| 20 import mimetypes |
| 21 import os |
| 22 import platform |
| 23 import re |
| 24 import subprocess |
| 25 import sys |
| 26 import tempfile |
| 27 import threading |
| 28 import time |
| 29 |
| 30 from boto.gs.resumable_upload_handler import ResumableUploadHandler |
| 31 from boto import config |
| 32 from boto.s3.resumable_download_handler import ResumableDownloadHandler |
| 33 from boto.exception import GSResponseError |
| 34 from boto.exception import ResumableUploadException |
| 35 from gslib.command import Command |
| 36 from gslib.command import COMMAND_NAME |
| 37 from gslib.command import COMMAND_NAME_ALIASES |
| 38 from gslib.command import CONFIG_REQUIRED |
| 39 from gslib.command import FILE_URIS_OK |
| 40 from gslib.command import MAX_ARGS |
| 41 from gslib.command import MIN_ARGS |
| 42 from gslib.command import PROVIDER_URIS_OK |
| 43 from gslib.command import SUPPORTED_SUB_ARGS |
| 44 from gslib.command import URIS_START_ARG |
| 45 from gslib.exception import CommandException |
| 46 from gslib.help_provider import HELP_NAME |
| 47 from gslib.help_provider import HELP_NAME_ALIASES |
| 48 from gslib.help_provider import HELP_ONE_LINE_SUMMARY |
| 49 from gslib.help_provider import HELP_TEXT |
| 50 from gslib.help_provider import HelpType |
| 51 from gslib.help_provider import HELP_TYPE |
| 52 from gslib.name_expansion import NameExpansionIterator |
| 53 from gslib.util import MakeHumanReadable |
| 54 from gslib.util import NO_MAX |
| 55 from gslib.util import ONE_MB |
| 56 from gslib.wildcard_iterator import ContainsWildcard |
| 57 |
| 58 _detailed_help_text = (""" |
| 59 <B>SYNOPSIS</B> |
| 60 gsutil cp [-a canned_acl] [-e] [-p] [-q] [-z ext1,ext2,...] src_uri dst_uri |
| 61 - or - |
| 62 gsutil cp [-a canned_acl] [-e] [-p] [-q] [-R] [-z extensions] src_uri... dst_uri |
| 63 - or - |
| 64 gsutil cp [-a canned_acl] [-e] [-p] [-q] [-R] [-z extensions] -I dst_uri |
| 65 |
| 66 |
| 67 <B>DESCRIPTION</B> |
| 68 The gsutil cp command allows you to copy data between your local file |
| 69 system and the cloud, copy data within the cloud, and copy data between |
| 70 cloud storage providers. For example, to copy all text files from the |
| 71 local directory to a bucket you could do: |
| 72 |
| 73 gsutil cp *.txt gs://my_bucket |
| 74 |
| 75 Similarly, you can download text files from a bucket by doing: |
| 76 |
| 77 gsutil cp gs://my_bucket/*.txt . |
| 78 |
| 79 If you want to copy an entire directory tree you need to use the -R option: |
| 80 |
| 81 gsutil cp -R dir gs://my_bucket |
| 82 |
| 83 If you have a large number of files to upload you might want to use the |
| 84 gsutil -m option, to perform a parallel (multi-threaded/multi-processing) |
| 85 copy: |
| 86 |
| 87 gsutil -m cp -R dir gs://my_bucket |
| 88 |
| 89 You can pass a list of URIs to copy on STDIN instead of as command line |
| 90 arguments by using the -I option. This allows you to use gsutil in a |
| 91 pipeline to copy files and objects as generated by a program, such as: |
| 92 |
| 93 some_program | gsutil -m cp -I gs://my_bucket |
| 94 |
| 95 The contents of STDIN can name files, cloud URIs, and wildcards of files |
| 96 and cloud URIs. |
| 97 |
| 98 |
| 99 <B>HOW NAMES ARE CONSTRUCTED</B> |
| 100 The gsutil cp command strives to name objects in a way consistent with how |
| 101 Unix cp works, which causes names to be constructed in varying ways depending |
| 102 on whether you're performing a recursive directory copy or copying |
| 103 individually named objects; and whether you're copying to an existing or |
| 104 non-existent directory. |
| 105 |
| 106 When performing recursive directory copies, object names are constructed |
| 107 that mirror the source directory structure starting at the point of |
| 108 recursive processing. For example, the command: |
| 109 |
| 110 gsutil cp -R dir1/dir2 gs://my_bucket |
| 111 |
| 112 will create objects named like gs://my_bucket/dir2/a/b/c, assuming |
| 113 dir1/dir2 contains the file a/b/c. |
| 114 |
| 115 In contrast, copying individually named files will result in objects named |
| 116 by the final path component of the source files. For example, the command: |
| 117 |
| 118 gsutil cp dir1/dir2/** gs://my_bucket |
| 119 |
| 120 will create objects named like gs://my_bucket/c. |
| 121 |
| 122 The same rules apply for downloads: recursive copies of buckets and |
| 123 bucket subdirectories produce a mirrored filename structure, while copying |
| 124 individually (or wildcard) named objects produces flatly named files. |
| 125 |
| 126 Note that in the above example the '**' wildcard matches all names |
| 127 anywhere under dir1/dir2. The wildcard '*' will match names just one level deep. For |
| 128 more details see 'gsutil help wildcards'. |
| 129 |
| 130 There's an additional wrinkle when working with subdirectories: the resulting |
| 131 names depend on whether the destination subdirectory exists. For example, |
| 132 if gs://my_bucket/subdir exists as a subdirectory, the command: |
| 133 |
| 134 gsutil cp -R dir1/dir2 gs://my_bucket/subdir |
| 135 |
| 136 will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast, |
| 137 if gs://my_bucket/subdir does not exist, this same gsutil cp command will |
| 138 create objects named like gs://my_bucket/subdir/a/b/c. |
| 139 |
| 140 |
| 141 <B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B> |
| 142 You can use gsutil to copy to and from subdirectories by using a command like: |
| 143 |
| 144 gsutil cp -R dir gs://my_bucket/data |
| 145 |
| 146 This will cause dir and all of its files and nested subdirectories to be |
| 147 copied under the specified destination, resulting in objects with names like |
| 148 gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket |
| 149 subdirectories by using a command like: |
| 150 |
| 151 gsutil cp -R gs://my_bucket/data dir |
| 152 |
| 153 This will cause everything nested under gs://my_bucket/data to be downloaded |
| 154 into dir, resulting in files with names like dir/data/a/b/c. |
| 155 |
| 156 Copying subdirectories is useful if you want to add data to an existing |
| 157 bucket directory structure over time. It's also useful if you want |
| 158 to parallelize uploads and downloads across multiple machines (often |
| 159 reducing overall transfer time compared with simply running gsutil -m |
| 160 cp on one machine). For example, if your bucket contains this structure: |
| 161 |
| 162 gs://my_bucket/data/result_set_01/ |
| 163 gs://my_bucket/data/result_set_02/ |
| 164 ... |
| 165 gs://my_bucket/data/result_set_99/ |
| 166 |
| 167 you could perform concurrent downloads across 3 machines by running these |
| 168 commands on each machine, respectively: |
| 169 |
| 170 gsutil cp -R gs://my_bucket/data/result_set_[0-3]* dir |
| 171 gsutil cp -R gs://my_bucket/data/result_set_[4-6]* dir |
| 172 gsutil cp -R gs://my_bucket/data/result_set_[7-9]* dir |
| 173 |
| 174 Note that dir could be a local directory on each machine, or it could |
| 175 be a directory mounted off of a shared file server; whether the latter |
| 176 performs acceptably may depend on a number of things, so we recommend |
| 177 you experiment and find out what works best for you. |
| 178 |
| 179 |
| 180 <B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B> |
| 181 If both the source and destination URI are cloud URIs from the same |
| 182 provider, gsutil copies data "in the cloud" (i.e., without downloading |
| 183 to and uploading from the machine where you run gsutil). In addition to |
| 184 the performance and cost advantages of doing this, copying in the cloud |
| 185 preserves metadata (like Content-Type and Cache-Control). In contrast, |
| 186 when you download data from the cloud it ends up in a file, which has |
| 187 no associated metadata. Thus, unless you have some way to hold on to |
| 188 or re-create that metadata, downloading to a file will not retain the |
| 189 metadata. |
| 190 |
| 191 Note that by default, the gsutil cp command does not copy the object |
| 192 ACL to the new object, and instead will use the default bucket ACL (see |
| 193 "gsutil help setdefacl"). You can override this behavior with the -p |
| 194 option (see OPTIONS below). |
| 195 |
| 196 |
| 197 <B>RESUMABLE TRANSFERS</B> |
| 198 gsutil automatically uses the Google Cloud Storage resumable upload |
| 199 feature whenever you use the cp command to upload an object that is larger |
| 200 than 1 MB. You do not need to specify any special command line options |
| 201 to make this happen. If your upload is interrupted you can restart the |
| 202 upload by running the same cp command that you ran to start the upload. |
| 203 |
| 204 Similarly, gsutil automatically performs resumable downloads (using HTTP |
| 205 standard Range GET operations) whenever you use the cp command to download an |
| 206 object larger than 1 MB. |
| 207 |
| 208 Resumable uploads and downloads store some state information in a file |
| 209 in ~/.gsutil named by the destination object or file. If you attempt to |
| 210 resume a transfer from a machine with a different directory, the transfer |
| 211 will start over from scratch. |
| 212 |
| 213 See also "gsutil help prod" for details on using resumable transfers |
| 214 in production. |
| 215 |
| 216 |
| 217 <B>STREAMING TRANSFERS</B> |
| 218 Use '-' in place of src_uri or dst_uri to perform a streaming |
| 219 transfer. For example: |
| 220 long_running_computation | gsutil cp - gs://my_bucket/obj |
| 221 |
| 222 Streaming transfers do not support resumable uploads/downloads. |
| 223 (The Google resumable transfer protocol has a way to support streaming |
| 224 transfers, but gsutil doesn't currently implement support for this.) |
| 225 |
| 226 |
| 227 <B>CHANGING TEMP DIRECTORIES</B> |
| 228 gsutil writes data to a temporary directory in several cases: |
| 229 - when compressing data to be uploaded (see the -z option) |
| 230 - when decompressing data being downloaded (when the data has |
| 231 Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z) |
| 232 - when copying between cloud service providers, where the destination |
| 233 provider does not support streaming uploads. In this case each object |
| 234 is downloaded from the source provider to a temp file, and then uploaded |
| 235 from that temp file to the destination provider. |
| 236 |
| 237 In these cases it's possible the temp file location on your system that |
| 238 gsutil selects by default may not have enough space. If you find that |
| 239 gsutil runs out of space during one of these operations (e.g., raising |
| 240 "CommandException: Inadequate temp space available to compress <your file>" |
| 241 during a gsutil cp -z operation), you can change where it writes these |
| 242 temp files by setting the TMPDIR environment variable. On Linux and MacOS |
| 243 you can do this using: |
| 244 |
| 245 export TMPDIR=/some/directory |
| 246 |
| 247 On Windows 7 you can change the TMPDIR environment variable from Start -> |
| 248 Computer -> System -> Advanced System Settings -> Environment Variables. |
| 249 You need to reboot after making this change for it to take effect. (Rebooting |
| 250 is not necessary after running the export command on Linux and MacOS.) |
| 251 |
| 252 |
| 253 <B>OPTIONS</B> |
| 254 -a Sets the named canned_acl on uploaded objects. See |
| 255 'gsutil help acls' for further details. |
| 256 |
| 257 -c If an error occurs, continue to attempt to copy the remaining |
| 258 files. |
| 259 |
| 260 -e Exclude symlinks. When specified, symbolic links will not be |
| 261 copied. |
| 262 |
| 263 -n No-clobber. When specified, existing files or objects at the |
| 264 destination will not be overwritten. Any items that are skipped |
| 265 by this option will be reported as being skipped. |
| 266 |
| 267 Please note that using this feature will make gsutil perform |
| 268 additional HTTP requests for every item being copied. This may |
| 269 increase latency and cost. |
| 270 |
| 271 -p Causes ACL to be preserved when copying in the cloud. Note that |
| 272 this option has performance and cost implications, because it |
| 273 is essentially performing three requests (getacl, cp, setacl). |
| 274 (The performance issue can be mitigated to some degree by |
| 275 using gsutil -m cp to cause parallel copying.) |
| 276 |
| 277 You can avoid the additional performance and cost of using cp -p |
| 278 if you want all objects in the destination bucket to end up with |
| 279 the same ACL, by setting a default ACL on that bucket instead of |
| 280 using cp -p. See "gsutil help setdefacl". |
| 281 |
| 282 Note that it's not valid to specify both the -a and -p options |
| 283 together. |
| 284 |
| 285 -q Causes copies to be performed quietly, i.e., without reporting |
| 286 progress indicators of files being copied. Errors are still |
| 287 reported. This option can be useful for running gsutil from a |
| 288 cron job that logs its output to a file, for which the only |
| 289 information desired in the log is failures. |
| 290 |
| 291 -R, -r Causes directories, buckets, and bucket subdirectories to be |
| 292 copied recursively. If you neglect to use this option for |
| 293 an upload, gsutil will copy any files it finds and skip any |
| 294 directories. Similarly, neglecting to specify -R for a download |
| 295 will cause gsutil to copy any objects at the current bucket |
| 296 directory level, and skip any subdirectories. |
| 297 |
| 298 -t DEPRECATED. At one time this option was used to request setting |
| 299 Content-Type based on file extension and/or content, which is |
| 300 now the default behavior. The -t option is left in place for |
| 301 now to avoid breaking existing scripts. It will be removed at |
| 302 a future date. |
| 303 |
| 304 -v Parses uris for version / generation numbers (only applicable in |
| 305 version-enabled buckets). For example: |
| 306 |
| 307 gsutil cp -v gs://bucket/object#1348772910166013 ~/Desktop |
| 308 |
| 309 Note that wildcards are not permitted while using this flag. |
| 310 |
| 311 -z 'txt,html' Compresses file uploads with the given extensions. |
| 312 If you are uploading a large file with compressible content, |
| 313 such as a .js, .css, or .html file, you can gzip-compress the |
| 314 file during the upload process by specifying the -z <extensions> |
| 315 option. Compressing data before upload saves on usage charges |
| 316 because you are uploading a smaller amount of data. |
| 317 |
| 318 When you specify the -z option, the data from your files is |
| 319 compressed before it is uploaded, but your actual files are left |
| 320 uncompressed on the local disk. The uploaded objects retain the |
| 321 content type and name of the original files but are given |
| 322 a Content-Encoding header with the value "gzip" to indicate that |
| 323 the object data stored are compressed on the Google Cloud Storage |
| 324 servers. |
| 325 |
| 326 For example, the following command: |
| 327 |
| 328 gsutil cp -z html -a public-read cattypes.html gs://mycats |
| 329 |
| 330 will do all of the following: |
| 331 - Upload as the object gs://mycats/cattypes.html (cp command) |
| 332 - Set the Content-Type to text/html (based on file extension) |
| 333 - Compress the data in the file cattypes.html (-z option) |
| 334 - Set the Content-Encoding to gzip (-z option) |
| 335 - Set the ACL to public-read (-a option) |
| 336 - If a user tries to view cattypes.html in a browser, the |
| 337 browser will know to uncompress the data based on the |
| 338 Content-Encoding header, and to render it as HTML based on |
| 339 the Content-Type header. |
| 340 """) |
| 341 |
| 342 class KeyFile(): |
| 343 """ |
| 344 Wrapper class to expose Key class as file to boto. |
| 345 """ |
| 346 def __init__(self, key): |
| 347 self.key = key |
| 348 |
| 349 def tell(self): |
| 350 raise IOError |
| 351 |
| 352 def seek(self, pos): |
| 353 raise IOError |
| 354 |
| 355 def read(self, size): |
| 356 return self.key.read(size) |
| 357 |
| 358 def write(self, buf): |
| 359 raise IOError |
| 360 |
| 361 def close(self): |
| 362 self.key.close() |
| 363 |
| 364 class CpCommand(Command): |
| 365 """ |
| 366 Implementation of gsutil cp command. |
| 367 |
| 368 Note that CpCommand is run for both gsutil cp and gsutil mv. The latter |
| 369 happens by MvCommand calling CpCommand and passing the hidden (undocumented) |
| 370 -M option. This allows the copy and remove needed for each mv to run |
| 371 together (rather than first running all the cp's and then all the rm's, as |
| 372 we originally had implemented), which in turn avoids the following problem |
| 373 with removing the wrong objects: starting with a bucket containing only |
| 374 the object gs://bucket/obj, say the user does: |
| 375 gsutil mv gs://bucket/* gs://bucket/d.txt |
| 376 If we ran all the cp's and then all the rm's and we didn't expand the wildcard |
| 377 first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt, |
| 378 and the rm command would then remove that object. In the implementation |
| 379 prior to gsutil release 3.12 we avoided this by building a list of objects |
| 380 to process and then running the copies and then the removes; but building |
| 381 the list up front limits scalability (compared with the current approach |
| 382 of processing the bucket listing iterator on the fly). |
| 383 """ |
| 384 |
| 385 # Set default Content-Type. |
| 386 DEFAULT_CONTENT_TYPE = 'application/octet-stream' |
| 387 USE_MAGICFILE = boto.config.getbool('GSUtil', 'use_magicfile', False) |
| 388 |
| 389 # Command specification (processed by parent class). |
| 390 command_spec = { |
| 391 # Name of command. |
| 392 COMMAND_NAME : 'cp', |
| 393 # List of command name aliases. |
| 394 COMMAND_NAME_ALIASES : ['copy'], |
| 395 # Min number of args required by this command. |
| 396 MIN_ARGS : 1, |
| 397 # Max number of args required by this command, or NO_MAX. |
| 398 MAX_ARGS : NO_MAX, |
| 399 # Getopt-style string specifying acceptable sub args. |
| 400 # -t is deprecated but leave intact for now to avoid breakage. |
| 401 SUPPORTED_SUB_ARGS : 'a:ceIMnpqrRtvz:', |
| 402 # True if file URIs acceptable for this command. |
| 403 FILE_URIS_OK : True, |
| 404 # True if provider-only URIs acceptable for this command. |
| 405 PROVIDER_URIS_OK : False, |
| 406 # Index in args of first URI arg. |
| 407 URIS_START_ARG : 0, |
| 408 # True if must configure gsutil before running command. |
| 409 CONFIG_REQUIRED : True, |
| 410 } |
| 411 help_spec = { |
| 412 # Name of command or auxiliary help info for which this help applies. |
| 413 HELP_NAME : 'cp', |
| 414 # List of help name aliases. |
| 415 HELP_NAME_ALIASES : ['copy'], |
| 416 # Type of help: |
| 417 HELP_TYPE : HelpType.COMMAND_HELP, |
| 418 # One line summary of this help. |
| 419 HELP_ONE_LINE_SUMMARY : 'Copy files and objects', |
| 420 # The full help text. |
| 421 HELP_TEXT : _detailed_help_text, |
| 422 } |
| 423 |
| 424 def _CheckFinalMd5(self, key, file_name): |
| 425 """ |
| 426 Checks that etag from server agrees with md5 computed after the |
| 427 download completes. |
| 428 """ |
| 429 obj_md5 = key.etag.strip('"\'') |
| 430 file_md5 = None |
| 431 |
| 432 if hasattr(key, 'md5') and key.md5: |
| 433 file_md5 = key.md5 |
| 434 else: |
| 435 print 'Computing MD5 from scratch for resumed download' |
| 436 |
| 437 # Open file in binary mode to avoid surprises in Windows. |
| 438 fp = open(file_name, 'rb') |
| 439 try: |
| 440 file_md5 = key.compute_md5(fp)[0] |
| 441 finally: |
| 442 fp.close() |
| 443 |
| 444 if self.debug: |
| 445 print 'Checking file md5 against etag. (%s/%s)' % (file_md5, obj_md5) |
| 446 if file_md5 != obj_md5: |
| 447 # Checksums don't match - remove file and raise exception. |
| 448 os.unlink(file_name) |
| 449 raise CommandException( |
| 450 'File changed during download: md5 signature doesn\'t match ' |
| 451 'etag (incorrect downloaded file deleted)') |
| 452 |
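For illustration, a minimal standalone sketch of the check above: strip surrounding quote characters from the object's etag and compare it against an MD5 computed over the downloaded file, assuming (as the code does) that the etag is the object's MD5 hex digest.

    import hashlib

    def md5_matches_etag(file_name, etag):
      obj_md5 = etag.strip('"\'')            # same quote stripping as above
      md5 = hashlib.md5()
      fp = open(file_name, 'rb')             # binary mode, as above
      try:
        for chunk in iter(lambda: fp.read(8192), b''):
          md5.update(chunk)
      finally:
        fp.close()
      return md5.hexdigest() == obj_md5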
| 453 def _CheckForDirFileConflict(self, exp_src_uri, dst_uri): |
| 454 """Checks whether copying exp_src_uri into dst_uri is not possible. |
| 455 |
| 456 This happens if a directory exists in the local file system where a file |
| 457 needs to go or vice versa. In that case we print an error message and |
| 458 exit. Example: if the file "./x" exists and you try to do: |
| 459 gsutil cp gs://mybucket/x/y . |
| 460 the request can't succeed because it requires a directory where |
| 461 the file x exists. |
| 462 |
| 463 Note that we don't enforce any corresponding restrictions for buckets, |
| 464 because the flat namespace semantics for buckets doesn't prohibit such |
| 465 cases the way hierarchical file systems do. For example, if a bucket |
| 466 contains an object called gs://bucket/dir and then you run the command: |
| 467 gsutil cp file1 file2 gs://bucket/dir |
| 468 you'll end up with objects gs://bucket/dir, gs://bucket/dir/file1, and |
| 469 gs://bucket/dir/file2. |
| 470 |
| 471 Args: |
| 472 exp_src_uri: Expanded source StorageUri of copy. |
| 473 dst_uri: Destination URI. |
| 474 |
| 475 Raises: |
| 476 CommandException: if errors encountered. |
| 477 """ |
| 478 if dst_uri.is_cloud_uri(): |
| 479 # The problem can only happen for file destination URIs. |
| 480 return |
| 481 dst_path = dst_uri.object_name |
| 482 final_dir = os.path.dirname(dst_path) |
| 483 if os.path.isfile(final_dir): |
| 484 raise CommandException('Cannot retrieve %s because a file exists ' |
| 485 'where a directory needs to be created (%s).' % |
| 486 (exp_src_uri, final_dir)) |
| 487 if os.path.isdir(dst_path): |
| 488 raise CommandException('Cannot retrieve %s because a directory exists ' |
| 489 '(%s) where the file needs to be created.' % |
| 490 (exp_src_uri, dst_path)) |
| 491 |
| 492 def _InsistDstUriNamesContainer(self, exp_dst_uri, |
| 493 have_existing_dst_container, command_name): |
| 494 """ |
| 495 Raises an exception if URI doesn't name a directory, bucket, or bucket |
| 496 subdir, with special exception for cp -R (see comments below). |
| 497 |
| 498 Args: |
| 499 exp_dst_uri: Wildcard-expanded dst_uri. |
| 500 have_existing_dst_container: bool indicator of whether exp_dst_uri |
| 501 names a container (directory, bucket, or existing bucket subdir). |
| 502 command_name: Name of command making call. May not be the same as |
| 503 self.command_name in the case of commands implemented atop other |
| 504 commands (like mv command). |
| 505 |
| 506 Raises: |
| 507 CommandException: if the URI being checked does not name a container. |
| 508 """ |
| 509 if exp_dst_uri.is_file_uri(): |
| 510 ok = exp_dst_uri.names_directory() |
| 511 else: |
| 512 if have_existing_dst_container: |
| 513 ok = True |
| 514 else: |
| 515 # It's ok to specify a non-existing bucket subdir, for example: |
| 516 # gsutil cp -R dir gs://bucket/abc |
| 517 # where gs://bucket/abc isn't an existing subdir. |
| 518 ok = exp_dst_uri.names_object() |
| 519 if not ok: |
| 520 raise CommandException('Destination URI must name a directory, bucket, ' |
| 521 'or bucket\nsubdirectory for the multiple ' |
| 522 'source form of the %s command.' % command_name) |
| 523 |
| 524 class _FileCopyCallbackHandler(object): |
| 525 """Outputs progress info for large copy requests.""" |
| 526 |
| 527 def __init__(self, upload): |
| 528 if upload: |
| 529 self.announce_text = 'Uploading' |
| 530 else: |
| 531 self.announce_text = 'Downloading' |
| 532 |
| 533 def call(self, total_bytes_transferred, total_size): |
| 534 sys.stderr.write('%s: %s/%s \r' % ( |
| 535 self.announce_text, |
| 536 MakeHumanReadable(total_bytes_transferred), |
| 537 MakeHumanReadable(total_size))) |
| 538 if total_bytes_transferred == total_size: |
| 539 sys.stderr.write('\n') |
| 540 |
| 541 class _StreamCopyCallbackHandler(object): |
| 542 """Outputs progress info for Stream copy to cloud. |
| 543 Total Size of the stream is not known, so we output |
| 544 only the bytes transferred. |
| 545 """ |
| 546 |
| 547 def call(self, total_bytes_transferred, total_size): |
| 548 sys.stderr.write('Uploading: %s \r' % ( |
| 549 MakeHumanReadable(total_bytes_transferred))) |
| 550 if total_size and total_bytes_transferred == total_size: |
| 551 sys.stderr.write('\n') |
| 552 |
| 553 def _GetTransferHandlers(self, dst_uri, size, upload): |
| 554 """ |
| 555 Selects upload/download and callback handlers. |
| 556 |
| 557 We use a callback handler that shows a simple textual progress indicator |
| 558 if size is above the configurable threshold. |
| 559 |
| 560 We use a resumable transfer handler if size is >= the configurable |
| 561 threshold and resumable transfers are supported by the given provider. |
| 562 boto supports resumable downloads for all providers, but resumable |
| 563 uploads are currently only supported by GS. |
| 564 |
| 565 Args: |
| 566 dst_uri: the destination URI. |
| 567 size: size of file (object) being uploaded (downloaded). |
| 568 upload: bool indication of whether transfer is an upload. |
| 569 """ |
| 570 config = boto.config |
| 571 resumable_threshold = config.getint('GSUtil', 'resumable_threshold', ONE_MB) |
| 572 transfer_handler = None |
| 573 cb = None |
| 574 num_cb = None |
| 575 |
| 576 if size >= resumable_threshold: |
| 577 if not self.quiet: |
| 578 cb = self._FileCopyCallbackHandler(upload).call |
| 579 num_cb = int(size / ONE_MB) |
| 580 |
| 581 resumable_tracker_dir = config.get( |
| 582 'GSUtil', 'resumable_tracker_dir', |
| 583 os.path.expanduser('~' + os.sep + '.gsutil')) |
| 584 if not os.path.exists(resumable_tracker_dir): |
| 585 os.makedirs(resumable_tracker_dir) |
| 586 |
| 587 if upload: |
| 588 # Encode the dest bucket and object name into the tracker file name. |
| 589 res_tracker_file_name = ( |
| 590 re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s.url' % |
| 591 (dst_uri.bucket_name, dst_uri.object_name))) |
| 592 else: |
| 593 # Encode the fully-qualified dest file name into the tracker file name. |
| 594 res_tracker_file_name = ( |
| 595 re.sub('[/\\\\]', '_', 'resumable_download__%s.etag' % |
| 596 (os.path.realpath(dst_uri.object_name)))) |
| 597 |
| 598 res_tracker_file_name = _hash_filename(res_tracker_file_name) |
| 599 tracker_file = '%s%s%s' % (resumable_tracker_dir, os.sep, |
| 600 res_tracker_file_name) |
| 601 if upload: |
| 602 if dst_uri.scheme == 'gs': |
| 603 transfer_handler = ResumableUploadHandler(tracker_file) |
| 604 else: |
| 605 transfer_handler = ResumableDownloadHandler(tracker_file) |
| 606 |
| 607 return (cb, num_cb, transfer_handler) |
| 608 |
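For illustration, a standalone sketch of how the upload tracker file name above is derived: the bucket and object names are folded into one file name with path separators flattened to underscores, and the result lives under the resumable tracker directory. (The real code additionally passes the name through _hash_filename, which is defined elsewhere in this file.)

    import os
    import re

    def sketch_upload_tracker_path(bucket_name, object_name):
      name = 'resumable_upload__%s__%s.url' % (bucket_name, object_name)
      name = re.sub('[/\\\\]', '_', name)    # flatten path separators
      tracker_dir = os.path.expanduser('~' + os.sep + '.gsutil')
      return os.path.join(tracker_dir, name)

    # e.g. ~/.gsutil/resumable_upload__my_bucket__a_b_c.url
    print(sketch_upload_tracker_path('my_bucket', 'a/b/c'))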
| 609 def _LogCopyOperation(self, src_uri, dst_uri, headers): |
| 610 """ |
| 611 Logs copy operation being performed, including Content-Type if appropriate. |
| 612 """ |
| 613 if self.quiet: |
| 614 return |
| 615 if 'Content-Type' in headers and dst_uri.is_cloud_uri(): |
| 616 content_type_msg = ' [Content-Type=%s]' % headers['Content-Type'] |
| 617 else: |
| 618 content_type_msg = '' |
| 619 if src_uri.is_stream(): |
| 620 self.THREADED_LOGGER.info('Copying from <STDIN>%s...', content_type_msg) |
| 621 else: |
| 622 self.THREADED_LOGGER.info('Copying %s%s...', src_uri, content_type_msg) |
| 623 |
| 624 # We pass the headers explicitly to this call instead of using self.headers |
| 625 # so we can set different metadata (like Content-Type) for each object. |
| 626 def _CopyObjToObjSameProvider(self, src_key, src_uri, dst_uri, headers): |
| 627 self._SetContentTypeHeader(src_uri, headers) |
| 628 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 629 # Do Object -> object copy within same provider (uses |
| 630 # x-<provider>-copy-source metadata HTTP header to request copying at the |
| 631 # server). |
| 632 src_bucket = src_uri.get_bucket(False, headers) |
| 633 dst_bucket = dst_uri.get_bucket(False, headers) |
| 634 preserve_acl = False |
| 635 canned_acl = None |
| 636 if self.sub_opts: |
| 637 for o, a in self.sub_opts: |
| 638 if o == '-a': |
| 639 canned_acls = dst_uri.canned_acls() |
| 640 if a not in canned_acls: |
| 641 raise CommandException('Invalid canned ACL "%s".' % a) |
| 642 canned_acl = a |
| 643 headers[dst_uri.get_provider().acl_header] = canned_acl |
| 644 if o == '-p': |
| 645 preserve_acl = True |
| 646 if preserve_acl and canned_acl: |
| 647 raise CommandException( |
| 648 'Specifying both the -p and -a options together is invalid.') |
| 649 start_time = time.time() |
| 650 # Pass headers in headers param not metadata param, so boto will copy |
| 651 # existing key's metadata and just set the additional headers specified |
| 652 # in the headers param (rather than using the headers to override existing |
| 653 # metadata). In particular this allows us to copy the existing key's |
| 654 # Content-Type and other metadata users need while still being able to |
| 655 # set headers the API needs (like x-goog-project-id). Note that this means |
| 656 # you can't do something like: |
| 657 # gsutil cp -t Content-Type text/html gs://bucket/* gs://bucket2 |
| 658 # to change the Content-Type while copying. |
| 659 |
| 660 dst_uri.copy_key(src_bucket.name, src_uri.object_name, |
| 661 preserve_acl=preserve_acl, headers=headers, |
| 662 src_version_id=src_uri.version_id, |
| 663 src_generation=src_uri.generation) |
| 664 end_time = time.time() |
| 665 return (end_time - start_time, src_key.size) |
| 666 |
| 667 def _CheckFreeSpace(self, path): |
| 668 """Return path/drive free space (in bytes).""" |
| 669 if platform.system() == 'Windows': |
| 670 from ctypes import c_char_p, c_int, c_uint64, c_wchar_p, windll, POINTER, WINFUNCTYPE, WinError |
| 671 try: |
| 672 GetDiskFreeSpaceEx = WINFUNCTYPE(c_int, c_wchar_p, POINTER(c_uint64), |
| 673 POINTER(c_uint64), POINTER(c_uint64)) |
| 674 GetDiskFreeSpaceEx = GetDiskFreeSpaceEx( |
| 675 ('GetDiskFreeSpaceExW', windll.kernel32), ( |
| 676 (1, 'lpszPathName'), |
| 677 (2, 'lpFreeUserSpace'), |
| 678 (2, 'lpTotalSpace'), |
| 679 (2, 'lpFreeSpace'),)) |
| 680 except AttributeError: |
| 681 GetDiskFreeSpaceEx = WINFUNCTYPE(c_int, c_char_p, POINTER(c_uint64), |
| 682 POINTER(c_uint64), POINTER(c_uint64)) |
| 683 GetDiskFreeSpaceEx = GetDiskFreeSpaceEx( |
| 684 ('GetDiskFreeSpaceExA', windll.kernel32), ( |
| 685 (1, 'lpszPathName'), |
| 686 (2, 'lpFreeUserSpace'), |
| 687 (2, 'lpTotalSpace'), |
| 688 (2, 'lpFreeSpace'),)) |
| 689 |
| 690 def GetDiskFreeSpaceEx_errcheck(result, func, args): |
| 691 if not result: |
| 692 raise WinError() |
| 693 return args[1].value |
| 694 GetDiskFreeSpaceEx.errcheck = GetDiskFreeSpaceEx_errcheck |
| 695 |
| 696 return GetDiskFreeSpaceEx(os.getenv('SystemDrive')) |
| 697 else: |
| 698 (_, f_frsize, _, _, f_bavail, _, _, _, _, _) = os.statvfs(path) |
| 699 return f_frsize * f_bavail |
| 700 |
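For illustration, the POSIX branch above reduces to multiplying the filesystem fragment size by the number of blocks available to unprivileged users; a minimal sketch, assuming a non-Windows platform:

    import os

    def free_bytes(path):
      st = os.statvfs(path)                  # same fields _CheckFreeSpace unpacks
      return st.f_frsize * st.f_bavail

    print('%d bytes free under /tmp' % free_bytes('/tmp'))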
| 701 def _PerformResumableUploadIfApplies(self, fp, dst_uri, canned_acl, headers): |
| 702 """ |
| 703 Performs resumable upload if supported by provider and file is above |
| 704 threshold, else performs non-resumable upload. |
| 705 |
| 706 Returns (elapsed_time, bytes_transferred). |
| 707 """ |
| 708 start_time = time.time() |
| 709 file_size = os.path.getsize(fp.name) |
| 710 dst_key = dst_uri.new_key(False, headers) |
| 711 (cb, num_cb, res_upload_handler) = self._GetTransferHandlers( |
| 712 dst_uri, file_size, True) |
| 713 if dst_uri.scheme == 'gs': |
| 714 # Resumable upload protocol is Google Cloud Storage-specific. |
| 715 dst_key.set_contents_from_file(fp, headers, policy=canned_acl, |
| 716 cb=cb, num_cb=num_cb, |
| 717 res_upload_handler=res_upload_handler) |
| 718 else: |
| 719 dst_key.set_contents_from_file(fp, headers, policy=canned_acl, |
| 720 cb=cb, num_cb=num_cb) |
| 721 if res_upload_handler: |
| 722 # ResumableUploadHandler does not update upload_start_point from its |
| 723 # initial value of -1 if transferring the whole file, so clamp at 0 |
| 724 bytes_transferred = file_size - max( |
| 725 res_upload_handler.upload_start_point, 0) |
| 726 else: |
| 727 bytes_transferred = file_size |
| 728 end_time = time.time() |
| 729 return (end_time - start_time, bytes_transferred) |
| 730 |
| 731 def _PerformStreamingUpload(self, fp, dst_uri, headers, canned_acl=None): |
| 732 """ |
| 733 Performs a streaming upload to the cloud. |
| 734 |
| 735 Args: |
| 736 fp: The file whose contents to upload. |
| 737 dst_uri: Destination StorageUri. |
| 738 headers: A copy of the headers dictionary. |
| 739 canned_acl: Optional canned ACL to set on the object. |
| 740 |
| 741 Returns (elapsed_time, bytes_transferred). |
| 742 """ |
| 743 start_time = time.time() |
| 744 dst_key = dst_uri.new_key(False, headers) |
| 745 |
| 746 if self.quiet: |
| 747 cb = None |
| 748 else: |
| 749 cb = self._StreamCopyCallbackHandler().call |
| 750 dst_key.set_contents_from_stream(fp, headers, policy=canned_acl, cb=cb) |
| 751 try: |
| 752 bytes_transferred = fp.tell() |
| 753 except: |
| 754 bytes_transferred = 0 |
| 755 |
| 756 end_time = time.time() |
| 757 return (end_time - start_time, bytes_transferred) |
| 758 |
| 759 def _SetContentTypeHeader(self, src_uri, headers): |
| 760 """ |
| 761 Sets content type header to value specified in '-h Content-Type' option (if |
| 762 specified); else sets using Content-Type detection. |
| 763 """ |
| 764 if 'Content-Type' in headers: |
| 765 # If empty string specified (i.e., -h "Content-Type:") set header to None, |
| 766 # which will inhibit boto from sending the CT header. Otherwise, boto will |
| 767 # pass through the user specified CT header. |
| 768 if not headers['Content-Type']: |
| 769 headers['Content-Type'] = None |
| 770 # else we'll keep the value passed in via -h option (not performing |
| 771 # content type detection). |
| 772 else: |
| 773 # Only do content type recognition if src_uri is a file. Object-to-object |
| 774 # copies with no -h Content-Type specified re-use the content type of the |
| 775 # source object. |
| 776 if src_uri.is_file_uri(): |
| 777 object_name = src_uri.object_name |
| 778 content_type = None |
| 779 # Streams (denoted by '-') are expected to be 'application/octet-stream' |
| 780 # and 'file' would partially consume them. |
| 781 if object_name != '-': |
| 782 if self.USE_MAGICFILE: |
| 783 p = subprocess.Popen(['file', '--mime-type', object_name], |
| 784 stdout=subprocess.PIPE, stderr=subprocess.PIPE) |
| 785 output, error = p.communicate() |
| 786 if p.returncode != 0 or error: |
| 787 raise CommandException( |
| 788 'Encountered error running "file --mime-type %s" ' |
| 789 '(returncode=%d).\n%s' % (object_name, p.returncode, error)) |
| 790 # Parse output by removing the line delimiter and splitting on the last ': '. |
| 791 content_type = output.rstrip().rpartition(': ')[2] |
| 792 else: |
| 793 content_type = mimetypes.guess_type(object_name)[0] |
| 794 if not content_type: |
| 795 content_type = self.DEFAULT_CONTENT_TYPE |
| 796 headers['Content-Type'] = content_type |
| 797 |
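For illustration, a sketch of the default (non-magicfile) content type detection above: guess from the file name and fall back to application/octet-stream when nothing matches.

    import mimetypes

    DEFAULT_CONTENT_TYPE = 'application/octet-stream'

    def guess_content_type(object_name):
      # mimetypes.guess_type returns (type, encoding); only the type is used.
      content_type = mimetypes.guess_type(object_name)[0]
      return content_type or DEFAULT_CONTENT_TYPE

    print(guess_content_type('cattypes.html'))   # text/html
    print(guess_content_type('README'))          # application/octet-stream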
| 798 def _UploadFileToObject(self, src_key, src_uri, dst_uri, headers, |
| 799 should_log=True): |
| 800 """Helper method for uploading a local file to an object. |
| 801 |
| 802 Args: |
| 803 src_key: Source StorageUri. Must be a file URI. |
| 804 src_uri: Source StorageUri. |
| 805 dst_uri: Destination StorageUri. |
| 806 headers: The headers dictionary. |
| 807 should_log: bool indicator whether we should log this operation. |
| 808 Returns: |
| 809 (elapsed_time, bytes_transferred) excluding overhead like initial HEAD. |
| 810 |
| 811 Raises: |
| 812 CommandException: if errors encountered. |
| 813 """ |
| 814 gzip_exts = [] |
| 815 canned_acl = None |
| 816 if self.sub_opts: |
| 817 for o, a in self.sub_opts: |
| 818 if o == '-a': |
| 819 canned_acls = dst_uri.canned_acls() |
| 820 if a not in canned_acls: |
| 821 raise CommandException('Invalid canned ACL "%s".' % a) |
| 822 canned_acl = a |
| 823 elif o == '-t': |
| 824 print('Warning: -t is deprecated, and will be removed in the future. ' |
| 825 'Content type\ndetection is ' |
| 826 'now performed by default, unless inhibited by specifying ' |
| 827 'a\nContent-Type header via the -h option.') |
| 828 elif o == '-z': |
| 829 gzip_exts = a.split(',') |
| 830 |
| 831 self._SetContentTypeHeader(src_uri, headers) |
| 832 if should_log: |
| 833 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 834 |
| 835 if 'Content-Language' not in headers: |
| 836 content_language = config.get_value('GSUtil', 'content_language') |
| 837 if content_language: |
| 838 headers['Content-Language'] = content_language |
| 839 |
| 840 fname_parts = src_uri.object_name.split('.') |
| 841 if len(fname_parts) > 1 and fname_parts[-1] in gzip_exts: |
| 842 if self.debug: |
| 843 print 'Compressing %s (to tmp)...' % src_key |
| 844 (gzip_fh, gzip_path) = tempfile.mkstemp() |
| 845 gzip_fp = None |
| 846 try: |
| 847 # Check for temp space. Assume the compressed object is at most 2x |
| 848 # the size of the object (normally should compress to smaller than |
| 849 # the object) |
| 850 if (self._CheckFreeSpace(gzip_path) |
| 851 < 2*int(os.path.getsize(src_key.name))): |
| 852 raise CommandException('Inadequate temp space available to compress ' |
| 853 '%s' % src_key.name) |
| 854 gzip_fp = gzip.open(gzip_path, 'wb') |
| 855 gzip_fp.writelines(src_key.fp) |
| 856 finally: |
| 857 if gzip_fp: |
| 858 gzip_fp.close() |
| 859 os.close(gzip_fh) |
| 860 headers['Content-Encoding'] = 'gzip' |
| 861 gzip_fp = open(gzip_path, 'rb') |
| 862 try: |
| 863 (elapsed_time, bytes_transferred) = ( |
| 864 self._PerformResumableUploadIfApplies(gzip_fp, dst_uri, |
| 865 canned_acl, headers)) |
| 866 finally: |
| 867 gzip_fp.close() |
| 868 try: |
| 869 os.unlink(gzip_path) |
| 870 # Windows sometimes complains the temp file is locked when you try to |
| 871 # delete it. |
| 872 except Exception, e: |
| 873 pass |
| 874 elif (src_key.is_stream() |
| 875 and dst_uri.get_provider().supports_chunked_transfer()): |
| 876 (elapsed_time, bytes_transferred) = self._PerformStreamingUpload( |
| 877 src_key.fp, dst_uri, headers, canned_acl) |
| 878 else: |
| 879 if src_key.is_stream(): |
| 880 # For providers that don't support chunked transfers. |
| 881 tmp = tempfile.NamedTemporaryFile() |
| 882 file_uri = self.suri_builder.StorageUri('file://%s' % tmp.name) |
| 883 try: |
| 884 file_uri.new_key(False, headers).set_contents_from_file( |
| 885 src_key.fp, headers) |
| 886 src_key = file_uri.get_key() |
| 887 finally: |
| 888 file_uri.close() |
| 889 try: |
| 890 (elapsed_time, bytes_transferred) = ( |
| 891 self._PerformResumableUploadIfApplies(src_key.fp, dst_uri, |
| 892 canned_acl, headers)) |
| 893 finally: |
| 894 if src_key.is_stream(): |
| 895 tmp.close() |
| 896 else: |
| 897 src_key.close() |
| 898 |
| 899 return (elapsed_time, bytes_transferred) |
| 900 |
| 901 def _DownloadObjectToFile(self, src_key, src_uri, dst_uri, headers, |
| 902 should_log=True): |
| 903 if should_log: |
| 904 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 905 (cb, num_cb, res_download_handler) = self._GetTransferHandlers( |
| 906 dst_uri, src_key.size, False) |
| 907 file_name = dst_uri.object_name |
| 908 dir_name = os.path.dirname(file_name) |
| 909 if dir_name and not os.path.exists(dir_name): |
| 910 # Do dir creation in try block so can ignore case where dir already |
| 911 # exists. This is needed to avoid a race condition when running gsutil |
| 912 # -m cp. |
| 913 try: |
| 914 os.makedirs(dir_name) |
| 915 except OSError, e: |
| 916 if e.errno != errno.EEXIST: |
| 917 raise |
| 918 # For gzipped objects not named *.gz download to a temp file and unzip. |
| 919 if (hasattr(src_key, 'content_encoding') |
| 920 and src_key.content_encoding == 'gzip' |
| 921 and not file_name.endswith('.gz')): |
| 922 # We can't use tempfile.mkstemp() here because we need a predictable |
| 923 # filename for resumable downloads. |
| 924 download_file_name = '%s_.gztmp' % file_name |
| 925 need_to_unzip = True |
| 926 else: |
| 927 download_file_name = file_name |
| 928 need_to_unzip = False |
| 929 fp = None |
| 930 try: |
| 931 if res_download_handler: |
| 932 fp = open(download_file_name, 'ab') |
| 933 else: |
| 934 fp = open(download_file_name, 'wb') |
| 935 start_time = time.time() |
| 936 if not self.parse_versions: |
| 937 src_key.generation = None |
| 938 src_key.get_contents_to_file(fp, headers, cb=cb, num_cb=num_cb, |
| 939 res_download_handler=res_download_handler) |
| 940 # If a custom test method is defined, call it here. For the copy command, |
| 941 # test methods are expected to take one argument: an open file pointer, |
| 942 # and are used to perturb the open file during download to exercise |
| 943 # download error detection. |
| 944 if self.test_method: |
| 945 self.test_method(fp) |
| 946 end_time = time.time() |
| 947 finally: |
| 948 if fp: |
| 949 fp.close() |
| 950 |
| 951 # Discard the md5 if we are resuming a partial download. |
| 952 if res_download_handler and res_download_handler.download_start_point: |
| 953 src_key.md5 = None |
| 954 |
| 955 # Verify downloaded file checksum matched source object's checksum. |
| 956 self._CheckFinalMd5(src_key, download_file_name) |
| 957 |
| 958 if res_download_handler: |
| 959 bytes_transferred = ( |
| 960 src_key.size - res_download_handler.download_start_point) |
| 961 else: |
| 962 bytes_transferred = src_key.size |
| 963 if need_to_unzip: |
| 964 # Log that we're uncompressing if the file is big enough that |
| 965 # decompressing would make it look like the transfer "stalled" at the end. |
| 966 if not self.quiet and bytes_transferred > 10 * 1024 * 1024: |
| 967 self.THREADED_LOGGER.info('Uncompressing downloaded tmp file to %s...', |
| 968 file_name) |
| 969 # Downloaded gzipped file to a filename w/o .gz extension, so unzip. |
| 970 f_in = gzip.open(download_file_name, 'rb') |
| 971 f_out = open(file_name, 'wb') |
| 972 try: |
| 973 while True: |
| 974 data = f_in.read(8192) |
| 975 if not data: |
| 976 break |
| 977 f_out.write(data) |
| 978 finally: |
| 979 f_out.close() |
| 980 f_in.close() |
| 981 os.unlink(download_file_name) |
| 982 return (end_time - start_time, bytes_transferred) |
| 983 |
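For illustration, the unzip step above in isolation: stream the gzipped temp file into the final file name in fixed-size chunks, then remove the temp file.

    import gzip
    import os

    def gunzip_to(download_file_name, file_name, chunk_size=8192):
      f_in = gzip.open(download_file_name, 'rb')
      f_out = open(file_name, 'wb')
      try:
        while True:
          data = f_in.read(chunk_size)
          if not data:
            break
          f_out.write(data)
      finally:
        f_out.close()
        f_in.close()
      os.unlink(download_file_name)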
| 984 def _PerformDownloadToStream(self, src_key, src_uri, str_fp, headers): |
| 985 (cb, num_cb, res_download_handler) = self._GetTransferHandlers( |
| 986 src_uri, src_key.size, False) |
| 987 start_time = time.time() |
| 988 src_key.get_contents_to_file(str_fp, headers, cb=cb, num_cb=num_cb) |
| 989 end_time = time.time() |
| 990 bytes_transferred = src_key.size |
| 991 end_time = time.time() |
| 992 return (end_time - start_time, bytes_transferred) |
| 993 |
| 994 def _CopyFileToFile(self, src_key, src_uri, dst_uri, headers): |
| 995 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 996 dst_key = dst_uri.new_key(False, headers) |
| 997 start_time = time.time() |
| 998 dst_key.set_contents_from_file(src_key.fp, headers) |
| 999 end_time = time.time() |
| 1000 return (end_time - start_time, os.path.getsize(src_key.fp.name)) |
| 1001 |
| 1002 def _CopyObjToObjDiffProvider(self, src_key, src_uri, dst_uri, headers): |
| 1003 self._SetContentTypeHeader(src_uri, headers) |
| 1004 self._LogCopyOperation(src_uri, dst_uri, headers) |
| 1005 # If destination is GS we can avoid the local copying through a local file |
| 1006 # as GS supports chunked transfer. This also allows us to preserve metadata |
| 1007 # between original and destination object. |
| 1008 if dst_uri.scheme == 'gs': |
| 1009 canned_acl = None |
| 1010 if self.sub_opts: |
| 1011 for o, a in self.sub_opts: |
| 1012 if o == '-a': |
| 1013 canned_acls = dst_uri.canned_acls() |
| 1014 if a not in canned_acls: |
| 1015 raise CommandException('Invalid canned ACL "%s".' % a) |
| 1016 canned_acl = a |
| 1017 elif o == '-p': |
| 1018 # We don't attempt to preserve ACLs across providers because |
| 1019 # GCS and S3 support different ACLs. |
| 1020 raise NotImplementedError('Cross-provider cp -p not supported') |
| 1021 |
| 1022 # TODO: This _PerformStreamingUpload call passes in a Key for fp |
| 1023 # param, relying on Python "duck typing" (the fact that the lower-level |
| 1024 # methods that expect an fp only happen to call fp methods that are |
| 1025 # defined and semantically equivalent to those defined on src_key). This |
| 1026 # should be replaced by a class that wraps an fp interface around the |
| 1027 # Key, throwing 'not implemented' for methods (like seek) that aren't |
| 1028 # implemented by non-file Keys. |
| 1029 # NOTE: As of 7/28/2012 this bug now makes cross-provider copies into gs |
| 1030 # fail, because of boto changes that make that code now attempt to perform |
| 1031 # additional operations on the fp parameter, like seek() and tell(). |
| 1032 return self._PerformStreamingUpload(KeyFile(src_key), dst_uri, headers, canned_acl) |
| 1033 |
| 1034 # If destination is not GS we implement object copy through a local |
| 1035 # temp file. There are at least 3 downsides of this approach: |
| 1036 # 1. It doesn't preserve metadata from the src object when uploading to |
| 1037 # the dst object. |
| 1038 # 2. It requires enough temp space on the local disk to hold the file |
| 1039 # while transferring. |
| 1040 # 3. Killing the gsutil process partway through and then restarting will |
| 1041 # always repeat the download and upload, because the temp file name is |
| 1042 # different for each incarnation. (If however you just leave the |
| 1043 # process running and failures happen along the way, they will |
| 1044 # continue to restart and make progress as long as not too many |
| 1045 # failures happen in a row with no progress.) |
| 1046 tmp = tempfile.NamedTemporaryFile() |
| 1047 if self._CheckFreeSpace(tempfile.tempdir) < src_key.size: |
| 1048 raise CommandException('Inadequate temp space available to perform the ' |
| 1049 'requested copy') |
| 1050 start_time = time.time() |
| 1051 file_uri = self.suri_builder.StorageUri('file://%s' % tmp.name) |
| 1052 try: |
| 1053 self._DownloadObjectToFile(src_key, src_uri, file_uri, headers, False) |
| 1054 self._UploadFileToObject(file_uri.get_key(), file_uri, dst_uri, headers, |
| 1055 False) |
| 1056 finally: |
| 1057 tmp.close() |
| 1058 end_time = time.time() |
| 1059 return (end_time - start_time, src_key.size) |
| 1060 |
| 1061 def _PerformCopy(self, src_uri, dst_uri): |
| 1062 """Performs copy from src_uri to dst_uri, handling various special cases. |
| 1063 |
| 1064 Args: |
| 1065 src_uri: Source StorageUri. |
| 1066 dst_uri: Destination StorageUri. |
| 1067 |
| 1068 Returns: |
| 1069 (elapsed_time, bytes_transferred) excluding overhead like initial HEAD. |
| 1070 |
| 1071 Raises: |
| 1072 CommandException: if errors encountered. |
| 1073 """ |
| 1074 # Make a copy of the input headers each time so we can set a different |
| 1075 # content type for each object. |
| 1076 if self.headers: |
| 1077 headers = self.headers.copy() |
| 1078 else: |
| 1079 headers = {} |
| 1080 |
| 1081 src_key = src_uri.get_key(False, headers) |
| 1082 if not src_key: |
| 1083 raise CommandException('"%s" does not exist.' % src_uri) |
| 1084 |
| 1085 if self.no_clobber: |
| 1086 # There are two checks to prevent clobbering: |
| 1087 # 1) The first check is to see if the item |
| 1088 # already exists at the destination and prevent the upload/download |
| 1089 # from happening. This is done by the exists() call. |
| 1090 # 2) The second check is only relevant if we are writing to gs. We can |
| 1091 # enforce that the server only writes the object if it doesn't exist |
| 1092 # by specifying the header below. This check only happens at the |
| 1093 # server after the complete file has been uploaded. We specify this |
| 1094 # header to prevent a race condition where a destination file may |
| 1095 # be created after the first check and before the file is fully |
| 1096 # uploaded. |
| 1097 # In order to save on unnecessary uploads/downloads we perform both |
| 1098 # checks. However, this may come at the cost of additional HTTP calls. |
| 1099 if dst_uri.exists(headers): |
| 1100 if not self.quiet: |
| 1101 self.THREADED_LOGGER.info('Skipping existing item: %s' % |
| 1102 dst_uri.uri) |
| 1103 return (0, 0) |
| 1104 if dst_uri.is_cloud_uri() and dst_uri.scheme == 'gs': |
| 1105 headers['x-goog-if-generation-match'] = '0' |
| 1106 |
| 1107 if src_uri.is_cloud_uri() and dst_uri.is_cloud_uri(): |
| 1108 if src_uri.scheme == dst_uri.scheme: |
| 1109 return self._CopyObjToObjSameProvider(src_key, src_uri, dst_uri, |
| 1110 headers) |
| 1111 else: |
| 1112 return self._CopyObjToObjDiffProvider(src_key, src_uri, dst_uri, |
| 1113 headers) |
| 1114 elif src_uri.is_file_uri() and dst_uri.is_cloud_uri(): |
| 1115 return self._UploadFileToObject(src_key, src_uri, dst_uri, headers) |
| 1116 elif src_uri.is_cloud_uri() and dst_uri.is_file_uri(): |
| 1117 return self._DownloadObjectToFile(src_key, src_uri, dst_uri, headers) |
| 1118 elif src_uri.is_file_uri() and dst_uri.is_file_uri(): |
| 1119 return self._CopyFileToFile(src_key, src_uri, dst_uri, headers) |
| 1120 else: |
| 1121 raise CommandException('Unexpected src/dest case') |
| 1122 |
| 1123 def _ExpandDstUri(self, dst_uri_str): |
| 1124 """ |
| 1125 Expands wildcard if present in dst_uri_str. |
| 1126 |
| 1127 Args: |
| 1128 dst_uri_str: String representation of requested dst_uri. |
| 1129 |
| 1130 Returns: |
| 1131 (exp_dst_uri, have_existing_dst_container) |
| 1132 where have_existing_dst_container is a bool indicating whether |
| 1133 exp_dst_uri names an existing directory, bucket, or bucket subdirectory. |
| 1134 |
| 1135 Raises: |
| 1136 CommandException: if dst_uri_str matched more than 1 URI. |
| 1137 """ |
| 1138 dst_uri = self.suri_builder.StorageUri(dst_uri_str) |
| 1139 |
| 1140 # Handle wildcarded dst_uri case. |
| 1141 if ContainsWildcard(dst_uri): |
| 1142 blr_expansion = list(self.WildcardIterator(dst_uri)) |
| 1143 if len(blr_expansion) != 1: |
| 1144 raise CommandException('Destination (%s) must match exactly 1 URI' % |
| 1145 dst_uri_str) |
| 1146 blr = blr_expansion[0] |
| 1147 uri = blr.GetUri() |
| 1148 if uri.is_cloud_uri(): |
| 1149 return (uri, uri.names_bucket() or blr.HasPrefix() |
| 1150 or blr.GetKey().endswith('/')) |
| 1151 else: |
| 1152 return (uri, uri.names_directory()) |
| 1153 |
| 1154 # Handle non-wildcarded dst_uri: |
| 1155 if dst_uri.is_file_uri(): |
| 1156 return (dst_uri, dst_uri.names_directory()) |
| 1157 if dst_uri.names_bucket(): |
| 1158 return (dst_uri, True) |
| 1159 # For object URIs check 3 cases: (a) if the name ends with '/' treat as a |
| 1160 # subdir; else, perform a wildcard expansion with dst_uri + "*" and then |
| 1161 # find if (b) there's a Prefix matching dst_uri, or (c) name is of form |
| 1162 # dir_$folder$ (and in both these cases also treat dir as a subdir). |
| 1163 if dst_uri.is_cloud_uri() and dst_uri_str.endswith('/'): |
| 1164 return (dst_uri, True) |
| 1165 blr_expansion = list(self.WildcardIterator( |
| 1166 '%s*' % dst_uri_str.rstrip(dst_uri.delim))) |
| 1167 for blr in blr_expansion: |
| 1168 if blr.GetRStrippedUriString().endswith('_$folder$'): |
| 1169 return (dst_uri, True) |
| 1170 if blr.GetRStrippedUriString() == dst_uri_str.rstrip(dst_uri.delim): |
| 1171 return (dst_uri, blr.HasPrefix()) |
| 1172 return (dst_uri, False) |
| 1173 |
| 1174 def _ConstructDstUri(self, src_uri, exp_src_uri, |
| 1175 src_uri_names_container, src_uri_expands_to_multi, |
| 1176 have_multiple_srcs, exp_dst_uri, |
| 1177 have_existing_dest_subdir): |
| 1178 """ |
| 1179 Constructs the destination URI for a given exp_src_uri/exp_dst_uri pair, |
| 1180 using context-dependent naming rules that mimic Unix cp and mv behavior. |
| 1181 |
| 1182 Args: |
| 1183 src_uri: src_uri to be copied. |
| 1184 exp_src_uri: Single StorageUri from wildcard expansion of src_uri. |
| 1185 src_uri_names_container: True if src_uri names a container (including the |
| 1186 case of a wildcard-named bucket subdir (like gs://bucket/abc, |
| 1187 where gs://bucket/abc/* matched some objects). Note that this is |
| 1188 additional semantics that src_uri.names_container() doesn't understand |
| 1189 because the latter only understands StorageUris, not wildcards. |
| 1190 src_uri_expands_to_multi: True if src_uri expanded to multiple URIs. |
| 1191 have_multiple_srcs: True if this is a multi-source request. This can be |
| 1192 true if src_uri wildcard-expanded to multiple URIs or if there were |
| 1193 multiple source URIs in the request. |
| 1194 exp_dst_uri: the expanded StorageUri requested for the cp destination. |
| 1195 Final written path is constructed from this plus a context-dependent |
| 1196 variant of src_uri. |
| 1197 have_existing_dest_subdir: bool indicator whether dest is an existing |
| 1198 subdirectory. |
| 1199 |
| 1200 Returns: |
| 1201 StorageUri to use for copy. |
| 1202 |
| 1203 Raises: |
| 1204 CommandException if destination object name not specified for |
| 1205 source and source is a stream. |
| 1206 """ |
| 1207 if self._ShouldTreatDstUriAsSingleton( |
| 1208 have_multiple_srcs, have_existing_dest_subdir, exp_dst_uri): |
| 1209 # We're copying one file or object to one file or object. |
| 1210 return exp_dst_uri |
| 1211 |
| 1212 if exp_src_uri.is_stream(): |
| 1213 if exp_dst_uri.names_container(): |
| 1214 raise CommandException('Destination object name needed when ' |
| 1215 'source is a stream') |
| 1216 return exp_dst_uri |
| 1217 |
| 1218 if not self.recursion_requested and not have_multiple_srcs: |
| 1219 # We're copying one file or object to a subdirectory. Append final comp |
| 1220 # of exp_src_uri to exp_dst_uri. |
| 1221 src_final_comp = exp_src_uri.object_name.rpartition(src_uri.delim)[-1] |
| 1222 return self.suri_builder.StorageUri('%s%s%s' % ( |
| 1223 exp_dst_uri.uri.rstrip(exp_dst_uri.delim), exp_dst_uri.delim, |
| 1224 src_final_comp)) |
| 1225 |
| 1226 # Else we're copying multiple sources to a directory, bucket, or a bucket |
| 1227 # "sub-directory". |
| 1228 |
| 1229 # Ensure exp_dst_uri ends in delim char if we're doing a multi-src copy or |
| 1230 # a copy to a directory. (The check for copying to a directory needs |
| 1231 # special-case handling so that the command: |
| 1232 # gsutil cp gs://bucket/obj dir |
| 1233 # will turn into file://dir/ instead of file://dir -- the latter would cause |
| 1234 # the file "dirobj" to be created.) |
| 1235 # Note: need to check have_multiple_srcs or src_uri.names_container() |
| 1236 # because src_uri could be a bucket containing a single object, named |
| 1237 # as gs://bucket. |
| 1238 if ((have_multiple_srcs or src_uri.names_container() |
| 1239 or os.path.isdir(exp_dst_uri.object_name)) |
| 1240 and not exp_dst_uri.uri.endswith(exp_dst_uri.delim)): |
| 1241 exp_dst_uri = exp_dst_uri.clone_replace_name( |
| 1242 '%s%s' % (exp_dst_uri.object_name, exp_dst_uri.delim) |
| 1243 ) |
| 1244 |
| 1245 # Making naming behavior match how things work with local Unix cp and mv |
| 1246 # operations depends on many factors, including whether the destination is a |
| 1247 # container, the plurality of the source(s), and whether the mv command is |
| 1248 # being used: |
| 1249 # 1. For the "mv" command that specifies a non-existent destination subdir, |
| 1250 # renaming should occur at the level of the src subdir, vs appending that |
| 1251 # subdir beneath the dst subdir like is done for copying. For example: |
| 1252 # gsutil rm -R gs://bucket |
| 1253 # gsutil cp -R cloudreader gs://bucket |
| 1254 # gsutil cp -R cloudauth gs://bucket/subdir1 |
| 1255 # gsutil mv gs://bucket/subdir1 gs://bucket/subdir2 |
| 1256 # would (if using cp naming behavior) end up with paths like: |
| 1257 # gs://bucket/subdir2/subdir1/cloudauth/.svn/all-wcprops |
| 1258 # whereas mv naming behavior should result in: |
| 1259 # gs://bucket/subdir2/cloudauth/.svn/all-wcprops |
| 1260 # 2. Copying from directories, buckets, or bucket subdirs should result in |
| 1261 # objects/files mirroring the source directory hierarchy. For example: |
| 1262 # gsutil cp dir1/dir2 gs://bucket |
| 1263 # should create the object gs://bucket/dir2/file2, assuming dir1/dir2 |
| 1264 # contains file2. |
| 1265 # To be consistent with Unix cp behavior, there's one more wrinkle when |
| 1266 # working with subdirs: The resulting object names depend on whether the |
| 1267 # destination subdirectory exists. For example, if gs://bucket/subdir |
| 1268 # exists, the command: |
| 1269 # gsutil cp -R dir1/dir2 gs://bucket/subdir |
| 1270 # should create objects named like gs://bucket/subdir/dir2/a/b/c. In |
| 1271 # contrast, if gs://bucket/subdir does not exist, this same command |
| 1272 # should create objects named like gs://bucket/subdir/a/b/c. |
| 1273 # 3. Copying individual files or objects to dirs, buckets or bucket subdirs |
| 1274 # should result in objects/files named by the final source file name |
| 1275 # component. Example: |
| 1276 # gsutil cp dir1/*.txt gs://bucket |
| 1277 # should create the objects gs://bucket/f1.txt and gs://bucket/f2.txt, |
| 1278 # assuming dir1 contains f1.txt and f2.txt. |
| 1279 |
| 1280 if (self.perform_mv and self.recursion_requested |
| 1281 and src_uri_expands_to_multi and not have_existing_dest_subdir): |
| 1282 # Case 1. Handle naming rules for bucket subdir mv. Here we want to |
| 1283 # line up the src_uri against its expansion, to find the base to build |
| 1284 # the new name. For example, running the command: |
| 1285 # gsutil mv gs://bucket/abcd gs://bucket/xyz |
| 1286 # when processing exp_src_uri=gs://bucket/abcd/123 |
| 1287 # exp_src_uri_tail should become /123 |
| 1288 # Note: mv.py code disallows wildcard specification of source URI. |
| 1289 exp_src_uri_tail = exp_src_uri.uri[len(src_uri.uri):] |
| 1290 dst_key_name = '%s/%s' % (exp_dst_uri.object_name.rstrip('/'), |
| 1291 exp_src_uri_tail.strip('/')) |
| 1292 return exp_dst_uri.clone_replace_name(dst_key_name) |
| 1293 |
| 1294 if src_uri_names_container and not exp_dst_uri.names_file(): |
| 1295 # Case 2. Build dst_key_name from subpath of exp_src_uri past |
| 1296 # where src_uri ends. For example, for src_uri=gs://bucket/ and |
| 1297 # exp_src_uri=gs://bucket/src_subdir/obj, dst_key_name should be |
| 1298 # src_subdir/obj. |
| 1299 src_uri_path_sans_final_dir = _GetPathBeforeFinalDir(src_uri) |
| 1300 dst_key_name = exp_src_uri.uri[ |
| 1301 len(src_uri_path_sans_final_dir):].lstrip(src_uri.delim) |
| 1302 # Handle case where dst_uri is a non-existent subdir. |
| 1303 if not have_existing_dest_subdir: |
| 1304 dst_key_name = dst_key_name.partition(exp_dst_uri.delim)[-1] |
| 1305 # Handle special case where src_uri was a directory named with '.' or |
| 1306 # './', so that running a command like: |
| 1307 # gsutil cp -r . gs://dest |
| 1308 # will produce obj names of the form gs://dest/abc instead of |
| 1309 # gs://dest/./abc. |
| 1310 if dst_key_name.startswith('./'): |
| 1311 dst_key_name = dst_key_name[2:] |
| 1312 |
| 1313 else: |
| 1314 # Case 3. |
| 1315 dst_key_name = exp_src_uri.object_name.rpartition(src_uri.delim)[-1] |
| 1316 |
| 1317 if (exp_dst_uri.is_file_uri() |
| 1318 or self._ShouldTreatDstUriAsBucketSubDir( |
| 1319 have_multiple_srcs, exp_dst_uri, have_existing_dest_subdir)): |
| 1320 if exp_dst_uri.object_name.endswith(exp_dst_uri.delim): |
| 1321 dst_key_name = '%s%s%s' % ( |
| 1322 exp_dst_uri.object_name.rstrip(exp_dst_uri.delim), |
| 1323 exp_dst_uri.delim, dst_key_name) |
| 1324 else: |
| 1325 dst_key_name = '%s%s' % (exp_dst_uri.object_name, dst_key_name) |
| 1326 |
| 1327 return exp_dst_uri.clone_replace_name(dst_key_name) |
| 1328 |
| 1329 def _FixWindowsNaming(self, src_uri, dst_uri): |
| 1330 """ |
| 1331 Rewrites the destination URI built by _ConstructDstUri() to translate |
| 1332 Windows pathnames to cloud pathnames if needed. |
| 1333 |
| 1334 Args: |
| 1335 src_uri: Source URI to be copied. |
| 1336 dst_uri: The destination URI built by _ConstructDstUri(). |
| 1337 |
| 1338 Returns: |
| 1339 StorageUri to use for copy. |
| 1340 """ |
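|          # Illustrative note (hypothetical names, not from the original source):
|          # on Windows, src_uri.delim is '\', so the dst_uri built by
|          # _ConstructDstUri() can look like gs://bucket/subdir\file.txt; the
|          # substitution below rewrites it to gs://bucket/subdir/file.txt.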
| 1341 if (src_uri.is_file_uri() and src_uri.delim == '\\' |
| 1342 and dst_uri.is_cloud_uri()): |
| 1343 trans_uri_str = re.sub(r'\\', '/', dst_uri.uri) |
| 1344 dst_uri = self.suri_builder.StorageUri(trans_uri_str) |
| 1345 return dst_uri |
| 1346 |
| 1347 # Command entry point. |
| 1348 def RunCommand(self): |
| 1349 |
| 1350 # Inner funcs. |
| 1351 def _CopyExceptionHandler(e): |
| 1352 """Simple exception handler to allow post-completion status.""" |
| 1353 self.THREADED_LOGGER.error(str(e)) |
| 1354 self.copy_failure_count += 1 |
| 1355 |
| 1356 def _CopyFunc(name_expansion_result): |
| 1357 """Worker function for performing the actual copy (and rm, for mv).""" |
| 1358 if self.perform_mv: |
| 1359 cmd_name = 'mv' |
| 1360 else: |
| 1361 cmd_name = self.command_name |
| 1362 src_uri = self.suri_builder.StorageUri( |
| 1363 name_expansion_result.GetSrcUriStr()) |
| 1364 exp_src_uri = self.suri_builder.StorageUri( |
| 1365 name_expansion_result.GetExpandedUriStr(), |
| 1366 parse_version=self.parse_versions) |
| 1367 src_uri_names_container = name_expansion_result.NamesContainer() |
| 1368 src_uri_expands_to_multi = name_expansion_result.NamesContainer() |
| 1369 have_multiple_srcs = name_expansion_result.IsMultiSrcRequest() |
| 1370 have_existing_dest_subdir = ( |
| 1371 name_expansion_result.HaveExistingDstContainer()) |
| 1372 |
| 1373 if src_uri.names_provider(): |
| 1374 raise CommandException( |
| 1375 'The %s command does not allow provider-only source URIs (%s)' % |
| 1376 (cmd_name, src_uri)) |
| 1377 if have_multiple_srcs: |
| 1378 self._InsistDstUriNamesContainer(exp_dst_uri, |
| 1379 have_existing_dst_container, |
| 1380 cmd_name) |
| 1381 |
| 1382 if self.perform_mv: |
| 1383 # Disallow files as source arguments to protect users from deleting |
| 1384 # data off the local disk. Note that we can't simply set FILE_URIS_OK |
| 1385 # to False in command_spec because we *do* allow a file URI for the dest |
| 1386 # URI. (We allow users to move data out of the cloud to the local disk, |
| 1387 # but we disallow commands that would delete data off the local disk, |
| 1388 # and instead require the user to delete data separately, using local |
| 1389 # commands/tools.) |
| 1390 if src_uri.is_file_uri(): |
| 1391 raise CommandException('The mv command disallows files as source ' |
| 1392 'arguments.\nDid you mean to use a gs:// URI? ' |
| 1393 'If you meant to use a file as a source, you\n' |
| 1394 'might consider using the "cp" command ' |
| 1395 'instead.') |
| 1396 if name_expansion_result.NamesContainer(): |
| 1397 # Use recursion_requested when performing name expansion for the |
| 1398 # directory mv case so we can determine if any of the source URIs are |
| 1399           # directories (and then use cp -R and rm -R to perform the move, to
| 1400           # match the behavior of Unix mv, which moves all of the contained
| 1401           # files when moving a directory).
| 1402 self.recursion_requested = True |
| 1403           # Disallow wildcard src URIs when moving directories, as supporting them
| 1404 # would make the name transformation too complex and would also be |
| 1405 # dangerous (e.g., someone could accidentally move many objects to the |
| 1406 # wrong name, or accidentally overwrite many objects). |
| 1407 if ContainsWildcard(src_uri): |
| 1408 raise CommandException('The mv command disallows naming source ' |
| 1409 'directories using wildcards') |
| 1410 |
| 1411 if (exp_dst_uri.is_file_uri() |
| 1412 and not os.path.exists(exp_dst_uri.object_name) |
| 1413 and have_multiple_srcs): |
| 1414 os.makedirs(exp_dst_uri.object_name) |
| 1415 |
| 1416 dst_uri = self._ConstructDstUri(src_uri, exp_src_uri, |
| 1417 src_uri_names_container, |
| 1418 src_uri_expands_to_multi, |
| 1419 have_multiple_srcs, exp_dst_uri, |
| 1420 have_existing_dest_subdir) |
| 1421 dst_uri = self._FixWindowsNaming(src_uri, dst_uri) |
| 1422 |
| 1423 self._CheckForDirFileConflict(exp_src_uri, dst_uri) |
| 1424 if self._SrcDstSame(exp_src_uri, dst_uri): |
| 1425 raise CommandException('%s: "%s" and "%s" are the same file - ' |
| 1426 'abort.' % (cmd_name, exp_src_uri, dst_uri)) |
| 1427 |
| 1428 elapsed_time = bytes_transferred = 0 |
| 1429 try: |
| 1430 (elapsed_time, bytes_transferred) = self._PerformCopy(exp_src_uri, |
| 1431 dst_uri) |
| 1432 except Exception, e: |
| 1433 if self._IsNoClobberServerException(e): |
| 1434 if not self.quiet: |
| 1435 self.THREADED_LOGGER.info('Rejected (noclobber): %s' % dst_uri.uri) |
| 1436 elif self.continue_on_error: |
| 1437 if not self.quiet: |
| 1438 self.THREADED_LOGGER.error('Error copying %s: %s' % (src_uri.uri, |
| 1439 str(e))) |
| 1440 self.copy_failure_count += 1 |
| 1441 else: |
| 1442 raise |
| 1443 |
| 1444 # TODO: If we ever use -n (noclobber) with -M (move) (not possible today |
| 1445 # since we call copy internally from move and don't specify the -n flag) |
| 1446 # we'll need to only remove the source when we have not skipped the |
| 1447 # destination. |
| 1448 if self.perform_mv: |
| 1449 if not self.quiet: |
| 1450 self.THREADED_LOGGER.info('Removing %s...', exp_src_uri) |
| 1451 exp_src_uri.delete_key(validate=False, headers=self.headers) |
| 1452 stats_lock.acquire() |
| 1453 self.total_elapsed_time += elapsed_time |
| 1454 self.total_bytes_transferred += bytes_transferred |
| 1455 stats_lock.release() |
| 1456 |
| 1457 # Start of RunCommand code. |
| 1458 self._ParseArgs() |
| 1459 |
| 1460 self.total_elapsed_time = self.total_bytes_transferred = 0 |
| 1461 if self.args[-1] == '-' or self.args[-1] == 'file://-': |
| 1462 self._HandleStreamingDownload() |
| 1463 return 0 |
| 1464 |
| 1465 if self.read_args_from_stdin: |
| 1466 if len(self.args) != 1: |
| 1467 raise CommandException('Source URIs cannot be specified with -I option') |
| 1468 uri_strs = self._StdinIterator() |
| 1469 else: |
| 1470 if len(self.args) < 2: |
| 1471 raise CommandException('Wrong number of arguments for "cp" command.') |
| 1472 uri_strs = self.args[0:len(self.args)-1] |
| 1473 |
| 1474 (exp_dst_uri, have_existing_dst_container) = self._ExpandDstUri( |
| 1475 self.args[-1]) |
| 1476 name_expansion_iterator = NameExpansionIterator( |
| 1477 self.command_name, self.proj_id_handler, self.headers, self.debug, |
| 1478 self.bucket_storage_uri_class, uri_strs, |
| 1479 self.recursion_requested or self.perform_mv, |
| 1480 have_existing_dst_container) |
| 1481 |
| 1482 # Use a lock to ensure accurate statistics in the face of |
| 1483 # multi-threading/multi-processing. |
| 1484 stats_lock = threading.Lock() |
| 1485 |
| 1486 # Tracks if any copies failed. |
| 1487 self.copy_failure_count = 0 |
| 1488 |
| 1489 # Start the clock. |
| 1490 start_time = time.time() |
| 1491 |
| 1492 # Tuple of attributes to share/manage across multiple processes in |
| 1493 # parallel (-m) mode. |
| 1494 shared_attrs = ('copy_failure_count', 'total_bytes_transferred') |
| 1495 |
| 1496 # Perform copy requests in parallel (-m) mode, if requested, using |
| 1497 # configured number of parallel processes and threads. Otherwise, |
| 1498 # perform requests with sequential function calls in current process. |
| 1499 self.Apply(_CopyFunc, name_expansion_iterator, _CopyExceptionHandler, |
| 1500 shared_attrs) |
| 1501 if self.debug: |
| 1502 print 'total_bytes_transferred:' + str(self.total_bytes_transferred) |
| 1503 |
| 1504 end_time = time.time() |
| 1505 self.total_elapsed_time = end_time - start_time |
| 1506 |
| 1507 if self.debug == 3: |
| 1508 # Note that this only counts the actual GET and PUT bytes for the copy |
| 1509 # - not any transfers for doing wildcard expansion, the initial HEAD |
| 1510 # request boto performs when doing a bucket.get_key() operation, etc. |
| 1511 if self.total_bytes_transferred != 0: |
| 1512 self.THREADED_LOGGER.info( |
| 1513 'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)', |
| 1514 self.total_bytes_transferred, self.total_elapsed_time, |
| 1515 MakeHumanReadable(float(self.total_bytes_transferred) / |
| 1516 float(self.total_elapsed_time))) |
| 1517 if self.copy_failure_count: |
| 1518 plural_str = '' |
| 1519 if self.copy_failure_count > 1: |
| 1520 plural_str = 's' |
| 1521 raise CommandException('%d file%s/object%s could not be transferred.' % ( |
| 1522 self.copy_failure_count, plural_str, plural_str)) |
| 1523 |
| 1524 return 0 |
| 1525 |
| 1526 # Test specification. See definition of test_steps in base class for |
| 1527 # details on how to populate these fields. |
| 1528 num_test_buckets = 3 |
| 1529 test_steps = [ |
| 1530 # (test name, cmd line, ret code, (result_file, expect_file)) |
| 1531 # |
| 1532 # Testing of no-clobber behavior. |
| 1533 ('upload', 'gsutil cp $F1 gs://$B1/$O1', 0, None), |
| 1534 ('setup noclobber', 'echo Skipping >noclob.ct', 0, None), |
| 1535 ('download', 'gsutil cp gs://$B1/$O1 $F9', 0, ('$F9', '$F1')), |
| 1536 ('noclobber upload', |
| 1537 'gsutil cp -n $F1 gs://$B1/$O1 2>&1 >/dev/null | cut -c1-8 >noclob.ct1', |
| 1538 0, ('noclob.ct', 'noclob.ct1')), |
| 1539 ('noclobber download', |
| 1540 'gsutil cp -n gs://$B1/$O1 $F9 2>&1 >/dev/null | cut -c1-8 >noclob.ct2', |
| 1541 0, ('noclob.ct', 'noclob.ct2')), |
| 1542 # |
| 1543 # Testing of copy-in-the-cloud: |
| 1544 ('noclobber setup copy-in-the-cloud', |
| 1545 'gsutil cp gs://$B1/$O1 gs://$B2', 0, None), |
| 1546 ('noclobber verify copy-in-the-cloud', |
| 1547 'gsutil cp -n gs://$B1/$O1 gs://$B2 2>&1 ' |
| 1548 '>/dev/null | cut -c1-8 > noclob.ct3', 0, ('noclob.ct', 'noclob.ct3')), |
| 1549 ('stream upload', 'cat $F1 | gsutil cp - gs://$B1/$O1', 0, None), |
| 1550 ('check stream upload', 'gsutil cp gs://$B1/$O1 $F9', 0, ('$F9', '$F1')), |
| 1551 # Clean up if we got interrupted. |
| 1552 ('remove test files', |
| 1553 'rm -f test.mp3 test_mp3.ct test.gif test_gif.ct test.foo noclob.ct*', |
| 1554 0, None), |
| 1555 # |
| 1556 # Testing of Content-Type detection: |
| 1557 ('setup mp3 file', 'cp $G/gslib/test_data/test.mp3 test.mp3', 0, None), |
| 1558 ('setup mp3 CT', 'echo audio/mpeg >test_mp3.ct', 0, None), |
| 1559 ('setup gif file', 'cp $G/gslib/test_data/test.gif test.gif', 0, None), |
| 1560 ('setup gif CT', 'echo image/gif >test_gif.ct', 0, None), |
| 1561 # TODO: we don't need test.app and test.bin anymore if |
| 1562 # USE_MAGICFILE=True. Implement a way to test both with and without using |
| 1563 # magic file. |
| 1564 #('setup app file', 'echo application/octet-stream >test.app', 0, None), |
| 1565 ('setup foo file', 'echo foo/bar >test.foo', 0, None), |
| 1566 ('upload mp3', 'gsutil cp test.mp3 gs://$B1/$O1', 0, None), |
| 1567 ('verify mp3', |
| 1568 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1569 0, ('$F1', 'test_mp3.ct')), |
| 1570 ('upload gif', 'gsutil cp test.gif gs://$B1/$O1', 0, None), |
| 1571 ('verify gif', |
| 1572 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1573 0, ('$F1', 'test_gif.ct')), |
| 1574 # TODO: The commented-out /noCT test below fails with USE_MAGICFILE=True. |
| 1575 ('upload mp3/noCT', |
| 1576 'gsutil -h "Content-Type:" cp test.mp3 gs://$B1/$O1', 0, None), |
| 1577 ('verify mp3/noCT', |
| 1578 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1579 0, ('$F1', 'test_mp3.ct')), |
| 1580 ('upload gif/noCT', |
| 1581 'gsutil -h "Content-Type:" cp test.gif gs://$B1/$O1', 0, None), |
| 1582 ('verify gif/noCT', |
| 1583 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1584 0, ('$F1', 'test_gif.ct')), |
| 1585 #('upload foo/noCT', 'gsutil -h "Content-Type:" cp test.foo gs://$B1/$O1', |
| 1586 # 0, None), |
| 1587 #('verify foo/noCT', |
| 1588 # 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1589 # 0, ('$F1', 'test_bin.ct')), |
| 1590 ('upload mp3/-h gif', |
| 1591 'gsutil -h "Content-Type:image/gif" cp test.mp3 gs://$B1/$O1', 0, None), |
| 1592 ('verify mp3/-h gif', |
| 1593 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1594 0, ('$F1', 'test_gif.ct')), |
| 1595 ('upload gif/-h gif', |
| 1596 'gsutil -h "Content-Type:image/gif" cp test.gif gs://$B1/$O1', 0, None), |
| 1597 ('verify gif/-h gif', |
| 1598 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1599 0, ('$F1', 'test_gif.ct')), |
| 1600 ('upload foo/-h gif', |
| 1601 'gsutil -h "Content-Type: image/gif" cp test.foo gs://$B1/$O1', 0, None), |
| 1602 ('verify foo/-h gif', |
| 1603 'gsutil ls -L gs://$B1/$O1 | grep Content-Type | cut -f3 >$F1', |
| 1604 0, ('$F1', 'test_gif.ct')), |
| 1605 ('remove test files', |
| 1606 'rm -f test.mp3 test_mp3.ct test.gif test_gif.ct test.foo', 0, None), |
| 1607 # |
| 1608 # Testing of versioning: |
| 1609     # To make the following tests simpler to understand, note that the only
| 1610     # prior write to bucket $B2 was by the copy-in-the-cloud tests above.
| 1611 ('enable versioning', 'gsutil setversioning on gs://$B2', 0, None), |
| 1612 ('upload new version', 'echo \'data1\' | gsutil cp - gs://$B2/$O3', 0, |
| 1613 None), |
| 1614 ('cloud cp new version', 'gsutil cp gs://$B2/$O0 gs://$B2/$O3', 0, None), |
| 1615 ('upload new version', 'echo \'data2\' | gsutil cp - gs://$B2/$O3', 0, |
| 1616 None), |
| 1617 ('get first generation name', 'gsutil ls -a gs://$B2/$O3 | head -n 1 > $F9', |
| 1618 0, None), |
| 1619 ('setup data1 file', 'echo \'data1\' > $F1', 0, None), |
| 1620 ('setup data2 file', 'echo \'data2\' > $F2', 0, None), |
| 1621 ('download current version', 'gsutil cp gs://$B2/$O3 $F3', 0, |
| 1622 ('$F3', '$F2')), |
| 1623 ('download first version', 'gsutil cp -v `cat $F9` $F3', 0, |
| 1624 ('$F3', '$F1')), |
| 1625     ('cp first version to current', 'gsutil cp -v `cat $F9` gs://$B2/$O3', 0,
| 1626 None), |
| 1627 ('download current version', 'gsutil cp gs://$B2/$O3 $F3', 0, |
| 1628 ('$F3', '$F1')), |
| 1629 ('remove versioning test files', 'rm -f $F3 $F9', 0, None), |
| 1630 # |
| 1631 # Testing of taking args from stdin: |
| 1632 ('remove bucket B2 contents', 'gsutil -m rm -r gs://$B2', 0, None), |
| 1633 ('setup inputlist file, pt1', 'echo $F1 > $F3', 0, None), |
| 1634     ('setup inputlist file, pt2', 'echo $F2 >> $F3', 0, None),
| 1635 ('setup expected ls output, pt1', 'echo gs://$B2/$F1 > $F4', 0, None), |
| 1636 ('setup expected ls output, pt2', 'echo gs://$B2/$F2 >> $F4', 0, None), |
| 1637 ('perform inputlist copy', 'gsutil cp -I gs://$B2 < $F3', 0, None), |
| 1638 ('verify inputlist copy', 'gsutil ls gs://$B2 > $F5', 0, |
| 1639 ('$F5', '$F4')), |
| 1640 ] |
| 1641 |
| 1642 def _ParseArgs(self): |
| 1643 self.perform_mv = False |
| 1644 self.exclude_symlinks = False |
| 1645 self.quiet = False |
| 1646 self.no_clobber = False |
| 1647 self.continue_on_error = False |
| 1648 self.read_args_from_stdin = False |
| 1649 # self.recursion_requested initialized in command.py (so can be checked |
| 1650 # in parent class for all commands). |
| 1651 if self.sub_opts: |
| 1652 for o, unused_a in self.sub_opts: |
| 1653 if o == '-c': |
| 1654 self.continue_on_error = True |
| 1655 elif o == '-e': |
| 1656 self.exclude_symlinks = True |
| 1657 elif o == '-I': |
| 1658 self.read_args_from_stdin = True |
| 1659 elif o == '-M': |
| 1660 # Note that we signal to the cp command to perform a move (copy |
| 1661 # followed by remove) and use directory-move naming rules by passing |
| 1662 # the undocumented (for internal use) -M option when running the cp |
| 1663 # command from mv.py. |
| 1664 self.perform_mv = True |
| 1665 elif o == '-n': |
| 1666 self.no_clobber = True |
| 1667 elif o == '-q': |
| 1668 self.quiet = True |
| 1669 elif o == '-r' or o == '-R': |
| 1670 self.recursion_requested = True |
| 1671 elif o == '-v': |
| 1672 self.parse_versions = True |
| 1673 |
| 1674 def _HandleStreamingDownload(self): |
| 1675 # Destination is <STDOUT>. Manipulate sys.stdout so as to redirect all |
| 1676 # debug messages to <STDERR>. |
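|          # For example (an illustrative sketch, not from the original source), a
|          # command like `gsutil cp gs://bucket/*.txt -` streams each matched object
|          # to the process's original stdout in turn, while debug output goes to
|          # stderr via the swap below.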
| 1677 stdout_fp = sys.stdout |
| 1678 sys.stdout = sys.stderr |
| 1679 did_some_work = False |
| 1680 for uri_str in self.args[0:len(self.args)-1]: |
| 1681 for uri in self.WildcardIterator(uri_str).IterUris(): |
| 1682 if not uri.names_object(): |
| 1683           raise CommandException('Destination stream requires that '
| 1684                                  'source URI %s represent an object.' % uri)
| 1685 did_some_work = True |
| 1686 key = uri.get_key(False, self.headers) |
| 1687 (elapsed_time, bytes_transferred) = self._PerformDownloadToStream( |
| 1688 key, uri, stdout_fp, self.headers) |
| 1689 self.total_elapsed_time += elapsed_time |
| 1690 self.total_bytes_transferred += bytes_transferred |
| 1691 if not did_some_work: |
| 1692 raise CommandException('No URIs matched') |
| 1693 if self.debug == 3: |
| 1694 if self.total_bytes_transferred != 0: |
| 1695 self.THREADED_LOGGER.info( |
| 1696 'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)', |
| 1697 self.total_bytes_transferred, self.total_elapsed_time, |
| 1698 MakeHumanReadable(float(self.total_bytes_transferred) / |
| 1699 float(self.total_elapsed_time))) |
| 
| 1700   def _StdinIterator(self):
| 1701     """A generator that yields lines read from stdin."""
| 1702     for line in sys.stdin:
| 1703       # Strip trailing whitespace, including any CRLF.
| 1704 yield line.rstrip() |
| 1705 |
| 1706 def _SrcDstSame(self, src_uri, dst_uri): |
| 1707 """Checks if src_uri and dst_uri represent the same object or file. |
| 1708 |
| 1709     We make no attempt to detect hard or symbolic links.
| 1710 |
| 1711 Args: |
| 1712 src_uri: Source StorageUri. |
| 1713 dst_uri: Destination StorageUri. |
| 1714 |
| 1715 Returns: |
| 1716 Bool indicator. |
| 1717 """ |
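|          # Illustrative example (hypothetical paths, not from the original source):
|          # for `gsutil cp ./file.txt file.txt`, the normalization below reduces both
|          # paths to 'file.txt', so the command is rejected as a same-file copy.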
| 1718 if src_uri.is_file_uri() and dst_uri.is_file_uri(): |
| 1719 # Translate a/b/./c to a/b/c, so src=dst comparison below works. |
| 1720 new_src_path = re.sub('%s+\.%s+' % (os.sep, os.sep), os.sep, |
| 1721 src_uri.object_name) |
| 1722       new_src_path = re.sub('^\.%s+' % os.sep, '', new_src_path)
| 1723 new_dst_path = re.sub('%s+\.%s+' % (os.sep, os.sep), os.sep, |
| 1724 dst_uri.object_name) |
| 1725       new_dst_path = re.sub('^\.%s+' % os.sep, '', new_dst_path)
| 1726 return (src_uri.clone_replace_name(new_src_path).uri == |
| 1727 dst_uri.clone_replace_name(new_dst_path).uri) |
| 1728 else: |
| 1729 return (src_uri.uri == dst_uri.uri and |
| 1730 src_uri.generation == dst_uri.generation and |
| 1731 src_uri.version_id == dst_uri.version_id) |
| 1732 |
| 1733 def _ShouldTreatDstUriAsBucketSubDir(self, have_multiple_srcs, dst_uri, |
| 1734 have_existing_dest_subdir): |
| 1735 """ |
| 1736 Checks whether dst_uri should be treated as a bucket "sub-directory". The |
| 1737 decision about whether something constitutes a bucket "sub-directory" |
| 1738 depends on whether there are multiple sources in this request and whether |
| 1739 there is an existing bucket subdirectory. For example, when running the |
| 1740 command: |
| 1741 gsutil cp file gs://bucket/abc |
| 1742 if there's no existing gs://bucket/abc bucket subdirectory we should copy |
| 1743 file to the object gs://bucket/abc. In contrast, if |
| 1744 there's an existing gs://bucket/abc bucket subdirectory we should copy |
| 1745 file to gs://bucket/abc/file. And regardless of whether gs://bucket/abc |
| 1746 exists, when running the command: |
| 1747 gsutil cp file1 file2 gs://bucket/abc |
| 1748 we should copy file1 to gs://bucket/abc/file1 (and similarly for file2). |
| 1749 |
| 1750 Note that we don't disallow naming a bucket "sub-directory" where there's |
| 1751 already an object at that URI. For example it's legitimate (albeit |
| 1752 confusing) to have an object called gs://bucket/dir and |
| 1753 then run the command |
| 1754 gsutil cp file1 file2 gs://bucket/dir |
| 1755 Doing so will end up with objects gs://bucket/dir, gs://bucket/dir/file1, |
| 1756 and gs://bucket/dir/file2. |
| 1757 |
| 1758 Args: |
| 1759 have_multiple_srcs: Bool indicator of whether this is a multi-source |
| 1760 operation. |
| 1761 dst_uri: StorageUri to check. |
| 1762 have_existing_dest_subdir: bool indicator whether dest is an existing |
| 1763 subdirectory. |
| 1764 |
| 1765 Returns: |
| 1766 bool indicator. |
| 1767 """ |
| 1768 return ((have_multiple_srcs and dst_uri.is_cloud_uri()) |
| 1769 or (have_existing_dest_subdir)) |
| 1770 |
| 1771 def _ShouldTreatDstUriAsSingleton(self, have_multiple_srcs, |
| 1772 have_existing_dest_subdir, dst_uri): |
| 1773 """ |
| 1774 Checks that dst_uri names a singleton (file or object) after |
| 1775 dir/wildcard expansion. The decision is more nuanced than simply |
| 1776     dst_uri.names_singleton() because of the possibility that an object path
| 1777 might name a bucket sub-directory. |
| 1778 |
| 1779 Args: |
| 1780 have_multiple_srcs: Bool indicator of whether this is a multi-source |
| 1781 operation. |
| 1782 have_existing_dest_subdir: bool indicator whether dest is an existing |
| 1783 subdirectory. |
| 1784 dst_uri: StorageUri to check. |
| 1785 |
| 1786 Returns: |
| 1787 bool indicator. |
| 1788 """ |
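|          # Illustrative examples (hypothetical URIs, not from the original source):
|          # `gsutil cp f1 f2 gs://bucket/obj` does not treat gs://bucket/obj as a
|          # singleton (multiple sources, cloud destination), while
|          # `gsutil cp f1 gs://bucket/obj` does, provided gs://bucket/obj is not an
|          # existing "sub-directory".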
| 1789 if have_multiple_srcs: |
| 1790 # Only a file meets the criteria in this case. |
| 1791 return dst_uri.names_file() |
| 1792 return not have_existing_dest_subdir and dst_uri.names_singleton() |
| 1793 |
| 1794 def _IsNoClobberServerException(self, e): |
| 1795 """ |
| 1796     Checks whether a failed copy was rejected by the server (412 Precondition
| 1797     Failed) because we asked that an existing destination not be clobbered.
| 1798 
| 1799     Args:
| 1800       e: The exception raised by the failed copy operation.
| 1801 
| 1802     Returns:
| 1803       bool indicator - True means the server rejected the copy because it would
| 1804       have clobbered an existing object.
| 1805 """ |
| 1806 return self.no_clobber and ( |
| 1807 (isinstance(e, GSResponseError) and e.status==412) or |
| 1808 (isinstance(e, ResumableUploadException) and 'code 412' in e.message)) |
| 1809 |
| 1810 def _GetPathBeforeFinalDir(uri): |
| 1811 """ |
| 1812 Returns the part of the path before the final directory component for the |
| 1813   given URI, handling cases for file system directories, buckets, and bucket
| 1814   subdirectories. Example: for gs://bucket/dir/ we'll return 'gs://bucket',
| 1815   and for file://dir we'll return 'file://'.
| 1816 |
| 1817 Args: |
| 1818 uri: StorageUri. |
| 1819 |
| 1820 Returns: |
| 1821 String name of above-described path, sans final path separator. |
| 1822 """ |
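|        # Illustrative mapping of the branches below (hypothetical URIs, not from
|        # the original source):
|        #   file://a/b/dir        -> 'file://a/b'       (local directory)
|        #   gs://bucket           -> 'gs://'            (bucket)
|        #   gs://bucket/dir1/dir2 -> 'gs://bucket/dir1' (bucket sub-directory)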
| 1823 sep = uri.delim |
| 1824 assert not uri.names_file() |
| 1825 if uri.names_directory(): |
| 1826 past_scheme = uri.uri[len('file://'):] |
| 1827 if past_scheme.find(sep) == -1: |
| 1828 return 'file://' |
| 1829 else: |
| 1830 return 'file://%s' % past_scheme.rstrip(sep).rpartition(sep)[0] |
| 1831 if uri.names_bucket(): |
| 1832 return '%s://' % uri.scheme |
| 1833 # Else it names a bucket subdir. |
| 1834 return uri.uri.rstrip(sep).rpartition(sep)[0] |
| 1835 |
| 1836 def _hash_filename(filename): |
| 1837 """ |
| 1838 Apply a hash function (SHA1) to shorten the passed file name. The spec |
| 1839 for the hashed file name is as follows: |
| 1840 |
| 1841       TRACKER_<hash>.<trailing>
| 1842 |
| 1843 where hash is a SHA1 hash on the original file name and trailing is |
| 1844 the last 16 chars from the original file name. Max file name lengths |
| 1845 vary by operating system so the goal of this function is to ensure |
| 1846 the hashed version takes fewer than 100 characters. |
| 1847 |
| 1848 Args: |
| 1849 filename: file name to be hashed. |
| 1850 |
| 1851 Returns: |
| 1852 shorter, hashed version of passed file name |
| 1853 """ |
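|        # Illustrative example (hypothetical file name, not from the original source):
|        #   _hash_filename('/tmp/some_download.tracker')
|        # returns 'TRACKER_' + <40-char SHA1 hex digest> + '.' + 'download.tracker',
|        # i.e. 8 + 40 + 1 + 16 = 65 characters, well under the 100-character goal.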
| 1854 m = hashlib.sha1(filename.encode('utf-8')) |
| 1855 hashed_name = ("TRACKER_" + m.hexdigest() + '.' + filename[-16:]) |
| 1856 return hashed_name |