OLD | NEW |
| (Empty) |
1 # Copyright 2011 Google Inc. All Rights Reserved. | |
2 # Copyright 2011, Nexenta Systems Inc. | |
3 # | |
4 # Licensed under the Apache License, Version 2.0 (the "License"); | |
5 # you may not use this file except in compliance with the License. | |
6 # You may obtain a copy of the License at | |
7 # | |
8 # http://www.apache.org/licenses/LICENSE-2.0 | |
9 # | |
10 # Unless required by applicable law or agreed to in writing, software | |
11 # distributed under the License is distributed on an "AS IS" BASIS, | |
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 # See the License for the specific language governing permissions and | |
14 # limitations under the License. | |
15 | |
16 import boto | |
17 import errno | |
18 import gzip | |
19 import hashlib | |
20 import mimetypes | |
21 import os | |
22 import platform | |
23 import re | |
24 import subprocess | |
25 import stat | |
26 import sys | |
27 import tempfile | |
28 import threading | |
29 import time | |
30 | |
31 from boto import config | |
32 from boto.exception import GSResponseError | |
33 from boto.exception import ResumableUploadException | |
34 from boto.gs.resumable_upload_handler import ResumableUploadHandler | |
35 from boto.s3.keyfile import KeyFile | |
36 from boto.s3.resumable_download_handler import ResumableDownloadHandler | |
37 from boto.storage_uri import BucketStorageUri | |
38 from gslib.command import COMMAND_NAME | |
39 from gslib.command import COMMAND_NAME_ALIASES | |
40 from gslib.command import CONFIG_REQUIRED | |
41 from gslib.command import Command | |
42 from gslib.command import FILE_URIS_OK | |
43 from gslib.command import MAX_ARGS | |
44 from gslib.command import MIN_ARGS | |
45 from gslib.command import PROVIDER_URIS_OK | |
46 from gslib.command import SUPPORTED_SUB_ARGS | |
47 from gslib.command import URIS_START_ARG | |
48 from gslib.exception import CommandException | |
49 from gslib.help_provider import HELP_NAME | |
50 from gslib.help_provider import HELP_NAME_ALIASES | |
51 from gslib.help_provider import HELP_ONE_LINE_SUMMARY | |
52 from gslib.help_provider import HELP_TEXT | |
53 from gslib.help_provider import HELP_TYPE | |
54 from gslib.help_provider import HelpType | |
55 from gslib.name_expansion import NameExpansionIterator | |
56 from gslib.util import ExtractErrorDetail | |
57 from gslib.util import IS_WINDOWS | |
58 from gslib.util import MakeHumanReadable | |
59 from gslib.util import NO_MAX | |
60 from gslib.util import TWO_MB | |
61 from gslib.wildcard_iterator import ContainsWildcard | |
62 | |
63 _detailed_help_text = (""" | |
64 <B>SYNOPSIS</B> | |
65 gsutil cp [OPTION]... src_uri dst_uri | |
66 - or - | |
67 gsutil cp [OPTION]... src_uri... dst_uri | |
68 - or - | |
69 gsutil cp [OPTION]... -I dst_uri | |
70 | |
71 | |
72 <B>DESCRIPTION</B> | |
73 The gsutil cp command allows you to copy data between your local file | |
74 system and the cloud, copy data within the cloud, and copy data between | |
75 cloud storage providers. For example, to copy all text files from the | |
76 local directory to a bucket you could do: | |
77 | |
78 gsutil cp *.txt gs://my_bucket | |
79 | |
80 Similarly, you can download text files from a bucket by doing: | |
81 | |
82 gsutil cp gs://my_bucket/*.txt . | |
83 | |
84 If you want to copy an entire directory tree you need to use the -R option: | |
85 | |
86 gsutil cp -R dir gs://my_bucket | |
87 | |
88 If you have a large number of files to upload you might want to use the | |
89 gsutil -m option, to perform a parallel (multi-threaded/multi-processing) | |
90 copy: | |
91 | |
92 gsutil -m cp -R dir gs://my_bucket | |
93 | |
94 You can pass a list of URIs to copy on STDIN instead of as command line | |
95 arguments by using the -I option. This allows you to use gsutil in a | |
96 pipeline to copy files and objects as generated by a program, such as: | |
97 | |
98 some_program | gsutil -m cp -I gs://my_bucket | |
99 | |
100 The contents of STDIN can name files, cloud URIs, and wildcards of files | |
101 and cloud URIs. | |
102 | |
103 | |
104 <B>HOW NAMES ARE CONSTRUCTED</B> | |
105 The gsutil cp command strives to name objects in a way consistent with how | |
106 Linux cp works, which causes names to be constructed in varying ways depending | |
107 on whether you're performing a recursive directory copy or copying | |
108 individually named objects; and whether you're copying to an existing or | |
109 non-existent directory. | |
110 | |
111 When performing recursive directory copies, object names are constructed | |
112 that mirror the source directory structure starting at the point of | |
113 recursive processing. For example, the command: | |
114 | |
115 gsutil cp -R dir1/dir2 gs://my_bucket | |
116 | |
117 will create objects named like gs://my_bucket/dir2/a/b/c, assuming | |
118 dir1/dir2 contains the file a/b/c. | |
119 | |
120 In contrast, copying individually named files will result in objects named | |
121 by the final path component of the source files. For example, the command: | |
122 | |
123 gsutil cp dir1/dir2/** gs://my_bucket | |
124 | |
125 will create objects named like gs://my_bucket/c. | |
126 | |
127 The same rules apply for downloads: recursive copies of buckets and | |
128 bucket subdirectories produce a mirrored filename structure, while copying | |
129 individually (or wildcard) named objects produces flatly named files. | |
130 | |
131 Note that in the above example the '**' wildcard matches all names | |
132 anywhere under dir. The wildcard '*' will match names just one level deep. For | |
133 more details see 'gsutil help wildcards'. | |
134 | |
135 There's an additional wrinkle when working with subdirectories: the resulting | |
136 names depend on whether the destination subdirectory exists. For example, | |
137 if gs://my_bucket/subdir exists as a subdirectory, the command: | |
138 | |
139 gsutil cp -R dir1/dir2 gs://my_bucket/subdir | |
140 | |
141 will create objects named like gs://my_bucket/subdir/dir2/a/b/c. In contrast, | |
142 if gs://my_bucket/subdir does not exist, this same gsutil cp command will | |
143 create objects named like gs://my_bucket/subdir/a/b/c. | |
144 | |
145 | |
146 <B>COPYING TO/FROM SUBDIRECTORIES; DISTRIBUTING TRANSFERS ACROSS MACHINES</B> | |
147 You can use gsutil to copy to and from subdirectories by using a command like: | |
148 | |
149 gsutil cp -R dir gs://my_bucket/data | |
150 | |
151 This will cause dir and all of its files and nested subdirectories to be | |
152 copied under the specified destination, resulting in objects with names like | |
153 gs://my_bucket/data/dir/a/b/c. Similarly you can download from bucket | |
154 subdirectories by using a command like: | |
155 | |
156 gsutil cp -R gs://my_bucket/data dir | |
157 | |
158 This will cause everything nested under gs://my_bucket/data to be downloaded | |
159 into dir, resulting in files with names like dir/data/a/b/c. | |
160 | |
161 Copying subdirectories is useful if you want to add data to an existing | |
162 bucket directory structure over time. It's also useful if you want | |
163 to parallelize uploads and downloads across multiple machines (often | |
164 reducing overall transfer time compared with simply running gsutil -m | |
165 cp on one machine). For example, if your bucket contains this structure: | |
166 | |
167 gs://my_bucket/data/result_set_01/ | |
168 gs://my_bucket/data/result_set_02/ | |
169 ... | |
170 gs://my_bucket/data/result_set_99/ | |
171 | |
172 you could perform concurrent downloads across 3 machines by running these | |
173 commands on each machine, respectively: | |
174 | |
175 gsutil -m cp -R gs://my_bucket/data/result_set_[0-3]* dir | |
176 gsutil -m cp -R gs://my_bucket/data/result_set_[4-6]* dir | |
177 gsutil -m cp -R gs://my_bucket/data/result_set_[7-9]* dir | |
178 | |
179 Note that dir could be a local directory on each machine, or it could | |
180 be a directory mounted off of a shared file server; whether the latter | |
181 performs acceptably may depend on a number of things, so we recommend | |
182 you experiment and find out what works best for you. | |
183 | |
184 | |
185 <B>COPYING IN THE CLOUD AND METADATA PRESERVATION</B> | |
186 If both the source and destination URI are cloud URIs from the same | |
187 provider, gsutil copies data "in the cloud" (i.e., without downloading | |
188 to and uploading from the machine where you run gsutil). In addition to | |
189 the performance and cost advantages of doing this, copying in the cloud | |
190 preserves metadata (like Content-Type and Cache-Control). In contrast, | |
191 when you download data from the cloud it ends up in a file, which has | |
192 no associated metadata. Thus, unless you have some way to hold on to | |
193 or re-create that metadata, downloading to a file will not retain the | |
194 metadata. | |
195 | |
196 Note that by default, the gsutil cp command does not copy the object | |
197 ACL to the new object, and instead will use the default bucket ACL (see | |
198 "gsutil help setdefacl"). You can override this behavior with the -p | |
199 option (see OPTIONS below). | |
200 | |
201 gsutil does not preserve metadata when copying objects between providers. | |
202 | |
203 | |
204 <B>RESUMABLE TRANSFERS</B> | |
205 gsutil automatically uses the Google Cloud Storage resumable upload | |
206 feature whenever you use the cp command to upload an object that is larger | |
207 than 2 MB. You do not need to specify any special command line options | |
208 to make this happen. If your upload is interrupted you can restart the | |
209 upload by running the same cp command that you ran to start the upload. | |
210 | |
211 Similarly, gsutil automatically performs resumable downloads (using HTTP | |
212 standard Range GET operations) whenever you use the cp command to download an | |
213 object larger than 2 MB. | |
214 | |
215 Resumable uploads and downloads store some state information in a file | |
216 in ~/.gsutil named by the destination object or file. If you attempt to | |
217 resume a transfer from a machine with a different directory, the transfer | |
218 will start over from scratch. | |
219 | |
220 See also "gsutil help prod" for details on using resumable transfers | |
221 in production. | |
222 | |
223 | |
224 <B>STREAMING TRANSFERS</B> | |
225 Use '-' in place of src_uri or dst_uri to perform a streaming | |
226 transfer. For example: | |
227 long_running_computation | gsutil cp - gs://my_bucket/obj | |
228 | |
229 Streaming transfers do not support resumable uploads/downloads. | |
230 (The Google resumable transfer protocol has a way to support streaming | |
231 transfers, but gsutil doesn't currently implement support for this.) | |
232 | |
233 | |
234 <B>CHANGING TEMP DIRECTORIES</B> | |
235 gsutil writes data to a temporary directory in several cases: | |
236 - when compressing data to be uploaded (see the -z option) | |
237 - when decompressing data being downloaded (when the data has | |
238 Content-Encoding:gzip, e.g., as happens when uploaded using gsutil cp -z) | |
239 - when running integration tests (using the gsutil test command) | |
240 | |
241 In these cases it's possible the temp file location on your system that | |
242 gsutil selects by default may not have enough space. If you find that | |
243 gsutil runs out of space during one of these operations (e.g., raising | |
244 "CommandException: Inadequate temp space available to compress <your file>" | |
245 during a gsutil cp -z operation), you can change where it writes these | |
246 temp files by setting the TMPDIR environment variable. On Linux and MacOS | |
247 you can do this either by running gsutil this way: | |
248 | |
249 TMPDIR=/some/directory gsutil cp ... | |
250 | |
251 or by adding this line to your ~/.bashrc file and then restarting the shell | |
252 before running gsutil: | |
253 | |
254 export TMPDIR=/some/directory | |
255 | |
256 On Windows 7 you can change the TMPDIR environment variable from Start -> | |
257 Computer -> System -> Advanced System Settings -> Environment Variables. | |
258 You need to reboot after making this change for it to take effect. (Rebooting | |
259 is not necessary after running the export command on Linux and MacOS.) | |
260 | |
261 | |
262 <B>OPTIONS</B> | |
263 -a canned_acl Sets named canned_acl when uploaded objects created. See | |
264 'gsutil help acls' for further details. | |
265 | |
266 -c If an error occurs, continue to attempt to copy the remaining | |
267 files. | |
268 | |
269 -D Copy in "daisy chain" mode, i.e., copying between two buckets by | |
270 hooking a download to an upload, via the machine where gsutil is | |
271 run. By default, data are copied between two buckets "in the | |
272 cloud", i.e., without needing to copy via the machine where | |
273 gsutil runs. However, copy-in-the-cloud is not supported when | |
274 copying between different locations (like US and EU) or between | |
275 different storage classes (like STANDARD and | |
276 DURABLE_REDUCED_AVAILABILITY). For these cases, you can use the | |
277 -D option to copy data between buckets. | |
278 Note: Daisy chain mode is automatically used when copying | |
279 between providers (e.g., to copy data from Google Cloud Storage | |
280 to another provider). | |
281 | |
282 -e Exclude symlinks. When specified, symbolic links will not be | |
283 copied. | |
284 | |
285 -n No-clobber. When specified, existing files or objects at the | |
286 destination will not be overwritten. Any items that are skipped | |
287 by this option will be reported as being skipped. This option | |
288 will perform an additional HEAD request to check if an item | |
289 exists before attempting to upload the data. This will save | |
290 retransmitting data, but the additional HTTP requests may make | |
291 small object transfers slower and more expensive. | |
292 | |
293 This option can be combined with the -c option to build a script | |
294 that copies a large number of objects, allowing retries when | |
295 some failures occur from which gsutil doesn't automatically | |
296 recover, using a bash script like the following: | |
297 | |
298 status=1 | |
299 while [ $status -ne 0 ] ; do | |
300 gsutil cp -c -n -R ./dir gs://bucket | |
301 status=$? | |
302 done | |
303 | |
304 The -c option will cause copying to continue after failures | |
305 occur, and the -n option will cause objects already copied to be | |
306 skipped on subsequent iterations. The loop will continue running | |
307 as long as gsutil exits with a non-zero status (such a status | |
308 indicates there was at least one failure during the gsutil run). | |
309 | |
310 -p Causes ACLs to be preserved when copying in the cloud. Note that | |
311 this option has performance and cost implications, because it | |
312 is essentially performing three requests (getacl, cp, setacl). | |
313 (The performance issue can be mitigated to some degree by | |
314 using gsutil -m cp to cause parallel copying.) | |
315 | |
316 You can avoid the additional performance and cost of using cp -p | |
317 if you want all objects in the destination bucket to end up with | |
318 the same ACL by setting a default ACL on that bucket instead of | |
319 using cp -p. See "help gsutil setdefacl". | |
320 | |
321 Note that it's not valid to specify both the -a and -p options | |
322 together. | |
323 | |
324 -q Causes copies to be performed quietly, i.e., without reporting | |
325 progress indicators of files being copied. Errors are still | |
326 reported. This option can be useful for running gsutil from a | |
327 cron job that logs its output to a file, for which the only | |
328 information desired in the log is failures. | |
329 | |
330 -R, -r Causes directories, buckets, and bucket subdirectories to be | |
331 copied recursively. If you neglect to use this option for | |
332 an upload, gsutil will copy any files it finds and skip any | |
333 directories. Similarly, neglecting to specify -R for a download | |
334 will cause gsutil to copy any objects at the current bucket | |
335 directory level, and skip any subdirectories. | |
336 | |
337 -v Requests that the version-specific URI for each uploaded object | |
338 be printed. Given this URI you can make future upload requests | |
339 that are safe in the face of concurrent updates, because Google | |
340 Cloud Storage will refuse to perform the update if the current | |
341 object version doesn't match the version-specific URI. See | |
342 'gsutil help versioning' for more details. Note: at present this | |
343 option does not work correctly for objects copied "in the cloud" | |
344 (e.g., gsutil cp gs://bucket/obj1 gs://bucket/obj2). | |
345 | |
346 -z ext1,... Compresses file uploads with the given extensions. If you are | |
347 uploading a large file with compressible content, such as | |
348 a .js, .css, or .html file, you can gzip-compress the file | |
349 during the upload process by specifying the -z <extensions> | |
350 option. Compressing data before upload saves on usage charges | |
351 because you are uploading a smaller amount of data. | |
352 | |
353 When you specify the -z option, the data from your files is | |
354 compressed before it is uploaded, but your actual files are left | |
355 uncompressed on the local disk. The uploaded objects retain the | |
356 original content type and name as the original files but are | |
357 given a Content-Encoding header with the value "gzip" to | |
358 indicate that the object data stored are compressed on the | |
359 Google Cloud Storage servers. | |
360 | |
361 For example, the following command: | |
362 | |
363 gsutil cp -z html -a public-read cattypes.html gs://mycats | |
364 | |
365 will do all of the following: | |
366 - Upload as the object gs://mycats/cattypes.html (cp command) | |
367 - Set the Content-Type to text/html (based on file extension) | |
368 - Compress the data in the file cattypes.html (-z option) | |
369 - Set the Content-Encoding to gzip (-z option) | |
370 - Set the ACL to public-read (-a option) | |
371 - If a user tries to view cattypes.html in a browser, the | |
372 browser will know to uncompress the data based on the | |
373 Content-Encoding header, and to render it as HTML based on | |
374 the Content-Type header. | |
375 """) | |
376 | |
class CpCommand(Command):
  """
  Implementation of gsutil cp command.

  Note that CpCommand is run for both gsutil cp and gsutil mv. The latter
  happens by MvCommand calling CpCommand and passing the hidden (undocumented)
  -M option. This allows the copy and remove needed for each mv to run
  together (rather than first running all the cp's and then all the rm's, as
  we originally had implemented), which in turn avoids the following problem
  with removing the wrong objects: starting with a bucket containing only
  the object gs://bucket/obj, say the user does:
    gsutil mv gs://bucket/* gs://bucket/d.txt
  If we ran all the cp's and then all the rm's and we didn't expand the wildcard
  first, the cp command would first copy gs://bucket/obj to gs://bucket/d.txt,
  and the rm command would then remove that object. In the implementation
  prior to gsutil release 3.12 we avoided this by building a list of objects
  to process and then running the copies and then the removes; but building
  the list up front limits scalability (compared with the current approach
  of processing the bucket listing iterator on the fly).
  """

  # Set default Content-Type type, used for uploads when no type can be
  # inferred from the file extension.
  DEFAULT_CONTENT_TYPE = 'application/octet-stream'
  # Whether to use the magic-file facility for Content-Type detection,
  # as configured in the boto config ([GSUtil] use_magicfile).
  USE_MAGICFILE = boto.config.getbool('GSUtil', 'use_magicfile', False)

  # Command specification (processed by parent class).
  command_spec = {
    # Name of command.
    COMMAND_NAME : 'cp',
    # List of command name aliases.
    COMMAND_NAME_ALIASES : ['copy'],
    # Min number of args required by this command.
    MIN_ARGS : 1,
    # Max number of args required by this command, or NO_MAX.
    MAX_ARGS : NO_MAX,
    # Getopt-style string specifying acceptable sub args.
    # -t is deprecated but leave intact for now to avoid breakage.
    SUPPORTED_SUB_ARGS : 'a:cDeIMNnpqrRtvz:',
    # True if file URIs acceptable for this command.
    FILE_URIS_OK : True,
    # True if provider-only URIs acceptable for this command.
    PROVIDER_URIS_OK : False,
    # Index in args of first URI arg.
    URIS_START_ARG : 0,
    # True if must configure gsutil before running command.
    CONFIG_REQUIRED : True,
  }
  help_spec = {
    # Name of command or auxiliary help info for which this help applies.
    HELP_NAME : 'cp',
    # List of help name aliases.
    HELP_NAME_ALIASES : ['copy'],
    # Type of help:
    HELP_TYPE : HelpType.COMMAND_HELP,
    # One line summary of this help.
    HELP_ONE_LINE_SUMMARY : 'Copy files and objects',
    # The full help text.
    HELP_TEXT : _detailed_help_text,
  }
436 | |
437 def _CheckFinalMd5(self, key, file_name): | |
438 """ | |
439 Checks that etag from server agrees with md5 computed after the | |
440 download completes. | |
441 """ | |
442 obj_md5 = key.etag.strip('"\'') | |
443 file_md5 = None | |
444 | |
445 if hasattr(key, 'md5') and key.md5: | |
446 file_md5 = key.md5 | |
447 else: | |
448 print 'Computing MD5 from scratch for resumed download' | |
449 | |
450 # Open file in binary mode to avoid surprises in Windows. | |
451 fp = open(file_name, 'rb') | |
452 try: | |
453 file_md5 = key.compute_md5(fp)[0] | |
454 finally: | |
455 fp.close() | |
456 | |
457 if self.debug: | |
458 print 'Checking file md5 against etag. (%s/%s)' % (file_md5, obj_md5) | |
459 if file_md5 != obj_md5: | |
460 # Checksums don't match - remove file and raise exception. | |
461 os.unlink(file_name) | |
462 raise CommandException( | |
463 'File changed during download: md5 signature doesn\'t match ' | |
464 'etag (incorrect downloaded file deleted)') | |
465 | |
466 def _CheckForDirFileConflict(self, exp_src_uri, dst_uri): | |
467 """Checks whether copying exp_src_uri into dst_uri is not possible. | |
468 | |
469 This happens if a directory exists in local file system where a file | |
470 needs to go or vice versa. In that case we print an error message and | |
471 exits. Example: if the file "./x" exists and you try to do: | |
472 gsutil cp gs://mybucket/x/y . | |
473 the request can't succeed because it requires a directory where | |
474 the file x exists. | |
475 | |
476 Note that we don't enforce any corresponding restrictions for buckets, | |
477 because the flat namespace semantics for buckets doesn't prohibit such | |
478 cases the way hierarchical file systems do. For example, if a bucket | |
479 contains an object called gs://bucket/dir and then you run the command: | |
480 gsutil cp file1 file2 gs://bucket/dir | |
481 you'll end up with objects gs://bucket/dir, gs://bucket/dir/file1, and | |
482 gs://bucket/dir/file2. | |
483 | |
484 Args: | |
485 exp_src_uri: Expanded source StorageUri of copy. | |
486 dst_uri: Destination URI. | |
487 | |
488 Raises: | |
489 CommandException: if errors encountered. | |
490 """ | |
491 if dst_uri.is_cloud_uri(): | |
492 # The problem can only happen for file destination URIs. | |
493 return | |
494 dst_path = dst_uri.object_name | |
495 final_dir = os.path.dirname(dst_path) | |
496 if os.path.isfile(final_dir): | |
497 raise CommandException('Cannot retrieve %s because a file exists ' | |
498 'where a directory needs to be created (%s).' % | |
499 (exp_src_uri, final_dir)) | |
500 if os.path.isdir(dst_path): | |
501 raise CommandException('Cannot retrieve %s because a directory exists ' | |
502 '(%s) where the file needs to be created.' % | |
503 (exp_src_uri, dst_path)) | |
504 | |
505 def _InsistDstUriNamesContainer(self, exp_dst_uri, | |
506 have_existing_dst_container, command_name): | |
507 """ | |
508 Raises an exception if URI doesn't name a directory, bucket, or bucket | |
509 subdir, with special exception for cp -R (see comments below). | |
510 | |
511 Args: | |
512 exp_dst_uri: Wildcard-expanding dst_uri. | |
513 have_existing_dst_container: bool indicator of whether exp_dst_uri | |
514 names a container (directory, bucket, or existing bucket subdir). | |
515 command_name: Name of command making call. May not be the same as | |
516 self.command_name in the case of commands implemented atop other | |
517 commands (like mv command). | |
518 | |
519 Raises: | |
520 CommandException: if the URI being checked does not name a container. | |
521 """ | |
522 if exp_dst_uri.is_file_uri(): | |
523 ok = exp_dst_uri.names_directory() | |
524 else: | |
525 if have_existing_dst_container: | |
526 ok = True | |
527 else: | |
528 # It's ok to specify a non-existing bucket subdir, for example: | |
529 # gsutil cp -R dir gs://bucket/abc | |
530 # where gs://bucket/abc isn't an existing subdir. | |
531 ok = exp_dst_uri.names_object() | |
532 if not ok: | |
533 raise CommandException('Destination URI must name a directory, bucket, ' | |
534 'or bucket\nsubdirectory for the multiple ' | |
535 'source form of the %s command.' % command_name) | |
536 | |
537 class _FileCopyCallbackHandler(object): | |
538 """Outputs progress info for large copy requests.""" | |
539 | |
540 def __init__(self, upload): | |
541 if upload: | |
542 self.announce_text = 'Uploading' | |
543 else: | |
544 self.announce_text = 'Downloading' | |
545 | |
546 def call(self, total_bytes_transferred, total_size): | |
547 sys.stderr.write('%s: %s/%s \r' % ( | |
548 self.announce_text, | |
549 MakeHumanReadable(total_bytes_transferred), | |
550 MakeHumanReadable(total_size))) | |
551 if total_bytes_transferred == total_size: | |
552 sys.stderr.write('\n') | |
553 | |
554 class _StreamCopyCallbackHandler(object): | |
555 """Outputs progress info for Stream copy to cloud. | |
556 Total Size of the stream is not known, so we output | |
557 only the bytes transferred. | |
558 """ | |
559 | |
560 def call(self, total_bytes_transferred, total_size): | |
561 sys.stderr.write('Uploading: %s \r' % ( | |
562 MakeHumanReadable(total_bytes_transferred))) | |
563 if total_size and total_bytes_transferred == total_size: | |
564 sys.stderr.write('\n') | |
565 | |
  def _GetTransferHandlers(self, dst_uri, size, upload):
    """
    Selects upload/download and callback handlers.

    We use a callback handler that shows a simple textual progress indicator
    if size is above the configurable threshold.

    We use a resumable transfer handler if size is >= the configurable
    threshold and resumable transfers are supported by the given provider.
    boto supports resumable downloads for all providers, but resumable
    uploads are currently only supported by GS.

    Args:
      dst_uri: the destination URI.
      size: size of file (object) being uploaded (downloaded).
      upload: bool indication of whether transfer is an upload.

    Returns:
      (cb, num_cb, transfer_handler) tuple: the progress callback (or None
      if below threshold or quiet), the callback granularity (or None), and
      the resumable transfer handler (or None when the transfer won't be
      resumable).
    """
    config = boto.config
    # Transfers at or above this size get progress display and (where the
    # provider supports it) resumable handling.
    resumable_threshold = config.getint('GSUtil', 'resumable_threshold', TWO_MB)
    transfer_handler = None
    cb = None
    num_cb = None

    # Checks whether the destination file is a "special" file, like /dev/null on
    # Linux platforms or null on Windows platforms, so we can disable resumable
    # download support since the file size of the destination won't ever be
    # correct.
    dst_is_special = False
    if dst_uri.is_file_uri():
      # Check explicitly first because os.stat doesn't work on 'nul' in Windows.
      if dst_uri.object_name == os.devnull:
        dst_is_special = True
      try:
        mode = os.stat(dst_uri.object_name).st_mode
        if stat.S_ISCHR(mode):
          dst_is_special = True
      except OSError:
        # Destination doesn't exist yet (the common case); not special.
        pass

    if size >= resumable_threshold and not dst_is_special:
      if not self.quiet:
        cb = self._FileCopyCallbackHandler(upload).call
        num_cb = int(size / TWO_MB)

      # Tracker files record transfer state so an interrupted transfer can
      # be resumed by re-running the same command.
      resumable_tracker_dir = config.get(
          'GSUtil', 'resumable_tracker_dir',
          os.path.expanduser('~' + os.sep + '.gsutil'))
      if not os.path.exists(resumable_tracker_dir):
        os.makedirs(resumable_tracker_dir)

      if upload:
        # Encode the dest bucket and object name into the tracker file name.
        res_tracker_file_name = (
            re.sub('[/\\\\]', '_', 'resumable_upload__%s__%s.url' %
                   (dst_uri.bucket_name, dst_uri.object_name)))
      else:
        # Encode the fully-qualified dest file name into the tracker file name.
        res_tracker_file_name = (
            re.sub('[/\\\\]', '_', 'resumable_download__%s.etag' %
                   (os.path.realpath(dst_uri.object_name))))

      # Hash the encoded name so it can't exceed file-system name limits.
      res_tracker_file_name = _hash_filename(res_tracker_file_name)
      tracker_file = '%s%s%s' % (resumable_tracker_dir, os.sep,
                                 res_tracker_file_name)
      if upload:
        # boto only supports resumable uploads for Google Cloud Storage;
        # other schemes fall through with transfer_handler left as None.
        if dst_uri.scheme == 'gs':
          transfer_handler = ResumableUploadHandler(tracker_file)
      else:
        transfer_handler = ResumableDownloadHandler(tracker_file)

    return (cb, num_cb, transfer_handler)
637 | |
638 def _LogCopyOperation(self, src_uri, dst_uri, headers): | |
639 """ | |
640 Logs copy operation being performed, including Content-Type if appropriate. | |
641 """ | |
642 if self.quiet: | |
643 return | |
644 if 'Content-Type' in headers and dst_uri.is_cloud_uri(): | |
645 content_type_msg = ' [Content-Type=%s]' % headers['Content-Type'] | |
646 else: | |
647 content_type_msg = '' | |
648 if src_uri.is_stream(): | |
649 self.THREADED_LOGGER.info('Copying from <STDIN>%s...', content_type_msg) | |
650 else: | |
651 self.THREADED_LOGGER.info('Copying %s%s...', src_uri, content_type_msg) | |
652 | |
653 # We pass the headers explicitly to this call instead of using self.headers | |
654 # so we can set different metadata (like Content-Type type) for each object. | |
655 def _CopyObjToObjInTheCloud(self, src_key, src_uri, dst_uri, headers): | |
656 """Performs copy-in-the cloud from specified src to dest object. | |
657 | |
658 Args: | |
659 src_key: Source Key. | |
660 src_uri: Source StorageUri. | |
661 dst_uri: Destination StorageUri. | |
662 headers: A copy of the headers dictionary. | |
663 | |
664 Returns: | |
665 (elapsed_time, bytes_transferred, dst_uri) excluding overhead like initial | |
666 HEAD. Note: At present copy-in-the-cloud doesn't return the generation of | |
667 the created object, so the returned URI is actually not version-specific | |
668 (unlike other cp cases). | |
669 | |
670 Raises: | |
671 CommandException: if errors encountered. | |
672 """ | |
673 self._SetContentTypeHeader(src_uri, headers) | |
674 self._LogCopyOperation(src_uri, dst_uri, headers) | |
675 # Do Object -> object copy within same provider (uses | |
676 # x-<provider>-copy-source metadata HTTP header to request copying at the | |
677 # server). | |
678 src_bucket = src_uri.get_bucket(False, headers) | |
679 preserve_acl = False | |
680 canned_acl = None | |
681 if self.sub_opts: | |
682 for o, a in self.sub_opts: | |
683 if o == '-a': | |
684 canned_acls = dst_uri.canned_acls() | |
685 if a not in canned_acls: | |
686 raise CommandException('Invalid canned ACL "%s".' % a) | |
687 canned_acl = a | |
688 headers[dst_uri.get_provider().acl_header] = canned_acl | |
689 if o == '-p': | |
690 preserve_acl = True | |
691 if preserve_acl and canned_acl: | |
692 raise CommandException( | |
693 'Specifying both the -p and -a options together is invalid.') | |
694 start_time = time.time() | |
695 # Pass headers in headers param not metadata param, so boto will copy | |
696 # existing key's metadata and just set the additional headers specified | |
697 # in the headers param (rather than using the headers to override existing | |
698 # metadata). In particular this allows us to copy the existing key's | |
699 # Content-Type and other metadata users need while still being able to | |
700 # set headers the API needs (like x-goog-project-id). Note that this means | |
701 # you can't do something like: | |
702 # gsutil cp -t Content-Type text/html gs://bucket/* gs://bucket2 | |
703 # to change the Content-Type while copying. | |
704 | |
705 try: | |
706 dst_key = dst_uri.copy_key( | |
707 src_bucket.name, src_uri.object_name, preserve_acl=False, | |
708 headers=headers, src_version_id=src_uri.version_id, | |
709 src_generation=src_uri.generation) | |
710 except GSResponseError as e: | |
711 exc_name, error_detail = ExtractErrorDetail(e) | |
712 if (exc_name == 'GSResponseError' | |
713 and ('Copy-in-the-cloud disallowed' in error_detail)): | |
714 raise CommandException('%s.\nNote: you can copy between locations ' | |
715 'and between storage classes by using the ' | |
716 'gsutil cp -D option.' % error_detail) | |
717 else: | |
718 raise | |
719 end_time = time.time() | |
720 return (end_time - start_time, src_key.size, | |
721 dst_uri.clone_replace_key(dst_key)) | |
722 | |
723 def _CheckFreeSpace(self, path): | |
724 """Return path/drive free space (in bytes).""" | |
725 if platform.system() == 'Windows': | |
726 from ctypes import c_int, c_uint64, c_wchar_p, windll, POINTER, WINFUNCTYP
E, WinError | |
727 try: | |
728 GetDiskFreeSpaceEx = WINFUNCTYPE(c_int, c_wchar_p, POINTER(c_uint64), | |
729 POINTER(c_uint64), POINTER(c_uint64)) | |
730 GetDiskFreeSpaceEx = GetDiskFreeSpaceEx( | |
731 ('GetDiskFreeSpaceExW', windll.kernel32), ( | |
732 (1, 'lpszPathName'), | |
733 (2, 'lpFreeUserSpace'), | |
734 (2, 'lpTotalSpace'), | |
735 (2, 'lpFreeSpace'),)) | |
736 except AttributeError: | |
737 GetDiskFreeSpaceEx = WINFUNCTYPE(c_int, c_char_p, POINTER(c_uint64), | |
738 POINTER(c_uint64), POINTER(c_uint64)) | |
739 GetDiskFreeSpaceEx = GetDiskFreeSpaceEx( | |
740 ('GetDiskFreeSpaceExA', windll.kernel32), ( | |
741 (1, 'lpszPathName'), | |
742 (2, 'lpFreeUserSpace'), | |
743 (2, 'lpTotalSpace'), | |
744 (2, 'lpFreeSpace'),)) | |
745 | |
746 def GetDiskFreeSpaceEx_errcheck(result, func, args): | |
747 if not result: | |
748 raise WinError() | |
749 return args[1].value | |
750 GetDiskFreeSpaceEx.errcheck = GetDiskFreeSpaceEx_errcheck | |
751 | |
752 return GetDiskFreeSpaceEx(os.getenv('SystemDrive')) | |
753 else: | |
754 (_, f_frsize, _, _, f_bavail, _, _, _, _, _) = os.statvfs(path) | |
755 return f_frsize * f_bavail | |
756 | |
757 def _PerformResumableUploadIfApplies(self, fp, dst_uri, canned_acl, headers): | |
758 """ | |
759 Performs resumable upload if supported by provider and file is above | |
760 threshold, else performs non-resumable upload. | |
761 | |
762 Returns (elapsed_time, bytes_transferred, version-specific dst_uri). | |
763 """ | |
764 start_time = time.time() | |
765 # Determine file size different ways for case where fp is actually a wrapper | |
766 # around a Key vs an actual file. | |
767 if isinstance(fp, KeyFile): | |
768 file_size = fp.getkey().size | |
769 else: | |
770 file_size = os.path.getsize(fp.name) | |
771 (cb, num_cb, res_upload_handler) = self._GetTransferHandlers( | |
772 dst_uri, file_size, True) | |
773 if dst_uri.scheme == 'gs': | |
774 # Resumable upload protocol is Google Cloud Storage-specific. | |
775 dst_uri.set_contents_from_file(fp, headers, policy=canned_acl, | |
776 cb=cb, num_cb=num_cb, | |
777 res_upload_handler=res_upload_handler) | |
778 else: | |
779 dst_uri.set_contents_from_file(fp, headers, policy=canned_acl, | |
780 cb=cb, num_cb=num_cb) | |
781 if res_upload_handler: | |
782 # ResumableUploadHandler does not update upload_start_point from its | |
783 # initial value of -1 if transferring the whole file, so clamp at 0 | |
784 bytes_transferred = file_size - max( | |
785 res_upload_handler.upload_start_point, 0) | |
786 else: | |
787 bytes_transferred = file_size | |
788 end_time = time.time() | |
789 return (end_time - start_time, bytes_transferred, dst_uri) | |
790 | |
791 def _PerformStreamingUpload(self, fp, dst_uri, headers, canned_acl=None): | |
792 """ | |
793 Performs a streaming upload to the cloud. | |
794 | |
795 Args: | |
796 fp: The file whose contents to upload. | |
797 dst_uri: Destination StorageUri. | |
798 headers: A copy of the headers dictionary. | |
799 canned_acl: Optional canned ACL to set on the object. | |
800 | |
801 Returns (elapsed_time, bytes_transferred, version-specific dst_uri). | |
802 """ | |
803 start_time = time.time() | |
804 | |
805 if self.quiet: | |
806 cb = None | |
807 else: | |
808 cb = self._StreamCopyCallbackHandler().call | |
809 dst_uri.set_contents_from_stream( | |
810 fp, headers, policy=canned_acl, cb=cb) | |
811 try: | |
812 bytes_transferred = fp.tell() | |
813 except: | |
814 bytes_transferred = 0 | |
815 | |
816 end_time = time.time() | |
817 return (end_time - start_time, bytes_transferred, dst_uri) | |
818 | |
819 def _SetContentTypeHeader(self, src_uri, headers): | |
820 """ | |
821 Sets content type header to value specified in '-h Content-Type' option (if | |
822 specified); else sets using Content-Type detection. | |
823 """ | |
824 if 'Content-Type' in headers: | |
825 # If empty string specified (i.e., -h "Content-Type:") set header to None, | |
826 # which will inhibit boto from sending the CT header. Otherwise, boto will | |
827 # pass through the user specified CT header. | |
828 if not headers['Content-Type']: | |
829 headers['Content-Type'] = None | |
830 # else we'll keep the value passed in via -h option (not performing | |
831 # content type detection). | |
832 else: | |
833 # Only do content type recognition is src_uri is a file. Object-to-object | |
834 # copies with no -h Content-Type specified re-use the content type of the | |
835 # source object. | |
836 if src_uri.is_file_uri(): | |
837 object_name = src_uri.object_name | |
838 content_type = None | |
839 # Streams (denoted by '-') are expected to be 'application/octet-stream' | |
840 # and 'file' would partially consume them. | |
841 if object_name != '-': | |
842 if self.USE_MAGICFILE: | |
843 p = subprocess.Popen(['file', '--mime-type', object_name], | |
844 stdout=subprocess.PIPE, stderr=subprocess.PIPE) | |
845 output, error = p.communicate() | |
846 if p.returncode != 0 or error: | |
847 raise CommandException( | |
848 'Encountered error running "file --mime-type %s" ' | |
849 '(returncode=%d).\n%s' % (object_name, p.returncode, error)) | |
850 # Parse output by removing line delimiter and splitting on last ": | |
851 content_type = output.rstrip().rpartition(': ')[2] | |
852 else: | |
853 content_type = mimetypes.guess_type(object_name)[0] | |
854 if not content_type: | |
855 content_type = self.DEFAULT_CONTENT_TYPE | |
856 headers['Content-Type'] = content_type | |
857 | |
  def _UploadFileToObject(self, src_key, src_uri, dst_uri, headers,
                          should_log=True):
    """Uploads a local file to an object.

    Args:
      src_key: Source Key. Must be a file Key (its fp supplies the bytes to
          upload; it may be a stream, e.g. when piping from stdin).
      src_uri: Source StorageUri.
      dst_uri: Destination StorageUri.
      headers: The headers dictionary.
      should_log: bool indicator whether we should log this operation.
    Returns:
      (elapsed_time, bytes_transferred, version-specific dst_uri), excluding
      overhead like initial HEAD.

    Raises:
      CommandException: if errors encountered.
    """
    # Parse relevant sub-options: -a (canned ACL), -t (deprecated, now a
    # warning-only no-op), -z (comma-separated extensions to gzip).
    gzip_exts = []
    canned_acl = None
    if self.sub_opts:
      for o, a in self.sub_opts:
        if o == '-a':
          canned_acls = dst_uri.canned_acls()
          if a not in canned_acls:
            raise CommandException('Invalid canned ACL "%s".' % a)
          canned_acl = a
        elif o == '-t':
          print('Warning: -t is deprecated, and will be removed in the future. '
                'Content type\ndetection is '
                'now performed by default, unless inhibited by specifying '
                'a\nContent-Type header via the -h option.')
        elif o == '-z':
          gzip_exts = a.split(',')

    self._SetContentTypeHeader(src_uri, headers)
    if should_log:
      self._LogCopyOperation(src_uri, dst_uri, headers)

    # Apply configured default Content-Language unless one was passed in.
    if 'Content-Language' not in headers:
      content_language = config.get_value('GSUtil', 'content_language')
      if content_language:
        headers['Content-Language'] = content_language

    # If the file's extension is in the -z list, compress to a temp file and
    # upload that with Content-Encoding: gzip.
    fname_parts = src_uri.object_name.split('.')
    if len(fname_parts) > 1 and fname_parts[-1] in gzip_exts:
      if self.debug:
        print 'Compressing %s (to tmp)...' % src_key
      (gzip_fh, gzip_path) = tempfile.mkstemp()
      gzip_fp = None
      try:
        # Check for temp space. Assume the compressed object is at most 2x
        # the size of the object (normally should compress to smaller than
        # the object)
        if (self._CheckFreeSpace(gzip_path)
            < 2*int(os.path.getsize(src_key.name))):
          raise CommandException('Inadequate temp space available to compress '
                                 '%s' % src_key.name)
        gzip_fp = gzip.open(gzip_path, 'wb')
        gzip_fp.writelines(src_key.fp)
      finally:
        if gzip_fp:
          gzip_fp.close()
      os.close(gzip_fh)
      headers['Content-Encoding'] = 'gzip'
      gzip_fp = open(gzip_path, 'rb')
      try:
        (elapsed_time, bytes_transferred, result_uri) = (
            self._PerformResumableUploadIfApplies(gzip_fp, dst_uri,
                                                  canned_acl, headers))
      finally:
        gzip_fp.close()
      try:
        os.unlink(gzip_path)
      # Windows sometimes complains the temp file is locked when you try to
      # delete it.
      except Exception, e:
        pass
    elif (src_key.is_stream()
          and dst_uri.get_provider().supports_chunked_transfer()):
      # Stream source and provider supports chunked transfer: stream directly.
      (elapsed_time, bytes_transferred, result_uri) = (
          self._PerformStreamingUpload(src_key.fp, dst_uri, headers,
                                       canned_acl))
    else:
      if src_key.is_stream():
        # For Providers that doesn't support chunked Transfers: buffer the
        # stream to a named temp file, then upload that file instead.
        tmp = tempfile.NamedTemporaryFile()
        file_uri = self.suri_builder.StorageUri('file://%s' % tmp.name)
        try:
          file_uri.new_key(False, headers).set_contents_from_file(
              src_key.fp, headers)
          src_key = file_uri.get_key()
        finally:
          file_uri.close()
      try:
        (elapsed_time, bytes_transferred, result_uri) = (
            self._PerformResumableUploadIfApplies(src_key.fp, dst_uri,
                                                  canned_acl, headers))
      finally:
        if src_key.is_stream():
          tmp.close()
        else:
          src_key.close()

    return (elapsed_time, bytes_transferred, result_uri)
962 | |
  def _DownloadObjectToFile(self, src_key, src_uri, dst_uri, headers,
                            should_log=True):
    """Downloads an object to a local file.

    Args:
      src_key: Source Key (the cloud object to download).
      src_uri: Source StorageUri.
      dst_uri: Destination StorageUri (names the local file to write).
      headers: The headers dictionary.
      should_log: bool indicator whether we should log this operation.
    Returns:
      (elapsed_time, bytes_transferred, dst_uri), excluding overhead like
      initial HEAD.

    Raises:
      CommandException: if errors encountered.
    """
    if should_log:
      self._LogCopyOperation(src_uri, dst_uri, headers)
    (cb, num_cb, res_download_handler) = self._GetTransferHandlers(
        dst_uri, src_key.size, False)
    file_name = dst_uri.object_name
    dir_name = os.path.dirname(file_name)
    if dir_name and not os.path.exists(dir_name):
      # Do dir creation in try block so can ignore case where dir already
      # exists. This is needed to avoid a race condition when running gsutil
      # -m cp.
      try:
        os.makedirs(dir_name)
      except OSError, e:
        if e.errno != errno.EEXIST:
          raise
    # For gzipped objects not named *.gz download to a temp file and unzip.
    if (hasattr(src_key, 'content_encoding')
        and src_key.content_encoding == 'gzip'
        and not file_name.endswith('.gz')):
      # We can't use tempfile.mkstemp() here because we need a predictable
      # filename for resumable downloads.
      download_file_name = '%s_.gztmp' % file_name
      need_to_unzip = True
    else:
      download_file_name = file_name
      need_to_unzip = False
    fp = None
    try:
      if res_download_handler:
        # Resumable download: open in append mode so any partially downloaded
        # content is preserved and extended.
        fp = open(download_file_name, 'ab')
      else:
        fp = open(download_file_name, 'wb')
      start_time = time.time()
      src_key.get_contents_to_file(fp, headers, cb=cb, num_cb=num_cb,
                                   res_download_handler=res_download_handler)
      # If a custom test method is defined, call it here. For the copy command,
      # test methods are expected to take one argument: an open file pointer,
      # and are used to perturb the open file during download to exercise
      # download error detection.
      if self.test_method:
        self.test_method(fp)
      end_time = time.time()
    finally:
      if fp:
        fp.close()

    # Discard the md5 if we are resuming a partial download.
    if res_download_handler and res_download_handler.download_start_point:
      src_key.md5 = None

    # Verify downloaded file checksum matched source object's checksum.
    self._CheckFinalMd5(src_key, download_file_name)

    if res_download_handler:
      # Only count bytes actually transferred in this run, not resumed bytes.
      bytes_transferred = (
          src_key.size - res_download_handler.download_start_point)
    else:
      bytes_transferred = src_key.size
    if need_to_unzip:
      # Log that we're uncompressing if the file is big enough that
      # decompressing would make it look like the transfer "stalled" at the end.
      if not self.quiet and bytes_transferred > 10 * 1024 * 1024:
        self.THREADED_LOGGER.info('Uncompressing downloaded tmp file to %s...',
                                  file_name)
      # Downloaded gzipped file to a filename w/o .gz extension, so unzip.
      f_in = gzip.open(download_file_name, 'rb')
      f_out = open(file_name, 'wb')
      try:
        while True:
          data = f_in.read(8192)
          if not data:
            break
          f_out.write(data)
      finally:
        f_out.close()
        f_in.close()
        os.unlink(download_file_name)
    return (end_time - start_time, bytes_transferred, dst_uri)
1058 | |
1059 def _PerformDownloadToStream(self, src_key, src_uri, str_fp, headers): | |
1060 (cb, num_cb, res_download_handler) = self._GetTransferHandlers( | |
1061 src_uri, src_key.size, False) | |
1062 start_time = time.time() | |
1063 src_key.get_contents_to_file(str_fp, headers, cb=cb, num_cb=num_cb) | |
1064 end_time = time.time() | |
1065 bytes_transferred = src_key.size | |
1066 end_time = time.time() | |
1067 return (end_time - start_time, bytes_transferred) | |
1068 | |
1069 def _CopyFileToFile(self, src_key, src_uri, dst_uri, headers): | |
1070 """Copies a local file to a local file. | |
1071 | |
1072 Args: | |
1073 src_key: Source StorageUri. Must be a file URI. | |
1074 src_uri: Source StorageUri. | |
1075 dst_uri: Destination StorageUri. | |
1076 headers: The headers dictionary. | |
1077 Returns: | |
1078 (elapsed_time, bytes_transferred, dst_uri), excluding | |
1079 overhead like initial HEAD. | |
1080 | |
1081 Raises: | |
1082 CommandException: if errors encountered. | |
1083 """ | |
1084 self._LogCopyOperation(src_uri, dst_uri, headers) | |
1085 dst_key = dst_uri.new_key(False, headers) | |
1086 start_time = time.time() | |
1087 dst_key.set_contents_from_file(src_key.fp, headers) | |
1088 end_time = time.time() | |
1089 return (end_time - start_time, os.path.getsize(src_key.fp.name), dst_uri) | |
1090 | |
1091 def _CopyObjToObjDaisyChainMode(self, src_key, src_uri, dst_uri, headers): | |
1092 """Copies from src_uri to dst_uri in "daisy chain" mode. | |
1093 See -D OPTION documentation about what daisy chain mode is. | |
1094 | |
1095 Args: | |
1096 src_key: Source Key. | |
1097 src_uri: Source StorageUri. | |
1098 dst_uri: Destination StorageUri. | |
1099 headers: A copy of the headers dictionary. | |
1100 | |
1101 Returns: | |
1102 (elapsed_time, bytes_transferred, version-specific dst_uri) excluding | |
1103 overhead like initial HEAD. | |
1104 | |
1105 Raises: | |
1106 CommandException: if errors encountered. | |
1107 """ | |
1108 self._SetContentTypeHeader(src_uri, headers) | |
1109 self._LogCopyOperation(src_uri, dst_uri, headers) | |
1110 canned_acl = None | |
1111 if self.sub_opts: | |
1112 for o, a in self.sub_opts: | |
1113 if o == '-a': | |
1114 canned_acls = dst_uri.canned_acls() | |
1115 if a not in canned_acls: | |
1116 raise CommandException('Invalid canned ACL "%s".' % a) | |
1117 canned_acl = a | |
1118 elif o == '-p': | |
1119 # We don't attempt to preserve ACLs across providers because | |
1120 # GCS and S3 support different ACLs and disjoint principals. | |
1121 raise NotImplementedError('Cross-provider cp -p not supported') | |
1122 return self._PerformResumableUploadIfApplies(KeyFile(src_key), dst_uri, | |
1123 canned_acl, headers) | |
1124 | |
  def _PerformCopy(self, src_uri, dst_uri):
    """Performs copy from src_uri to dst_uri, handling various special cases.

    Dispatches to the cloud-to-cloud, upload, download, or file-to-file
    implementation based on where the source and destination live.

    Args:
      src_uri: Source StorageUri.
      dst_uri: Destination StorageUri.

    Returns:
      (elapsed_time, bytes_transferred, version-specific dst_uri) excluding
      overhead like initial HEAD.

    Raises:
      CommandException: if errors encountered.
    """
    # Make a copy of the input headers each time so we can set a different
    # content type for each object.
    if self.headers:
      headers = self.headers.copy()
    else:
      headers = {}

    src_key = src_uri.get_key(False, headers)
    if not src_key:
      raise CommandException('"%s" does not exist.' % src_uri)

    # On Windows, stdin is opened as text mode instead of binary which causes
    # problems when piping a binary file, so this switches it to binary mode.
    if IS_WINDOWS and src_uri.is_file_uri() and src_key.is_stream():
      import msvcrt
      msvcrt.setmode(src_key.fp.fileno(), os.O_BINARY)

    if self.no_clobber:
      # There are two checks to prevent clobbering:
      # 1) The first check is to see if the item
      #    already exists at the destination and prevent the upload/download
      #    from happening. This is done by the exists() call.
      # 2) The second check is only relevant if we are writing to gs. We can
      #    enforce that the server only writes the object if it doesn't exist
      #    by specifying the header below. This check only happens at the
      #    server after the complete file has been uploaded. We specify this
      #    header to prevent a race condition where a destination file may
      #    be created after the first check and before the file is fully
      #    uploaded.
      # In order to save on unnecessary uploads/downloads we perform both
      # checks. However, this may come at the cost of additional HTTP calls.
      if dst_uri.exists(headers):
        if not self.quiet:
          self.THREADED_LOGGER.info('Skipping existing item: %s' %
                                    dst_uri.uri)
        return (0, 0, None)
      if dst_uri.is_cloud_uri() and dst_uri.scheme == 'gs':
        headers['x-goog-if-generation-match'] = '0'

    # Dispatch on the source/destination location combination.
    if src_uri.is_cloud_uri() and dst_uri.is_cloud_uri():
      # Same-scheme copies stay in the cloud unless -D forces daisy chaining.
      if src_uri.scheme == dst_uri.scheme and not self.daisy_chain:
        return self._CopyObjToObjInTheCloud(src_key, src_uri, dst_uri, headers)
      else:
        return self._CopyObjToObjDaisyChainMode(src_key, src_uri, dst_uri,
                                                headers)
    elif src_uri.is_file_uri() and dst_uri.is_cloud_uri():
      return self._UploadFileToObject(src_key, src_uri, dst_uri, headers)
    elif src_uri.is_cloud_uri() and dst_uri.is_file_uri():
      return self._DownloadObjectToFile(src_key, src_uri, dst_uri, headers)
    elif src_uri.is_file_uri() and dst_uri.is_file_uri():
      return self._CopyFileToFile(src_key, src_uri, dst_uri, headers)
    else:
      raise CommandException('Unexpected src/dest case')
1192 | |
  def _ExpandDstUri(self, dst_uri_str):
    """
    Expands wildcard if present in dst_uri_str.

    Args:
      dst_uri_str: String representation of requested dst_uri.

    Returns:
      (exp_dst_uri, have_existing_dst_container)
      where have_existing_dst_container is a bool indicating whether
      exp_dst_uri names an existing directory, bucket, or bucket subdirectory.

    Raises:
      CommandException: if dst_uri_str matched more than 1 URI.
    """
    dst_uri = self.suri_builder.StorageUri(dst_uri_str)

    # Handle wildcarded dst_uri case.
    if ContainsWildcard(dst_uri):
      blr_expansion = list(self.WildcardIterator(dst_uri))
      if len(blr_expansion) != 1:
        raise CommandException('Destination (%s) must match exactly 1 URI' %
                               dst_uri_str)
      blr = blr_expansion[0]
      uri = blr.GetUri()
      if uri.is_cloud_uri():
        # A cloud URI counts as an existing container when it names the bucket
        # itself, matched a Prefix, or is a key ending in '/'.
        return (uri, uri.names_bucket() or blr.HasPrefix()
                or blr.GetKey().endswith('/'))
      else:
        return (uri, uri.names_directory())

    # Handle non-wildcarded dst_uri:
    if dst_uri.is_file_uri():
      return (dst_uri, dst_uri.names_directory())
    if dst_uri.names_bucket():
      return (dst_uri, True)
    # For object URIs check 3 cases: (a) if the name ends with '/' treat as a
    # subdir; else, perform a wildcard expansion with dst_uri + "*" and then
    # find if (b) there's a Prefix matching dst_uri, or (c) name is of form
    # dir_$folder$ (and in both these cases also treat dir as a subdir).
    if dst_uri.is_cloud_uri() and dst_uri_str.endswith('/'):
      return (dst_uri, True)
    blr_expansion = list(self.WildcardIterator(
        '%s*' % dst_uri_str.rstrip(dst_uri.delim)))
    for blr in blr_expansion:
      if blr.GetRStrippedUriString().endswith('_$folder$'):
        return (dst_uri, True)
      if blr.GetRStrippedUriString() == dst_uri_str.rstrip(dst_uri.delim):
        return (dst_uri, blr.HasPrefix())
    return (dst_uri, False)
1243 | |
  def _ConstructDstUri(self, src_uri, exp_src_uri,
                       src_uri_names_container, src_uri_expands_to_multi,
                       have_multiple_srcs, exp_dst_uri,
                       have_existing_dest_subdir):
    """
    Constructs the destination URI for a given exp_src_uri/exp_dst_uri pair,
    using context-dependent naming rules that mimic Linux cp and mv behavior.

    Args:
      src_uri: src_uri to be copied.
      exp_src_uri: Single StorageUri from wildcard expansion of src_uri.
      src_uri_names_container: True if src_uri names a container (including the
          case of a wildcard-named bucket subdir (like gs://bucket/abc,
          where gs://bucket/abc/* matched some objects). Note that this is
          additional semantics that src_uri.names_container() doesn't
          understand because the latter only understands StorageUris, not
          wildcards.
      src_uri_expands_to_multi: True if src_uri expanded to multiple URIs.
      have_multiple_srcs: True if this is a multi-source request. This can be
          true if src_uri wildcard-expanded to multiple URIs or if there were
          multiple source URIs in the request.
      exp_dst_uri: the expanded StorageUri requested for the cp destination.
          Final written path is constructed from this plus a context-dependent
          variant of src_uri.
      have_existing_dest_subdir: bool indicator whether dest is an existing
          subdirectory.

    Returns:
      StorageUri to use for copy.

    Raises:
      CommandException if destination object name not specified for
      source and source is a stream.
    """
    if self._ShouldTreatDstUriAsSingleton(
        have_multiple_srcs, have_existing_dest_subdir, exp_dst_uri):
      # We're copying one file or object to one file or object.
      return exp_dst_uri

    if exp_src_uri.is_stream():
      # A stream has no name to derive a destination name from, so the
      # destination must name an object, not a container.
      if exp_dst_uri.names_container():
        raise CommandException('Destination object name needed when '
                               'source is a stream')
      return exp_dst_uri

    if not self.recursion_requested and not have_multiple_srcs:
      # We're copying one file or object to a subdirectory. Append final comp
      # of exp_src_uri to exp_dst_uri.
      src_final_comp = exp_src_uri.object_name.rpartition(src_uri.delim)[-1]
      return self.suri_builder.StorageUri('%s%s%s' % (
          exp_dst_uri.uri.rstrip(exp_dst_uri.delim), exp_dst_uri.delim,
          src_final_comp))

    # Else we're copying multiple sources to a directory, bucket, or a bucket
    # "sub-directory".

    # Ensure exp_dst_uri ends in delim char if we're doing a multi-src copy or
    # a copy to a directory. (The check for copying to a directory needs
    # special-case handling so that the command:
    #   gsutil cp gs://bucket/obj dir
    # will turn into file://dir/ instead of file://dir -- the latter would
    # cause the file "dirobj" to be created.)
    # Note: need to check have_multiple_srcs or src_uri.names_container()
    # because src_uri could be a bucket containing a single object, named
    # as gs://bucket.
    if ((have_multiple_srcs or src_uri.names_container()
         or os.path.isdir(exp_dst_uri.object_name))
        and not exp_dst_uri.uri.endswith(exp_dst_uri.delim)):
      exp_dst_uri = exp_dst_uri.clone_replace_name(
          '%s%s' % (exp_dst_uri.object_name, exp_dst_uri.delim)
      )

    # Making naming behavior match how things work with local Linux cp and mv
    # operations depends on many factors, including whether the destination is
    # a container, the plurality of the source(s), and whether the mv command
    # is being used:
    # 1. For the "mv" command that specifies a non-existent destination subdir,
    #    renaming should occur at the level of the src subdir, vs appending
    #    that subdir beneath the dst subdir like is done for copying. For
    #    example:
    #      gsutil rm -R gs://bucket
    #      gsutil cp -R dir1 gs://bucket
    #      gsutil cp -R dir2 gs://bucket/subdir1
    #      gsutil mv gs://bucket/subdir1 gs://bucket/subdir2
    #    would (if using cp naming behavior) end up with paths like:
    #      gs://bucket/subdir2/subdir1/dir2/.svn/all-wcprops
    #    whereas mv naming behavior should result in:
    #      gs://bucket/subdir2/dir2/.svn/all-wcprops
    # 2. Copying from directories, buckets, or bucket subdirs should result in
    #    objects/files mirroring the source directory hierarchy. For example:
    #      gsutil cp dir1/dir2 gs://bucket
    #    should create the object gs://bucket/dir2/file2, assuming dir1/dir2
    #    contains file2).
    #    To be consistent with Linux cp behavior, there's one more wrinkle when
    #    working with subdirs: The resulting object names depend on whether the
    #    destination subdirectory exists. For example, if gs://bucket/subdir
    #    exists, the command:
    #      gsutil cp -R dir1/dir2 gs://bucket/subdir
    #    should create objects named like gs://bucket/subdir/dir2/a/b/c. In
    #    contrast, if gs://bucket/subdir does not exist, this same command
    #    should create objects named like gs://bucket/subdir/a/b/c.
    # 3. Copying individual files or objects to dirs, buckets or bucket subdirs
    #    should result in objects/files named by the final source file name
    #    component. Example:
    #      gsutil cp dir1/*.txt gs://bucket
    #    should create the objects gs://bucket/f1.txt and gs://bucket/f2.txt,
    #    assuming dir1 contains f1.txt and f2.txt.

    if (self.perform_mv and self.recursion_requested
        and src_uri_expands_to_multi and not have_existing_dest_subdir):
      # Case 1. Handle naming rules for bucket subdir mv. Here we want to
      # line up the src_uri against its expansion, to find the base to build
      # the new name. For example, running the command:
      #   gsutil mv gs://bucket/abcd gs://bucket/xyz
      # when processing exp_src_uri=gs://bucket/abcd/123
      # exp_src_uri_tail should become /123
      # Note: mv.py code disallows wildcard specification of source URI.
      exp_src_uri_tail = exp_src_uri.uri[len(src_uri.uri):]
      dst_key_name = '%s/%s' % (exp_dst_uri.object_name.rstrip('/'),
                                exp_src_uri_tail.strip('/'))
      return exp_dst_uri.clone_replace_name(dst_key_name)

    if src_uri_names_container and not exp_dst_uri.names_file():
      # Case 2. Build dst_key_name from subpath of exp_src_uri past
      # where src_uri ends. For example, for src_uri=gs://bucket/ and
      # exp_src_uri=gs://bucket/src_subdir/obj, dst_key_name should be
      # src_subdir/obj.
      src_uri_path_sans_final_dir = _GetPathBeforeFinalDir(src_uri)
      dst_key_name = exp_src_uri.uri[
          len(src_uri_path_sans_final_dir):].lstrip(src_uri.delim)
      # Handle case where dst_uri is a non-existent subdir.
      if not have_existing_dest_subdir:
        dst_key_name = dst_key_name.partition(src_uri.delim)[-1]
      # Handle special case where src_uri was a directory named with '.' or
      # './', so that running a command like:
      #   gsutil cp -r . gs://dest
      # will produce obj names of the form gs://dest/abc instead of
      # gs://dest/./abc.
      if dst_key_name.startswith('.%s' % os.sep):
        dst_key_name = dst_key_name[2:]

    else:
      # Case 3.
      dst_key_name = exp_src_uri.object_name.rpartition(src_uri.delim)[-1]

    if (exp_dst_uri.is_file_uri()
        or self._ShouldTreatDstUriAsBucketSubDir(
            have_multiple_srcs, exp_dst_uri, have_existing_dest_subdir)):
      # Prepend the destination container/subdir name, avoiding a doubled
      # delimiter when the destination name already ends with one.
      if exp_dst_uri.object_name.endswith(exp_dst_uri.delim):
        dst_key_name = '%s%s%s' % (
            exp_dst_uri.object_name.rstrip(exp_dst_uri.delim),
            exp_dst_uri.delim, dst_key_name)
      else:
        delim = exp_dst_uri.delim if exp_dst_uri.object_name else ''
        dst_key_name = '%s%s%s' % (exp_dst_uri.object_name, delim, dst_key_name)

    return exp_dst_uri.clone_replace_name(dst_key_name)
1399 | |
1400 def _FixWindowsNaming(self, src_uri, dst_uri): | |
1401 """ | |
1402 Rewrites the destination URI built by _ConstructDstUri() to translate | |
1403 Windows pathnames to cloud pathnames if needed. | |
1404 | |
1405 Args: | |
1406 src_uri: Source URI to be copied. | |
1407 dst_uri: The destination URI built by _ConstructDstUri(). | |
1408 | |
1409 Returns: | |
1410 StorageUri to use for copy. | |
1411 """ | |
1412 if (src_uri.is_file_uri() and src_uri.delim == '\\' | |
1413 and dst_uri.is_cloud_uri()): | |
1414 trans_uri_str = re.sub(r'\\', '/', dst_uri.uri) | |
1415 dst_uri = self.suri_builder.StorageUri(trans_uri_str) | |
1416 return dst_uri | |
1417 | |
  # Command entry point.
  def RunCommand(self):
    """Entry point for the cp command (also used for mv via the internal
    -M option): expands source and destination URIs and performs the
    copies, in parallel when requested.

    Returns:
      0 on success. Raises CommandException if arguments are invalid or
      any copy failed.
    """

    # Inner funcs.
    def _CopyExceptionHandler(e):
      """Simple exception handler to allow post-completion status."""
      self.THREADED_LOGGER.error(str(e))
      self.copy_failure_count += 1

    def _CopyFunc(name_expansion_result):
      """Worker function for performing the actual copy (and rm, for mv)."""
      # Note: exp_dst_uri, have_existing_dst_container, and stats_lock are
      # closure variables bound later in RunCommand, before Apply() invokes
      # this worker.
      if self.perform_mv:
        cmd_name = 'mv'
      else:
        cmd_name = self.command_name
      src_uri = self.suri_builder.StorageUri(
          name_expansion_result.GetSrcUriStr())
      exp_src_uri = self.suri_builder.StorageUri(
          name_expansion_result.GetExpandedUriStr())
      src_uri_names_container = name_expansion_result.NamesContainer()
      # NOTE(review): this repeats the NamesContainer() call from the line
      # above; presumably a NamesExpandsToMulti()-style accessor was
      # intended here - confirm against NameExpansionResult's API.
      src_uri_expands_to_multi = name_expansion_result.NamesContainer()
      have_multiple_srcs = name_expansion_result.IsMultiSrcRequest()
      have_existing_dest_subdir = (
          name_expansion_result.HaveExistingDstContainer())

      # Provider-only URIs (e.g. 'gs://') never name copyable data.
      if src_uri.names_provider():
        raise CommandException(
            'The %s command does not allow provider-only source URIs (%s)' %
            (cmd_name, src_uri))
      if have_multiple_srcs:
        self._InsistDstUriNamesContainer(exp_dst_uri,
                                         have_existing_dst_container,
                                         cmd_name)

      if self.perform_mv:
        if name_expansion_result.NamesContainer():
          # Use recursion_requested when performing name expansion for the
          # directory mv case so we can determine if any of the source URIs are
          # directories (and then use cp -R and rm -R to perform the move, to
          # match the behavior of Linux mv (which when moving a directory moves
          # all the contained files).
          self.recursion_requested = True
        # Disallow wildcard src URIs when moving directories, as supporting it
        # would make the name transformation too complex and would also be
        # dangerous (e.g., someone could accidentally move many objects to the
        # wrong name, or accidentally overwrite many objects).
        if ContainsWildcard(src_uri):
          raise CommandException('The mv command disallows naming source '
                                 'directories using wildcards')

      # Create the local destination directory up front when fanning multiple
      # sources into a file URI that doesn't exist yet.
      if (exp_dst_uri.is_file_uri()
          and not os.path.exists(exp_dst_uri.object_name)
          and have_multiple_srcs):
        os.makedirs(exp_dst_uri.object_name)

      dst_uri = self._ConstructDstUri(src_uri, exp_src_uri,
                                      src_uri_names_container,
                                      src_uri_expands_to_multi,
                                      have_multiple_srcs, exp_dst_uri,
                                      have_existing_dest_subdir)
      dst_uri = self._FixWindowsNaming(src_uri, dst_uri)

      self._CheckForDirFileConflict(exp_src_uri, dst_uri)
      if self._SrcDstSame(exp_src_uri, dst_uri):
        raise CommandException('%s: "%s" and "%s" are the same file - '
                               'abort.' % (cmd_name, exp_src_uri, dst_uri))

      if dst_uri.is_cloud_uri() and dst_uri.is_version_specific:
        raise CommandException('%s: a version-specific URI\n(%s)\ncannot be '
                               'the destination for gsutil cp - abort.'
                               % (cmd_name, dst_uri))

      # Initialized so the stats update below is safe even when the copy is
      # rejected (noclobber) or fails under continue_on_error.
      elapsed_time = bytes_transferred = 0
      try:
        (elapsed_time, bytes_transferred, result_uri) = (
            self._PerformCopy(exp_src_uri, dst_uri))
      except Exception, e:
        if self._IsNoClobberServerException(e):
          if not self.quiet:
            self.THREADED_LOGGER.info('Rejected (noclobber): %s' % dst_uri.uri)
        elif self.continue_on_error:
          if not self.quiet:
            self.THREADED_LOGGER.error('Error copying %s: %s' % (src_uri.uri,
                                                                 str(e)))
          self.copy_failure_count += 1
        else:
          raise
      if self.print_ver:
        # Some cases don't return a version-specific URI (e.g., if destination
        # is a file).
        # NOTE(review): if _PerformCopy raised and the exception was swallowed
        # above, result_uri is unbound here and -v would raise NameError -
        # confirm whether that path is reachable in practice.
        if hasattr(result_uri, 'version_specific_uri'):
          self.THREADED_LOGGER.info('Created: %s' %
                                    result_uri.version_specific_uri)
        else:
          self.THREADED_LOGGER.info('Created: %s' % result_uri.uri)

      # TODO: If we ever use -n (noclobber) with -M (move) (not possible today
      # since we call copy internally from move and don't specify the -n flag)
      # we'll need to only remove the source when we have not skipped the
      # destination.
      if self.perform_mv:
        if not self.quiet:
          self.THREADED_LOGGER.info('Removing %s...', exp_src_uri)
        exp_src_uri.delete_key(validate=False, headers=self.headers)
      # Update shared statistics under the lock, for accuracy across
      # threads/processes.
      stats_lock.acquire()
      self.total_elapsed_time += elapsed_time
      self.total_bytes_transferred += bytes_transferred
      stats_lock.release()

    # Start of RunCommand code.
    self._ParseArgs()

    self.total_elapsed_time = self.total_bytes_transferred = 0
    # A final argument of '-' (or 'file://-') means stream to stdout.
    if self.args[-1] == '-' or self.args[-1] == 'file://-':
      self._HandleStreamingDownload()
      return 0

    if self.read_args_from_stdin:
      if len(self.args) != 1:
        raise CommandException('Source URIs cannot be specified with -I option')
      uri_strs = self._StdinIterator()
    else:
      if len(self.args) < 2:
        raise CommandException('Wrong number of arguments for "cp" command.')
      uri_strs = self.args[0:len(self.args)-1]

    (exp_dst_uri, have_existing_dst_container) = self._ExpandDstUri(
        self.args[-1])
    name_expansion_iterator = NameExpansionIterator(
        self.command_name, self.proj_id_handler, self.headers, self.debug,
        self.bucket_storage_uri_class, uri_strs,
        self.recursion_requested or self.perform_mv,
        have_existing_dst_container)

    # Use a lock to ensure accurate statistics in the face of
    # multi-threading/multi-processing.
    stats_lock = threading.Lock()

    # Tracks if any copies failed.
    self.copy_failure_count = 0

    # Start the clock.
    start_time = time.time()

    # Tuple of attributes to share/manage across multiple processes in
    # parallel (-m) mode.
    shared_attrs = ('copy_failure_count', 'total_bytes_transferred')

    # Perform copy requests in parallel (-m) mode, if requested, using
    # configured number of parallel processes and threads. Otherwise,
    # perform requests with sequential function calls in current process.
    self.Apply(_CopyFunc, name_expansion_iterator, _CopyExceptionHandler,
               shared_attrs)
    if self.debug:
      print 'total_bytes_transferred:' + str(self.total_bytes_transferred)

    end_time = time.time()
    self.total_elapsed_time = end_time - start_time

    # Sometimes, particularly when running unit tests, the total elapsed time
    # is really small. On Windows, the timer resolution is too small and
    # causes total_elapsed_time to be zero.
    try:
      float(self.total_bytes_transferred) / float(self.total_elapsed_time)
    except ZeroDivisionError:
      self.total_elapsed_time = 0.01

    self.total_bytes_per_second = (float(self.total_bytes_transferred) /
                                   float(self.total_elapsed_time))

    if self.debug == 3:
      # Note that this only counts the actual GET and PUT bytes for the copy
      # - not any transfers for doing wildcard expansion, the initial HEAD
      # request boto performs when doing a bucket.get_key() operation, etc.
      if self.total_bytes_transferred != 0:
        self.THREADED_LOGGER.info(
            'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)',
            self.total_bytes_transferred, self.total_elapsed_time,
            MakeHumanReadable(self.total_bytes_per_second))
    if self.copy_failure_count:
      plural_str = ''
      if self.copy_failure_count > 1:
        plural_str = 's'
      raise CommandException('%d file%s/object%s could not be transferred.' % (
          self.copy_failure_count, plural_str, plural_str))

    return 0
1605 | |
1606 def _ParseArgs(self): | |
1607 self.perform_mv = False | |
1608 self.exclude_symlinks = False | |
1609 self.quiet = False | |
1610 self.no_clobber = False | |
1611 self.continue_on_error = False | |
1612 self.daisy_chain = False | |
1613 self.read_args_from_stdin = False | |
1614 self.print_ver = False | |
1615 # self.recursion_requested initialized in command.py (so can be checked | |
1616 # in parent class for all commands). | |
1617 if self.sub_opts: | |
1618 for o, unused_a in self.sub_opts: | |
1619 if o == '-c': | |
1620 self.continue_on_error = True | |
1621 elif o == '-D': | |
1622 self.daisy_chain = True | |
1623 elif o == '-e': | |
1624 self.exclude_symlinks = True | |
1625 elif o == '-I': | |
1626 self.read_args_from_stdin = True | |
1627 elif o == '-M': | |
1628 # Note that we signal to the cp command to perform a move (copy | |
1629 # followed by remove) and use directory-move naming rules by passing | |
1630 # the undocumented (for internal use) -M option when running the cp | |
1631 # command from mv.py. | |
1632 self.perform_mv = True | |
1633 elif o == '-n': | |
1634 self.no_clobber = True | |
1635 elif o == '-q': | |
1636 self.quiet = True | |
1637 elif o == '-r' or o == '-R': | |
1638 self.recursion_requested = True | |
1639 elif o == '-v': | |
1640 self.print_ver = True | |
1641 | |
  def _HandleStreamingDownload(self):
    """Streams every object matching the source URI args to stdout.

    Invoked by RunCommand when the final argument is '-' or 'file://-'.
    Raises CommandException if no source URIs matched.
    """
    # Destination is <STDOUT>. Manipulate sys.stdout so as to redirect all
    # debug messages to <STDERR>.
    stdout_fp = sys.stdout
    sys.stdout = sys.stderr
    # NOTE(review): sys.stdout is not restored before returning; presumably
    # the process exits after this command completes - confirm.
    did_some_work = False
    # All args except the last ('-') are source URI (possibly wildcard)
    # strings.
    for uri_str in self.args[0:len(self.args)-1]:
      for uri in self.WildcardIterator(uri_str).IterUris():
        did_some_work = True
        key = uri.get_key(False, self.headers)
        (elapsed_time, bytes_transferred) = self._PerformDownloadToStream(
            key, uri, stdout_fp, self.headers)
        self.total_elapsed_time += elapsed_time
        self.total_bytes_transferred += bytes_transferred
    if not did_some_work:
      raise CommandException('No URIs matched')
    if self.debug == 3:
      # Only report totals when bytes were actually transferred.
      if self.total_bytes_transferred != 0:
        self.THREADED_LOGGER.info(
            'Total bytes copied=%d, total elapsed time=%5.3f secs (%sps)',
            self.total_bytes_transferred, self.total_elapsed_time,
            MakeHumanReadable(float(self.total_bytes_transferred) /
                              float(self.total_elapsed_time)))
1665 | |
1666 def _StdinIterator(self): | |
1667 """A generator function that returns lines from stdin.""" | |
1668 for line in sys.stdin: | |
1669 # Strip CRLF. | |
1670 yield line.rstrip() | |
1671 | |
1672 def _SrcDstSame(self, src_uri, dst_uri): | |
1673 """Checks if src_uri and dst_uri represent the same object or file. | |
1674 | |
1675 We don't handle anything about hard or symbolic links. | |
1676 | |
1677 Args: | |
1678 src_uri: Source StorageUri. | |
1679 dst_uri: Destination StorageUri. | |
1680 | |
1681 Returns: | |
1682 Bool indicator. | |
1683 """ | |
1684 if src_uri.is_file_uri() and dst_uri.is_file_uri(): | |
1685 # Translate a/b/./c to a/b/c, so src=dst comparison below works. | |
1686 new_src_path = os.path.normpath(src_uri.object_name) | |
1687 new_dst_path = os.path.normpath(dst_uri.object_name) | |
1688 return (src_uri.clone_replace_name(new_src_path).uri == | |
1689 dst_uri.clone_replace_name(new_dst_path).uri) | |
1690 else: | |
1691 return (src_uri.uri == dst_uri.uri and | |
1692 src_uri.generation == dst_uri.generation and | |
1693 src_uri.version_id == dst_uri.version_id) | |
1694 | |
1695 def _ShouldTreatDstUriAsBucketSubDir(self, have_multiple_srcs, dst_uri, | |
1696 have_existing_dest_subdir): | |
1697 """ | |
1698 Checks whether dst_uri should be treated as a bucket "sub-directory". The | |
1699 decision about whether something constitutes a bucket "sub-directory" | |
1700 depends on whether there are multiple sources in this request and whether | |
1701 there is an existing bucket subdirectory. For example, when running the | |
1702 command: | |
1703 gsutil cp file gs://bucket/abc | |
1704 if there's no existing gs://bucket/abc bucket subdirectory we should copy | |
1705 file to the object gs://bucket/abc. In contrast, if | |
1706 there's an existing gs://bucket/abc bucket subdirectory we should copy | |
1707 file to gs://bucket/abc/file. And regardless of whether gs://bucket/abc | |
1708 exists, when running the command: | |
1709 gsutil cp file1 file2 gs://bucket/abc | |
1710 we should copy file1 to gs://bucket/abc/file1 (and similarly for file2). | |
1711 | |
1712 Note that we don't disallow naming a bucket "sub-directory" where there's | |
1713 already an object at that URI. For example it's legitimate (albeit | |
1714 confusing) to have an object called gs://bucket/dir and | |
1715 then run the command | |
1716 gsutil cp file1 file2 gs://bucket/dir | |
1717 Doing so will end up with objects gs://bucket/dir, gs://bucket/dir/file1, | |
1718 and gs://bucket/dir/file2. | |
1719 | |
1720 Args: | |
1721 have_multiple_srcs: Bool indicator of whether this is a multi-source | |
1722 operation. | |
1723 dst_uri: StorageUri to check. | |
1724 have_existing_dest_subdir: bool indicator whether dest is an existing | |
1725 subdirectory. | |
1726 | |
1727 Returns: | |
1728 bool indicator. | |
1729 """ | |
1730 return ((have_multiple_srcs and dst_uri.is_cloud_uri()) | |
1731 or (have_existing_dest_subdir)) | |
1732 | |
1733 def _ShouldTreatDstUriAsSingleton(self, have_multiple_srcs, | |
1734 have_existing_dest_subdir, dst_uri): | |
1735 """ | |
1736 Checks that dst_uri names a singleton (file or object) after | |
1737 dir/wildcard expansion. The decision is more nuanced than simply | |
1738 dst_uri.names_singleton()) because of the possibility that an object path | |
1739 might name a bucket sub-directory. | |
1740 | |
1741 Args: | |
1742 have_multiple_srcs: Bool indicator of whether this is a multi-source | |
1743 operation. | |
1744 have_existing_dest_subdir: bool indicator whether dest is an existing | |
1745 subdirectory. | |
1746 dst_uri: StorageUri to check. | |
1747 | |
1748 Returns: | |
1749 bool indicator. | |
1750 """ | |
1751 if have_multiple_srcs: | |
1752 # Only a file meets the criteria in this case. | |
1753 return dst_uri.names_file() | |
1754 return not have_existing_dest_subdir and dst_uri.names_singleton() | |
1755 | |
1756 def _IsNoClobberServerException(self, e): | |
1757 """ | |
1758 Checks to see if the server attempted to clobber a file after we specified | |
1759 in the header that we didn't want the file clobbered. | |
1760 | |
1761 Args: | |
1762 e: The Exception that was generated by a failed copy operation | |
1763 | |
1764 Returns: | |
1765 bool indicator - True indicates that the server did attempt to clobber | |
1766 an existing file. | |
1767 """ | |
1768 return self.no_clobber and ( | |
1769 (isinstance(e, GSResponseError) and e.status==412) or | |
1770 (isinstance(e, ResumableUploadException) and 'code 412' in e.message)) | |
1771 | |
1772 def _GetPathBeforeFinalDir(uri): | |
1773 """ | |
1774 Returns the part of the path before the final directory component for the | |
1775 given URI, handling cases for file system directories, bucket, and bucket | |
1776 subdirectories. Example: for gs://bucket/dir/ we'll return 'gs://bucket', | |
1777 and for file://dir we'll return file:// | |
1778 | |
1779 Args: | |
1780 uri: StorageUri. | |
1781 | |
1782 Returns: | |
1783 String name of above-described path, sans final path separator. | |
1784 """ | |
1785 sep = uri.delim | |
1786 assert not uri.names_file() | |
1787 if uri.names_directory(): | |
1788 past_scheme = uri.uri[len('file://'):] | |
1789 if past_scheme.find(sep) == -1: | |
1790 return 'file://' | |
1791 else: | |
1792 return 'file://%s' % past_scheme.rstrip(sep).rpartition(sep)[0] | |
1793 if uri.names_bucket(): | |
1794 return '%s://' % uri.scheme | |
1795 # Else it names a bucket subdir. | |
1796 return uri.uri.rstrip(sep).rpartition(sep)[0] | |
1797 | |
1798 def _hash_filename(filename): | |
1799 """ | |
1800 Apply a hash function (SHA1) to shorten the passed file name. The spec | |
1801 for the hashed file name is as follows: | |
1802 | |
1803 TRACKER_<hash>_<trailing> | |
1804 | |
1805 where hash is a SHA1 hash on the original file name and trailing is | |
1806 the last 16 chars from the original file name. Max file name lengths | |
1807 vary by operating system so the goal of this function is to ensure | |
1808 the hashed version takes fewer than 100 characters. | |
1809 | |
1810 Args: | |
1811 filename: file name to be hashed. | |
1812 | |
1813 Returns: | |
1814 shorter, hashed version of passed file name | |
1815 """ | |
1816 if not isinstance(filename, unicode): | |
1817 filename = unicode(filename, 'utf8').encode('utf-8') | |
1818 m = hashlib.sha1(filename) | |
1819 return "TRACKER_" + m.hexdigest() + '.' + filename[-16:] | |
OLD | NEW |