| OLD | NEW |
| 1 # -*- coding: utf-8 -*- |
| 1 # Copyright 2012 Google Inc. All Rights Reserved. | 2 # Copyright 2012 Google Inc. All Rights Reserved. |
| 2 # | 3 # |
| 3 # Licensed under the Apache License, Version 2.0 (the "License"); | 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 # you may not use this file except in compliance with the License. | 5 # you may not use this file except in compliance with the License. |
| 5 # You may obtain a copy of the License at | 6 # You may obtain a copy of the License at |
| 6 # | 7 # |
| 7 # http://www.apache.org/licenses/LICENSE-2.0 | 8 # http://www.apache.org/licenses/LICENSE-2.0 |
| 8 # | 9 # |
| 9 # Unless required by applicable law or agreed to in writing, software | 10 # Unless required by applicable law or agreed to in writing, software |
| 10 # distributed under the License is distributed on an "AS IS" BASIS, | 11 # distributed under the License is distributed on an "AS IS" BASIS, |
| 11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 # See the License for the specific language governing permissions and | 13 # See the License for the specific language governing permissions and |
| 13 # limitations under the License. | 14 # limitations under the License. |
| 15 """Additional help about using gsutil for production tasks.""" |
| 14 | 16 |
| 15 from gslib.help_provider import HELP_NAME | 17 from __future__ import absolute_import |
| 16 from gslib.help_provider import HELP_NAME_ALIASES | 18 |
| 17 from gslib.help_provider import HELP_ONE_LINE_SUMMARY | |
| 18 from gslib.help_provider import HelpProvider | 19 from gslib.help_provider import HelpProvider |
| 19 from gslib.help_provider import HELP_TEXT | |
| 20 from gslib.help_provider import HelpType | |
| 21 from gslib.help_provider import HELP_TYPE | |
| 22 | 20 |
| 23 _detailed_help_text = (""" | 21 _DETAILED_HELP_TEXT = (""" |
| 24 <B>OVERVIEW</B> | 22 <B>OVERVIEW</B> |
| 25 If you use gsutil in large production tasks (such as uploading or | 23 If you use gsutil in large production tasks (such as uploading or |
| 26 downloading many GB of data each night), there are a number of things | 24 downloading many GBs of data each night), there are a number of things |
| 27 you can do to help ensure success. Specifically, this section discusses | 25 you can do to help ensure success. Specifically, this section discusses |
| 28 how to script large production tasks around gsutil's resumable transfer | 26 how to script large production tasks around gsutil's resumable transfer |
| 29 mechanism. | 27 mechanism. |
| 30 | 28 |
| 31 | 29 |
| 32 <B>BACKGROUND ON RESUMABLE TRANSFERS</B> | 30 <B>BACKGROUND ON RESUMABLE TRANSFERS</B> |
| 33 First, it's helpful to understand gsutil's resumable transfer mechanism, | 31 First, it's helpful to understand gsutil's resumable transfer mechanism, |
| 34 and how your script needs to be implemented around this mechanism to work | 32 and how your script needs to be implemented around this mechanism to work |
| 35 reliably. gsutil uses the resumable transfer support in the boto library | 33 reliably. gsutil uses resumable transfer support when you attempt to upload |
| 36 when you attempt to upload or download a file larger than a configurable | 34 or download a file larger than a configurable threshold (by default, this |
| 37 threshold (by default, this threshold is 2 MB). When a transfer fails | 35 threshold is 2 MB). When a transfer fails partway through (e.g., because of |
| 38 partway through (e.g., because of an intermittent network problem), | 36 an intermittent network problem), gsutil uses a truncated randomized binary |
| 39 boto uses a randomized binary exponential backoff-and-retry strategy: | 37 exponential backoff-and-retry strategy that by default will retry transfers up |
| 40 wait a random period between [0..1] seconds and retry; if that fails, | 38 to 6 times over a 63-second period of time (see "gsutil help retries" for |
| 41 wait a random period between [0..2] seconds and retry; and if that | 39 details). If the transfer fails each of these attempts with no intervening |
| 42 fails, wait a random period between [0..4] seconds, and so on, up to a | 40 progress, gsutil gives up on the transfer, but keeps a "tracker" file for |
| 43 configurable number of times (the default is 6 times). Thus, the retry | 41 it in a configurable location (the default location is ~/.gsutil/, in a file |
| 44 actually spans a randomized period up to 1+2+4+8+16+32=63 seconds. | 42 named by a combination of the SHA1 hash of the name of the bucket and object |
| 45 | 43 being transferred and the last 16 characters of the file name). When transfers |
| 46 If the transfer fails each of these attempts with no intervening | 44 fail in this fashion, you can rerun gsutil at some later time (e.g., after |
| 47 progress, gsutil gives up on the transfer, but keeps a "tracker" file | 45 the networking problem has been resolved), and the resumable transfer picks |
| 48 for it in a configurable location (the default location is ~/.gsutil/, | 46 up where it left off. |
| 49 in a file named by a combination of the SHA1 hash of the name of the | |
| 50 bucket and object being transferred and the last 16 characters of the | |
| 51 file name). When transfers fail in this fashion, you can rerun gsutil | |
| 52 at some later time (e.g., after the networking problem has been | |
| 53 resolved), and the resumable transfer picks up where it left off. | |
| 54 | 47 |
| 55 | 48 |
| 56 <B>SCRIPTING DATA TRANSFER TASKS</B> | 49 <B>SCRIPTING DATA TRANSFER TASKS</B> |
| 57 To script large production data transfer tasks around this mechanism, | 50 To script large production data transfer tasks around this mechanism, |
| 58 you can implement a script that runs periodically, determines which file | 51 you can implement a script that runs periodically, determines which file |
| 59 transfers have not yet succeeded, and runs gsutil to copy them. Below, | 52 transfers have not yet succeeded, and runs gsutil to copy them. Below, |
| 60 we offer a number of suggestions about how this type of scripting should | 53 we offer a number of suggestions about how this type of scripting should |
| 61 be implemented: | 54 be implemented: |
| 62 | 55 |
| 63 1. When resumable transfers fail without any progress 6 times in a row | 56 1. When resumable transfers fail without any progress 6 times in a row |
| (...skipping 65 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 129 multi-processing) option. Be aware, however, that gsutil doesn't attempt to | 122 multi-processing) option. Be aware, however, that gsutil doesn't attempt to |
| 130 keep track of which files were downloaded successfully in cases where some | 123 keep track of which files were downloaded successfully in cases where some |
| 131 files failed to download. For example, if you use multi-threaded transfers | 124 files failed to download. For example, if you use multi-threaded transfers |
| 132 to download 100 files and 3 failed to download, it is up to your scripting | 125 to download 100 files and 3 failed to download, it is up to your scripting |
| 133 process to determine which transfers didn't succeed, and retry them. A | 126 process to determine which transfers didn't succeed, and retry them. A |
| 134 periodic check-and-run approach like outlined earlier would handle this | 127 periodic check-and-run approach like that outlined earlier would handle this |
| 135 case. | 128 case. |
| 136 | 129 |
| 137 If you use parallel transfers (gsutil -m) you might want to experiment with | 130 If you use parallel transfers (gsutil -m) you might want to experiment with |
| 138 the number of threads being used (via the parallel_thread_count setting | 131 the number of threads being used (via the parallel_thread_count setting |
| 139 in the .boto config file). By default, gsutil uses 24 threads. Depending | 132 in the .boto config file). By default, gsutil uses 10 threads for Linux |
| 140 on your network speed, available memory, CPU load, and other conditions, | 133 and 24 threads for other operating systems. Depending on your network |
| 141 this may or may not be optimal. Try experimenting with higher or lower | 134 speed, available memory, CPU load, and other conditions, this may or may |
| 142 numbers of threads, to find the best number of threads for your | 135 not be optimal. Try experimenting with higher or lower numbers of threads |
| 143 environment. | 136 to find the best number of threads for your environment. |
| 144 | 137 |
| 145 <B>RUNNING GSUTIL ON MULTIPLE MACHINES</B> | 138 <B>RUNNING GSUTIL ON MULTIPLE MACHINES</B> |
| 146 When running gsutil on multiple machines that are all attempting to use the | 139 When running gsutil on multiple machines that are all attempting to use the |
| 147 same OAuth2 refresh token, it is possible to encounter rate limiting errors | 140 same OAuth2 refresh token, it is possible to encounter rate limiting errors |
| 148 for the refresh requests (especially if all of these machines are likely to | 141 for the refresh requests (especially if all of these machines are likely to |
| 149 start running gsutil at the same time). To account for this, gsutil will | 142 start running gsutil at the same time). To account for this, gsutil will |
| 150 automatically retry OAuth2 refresh requests with a randomized exponential | 143 automatically retry OAuth2 refresh requests with a truncated randomized |
| 151 backoff strategy like that which is described in the | 144 exponential backoff strategy like the one described in the |
| 152 "BACKGROUND ON RESUMABLE TRANSFERS" section above. The number of retries | 145 "BACKGROUND ON RESUMABLE TRANSFERS" section above. The number of retries |
| 153 attempted for OAuth2 refresh requests can be controlled via the | 146 attempted for OAuth2 refresh requests can be controlled via the |
| 154 "oauth2_refresh_retries" variable in the .boto config file. | 147 "oauth2_refresh_retries" variable in the .boto config file. |
| 155 """) | 148 """) |
| 156 | 149 |
| 157 | 150 |
| 158 class CommandOptions(HelpProvider): | 151 class CommandOptions(HelpProvider): |
| 159 """Additional help about using gsutil for production tasks.""" | 152 """Additional help about using gsutil for production tasks.""" |
| 160 | 153 |
| 161 help_spec = { | 154 # Help specification. See help_provider.py for documentation. |
| 162 # Name of command or auxiliary help info for which this help applies. | 155 help_spec = HelpProvider.HelpSpec( |
| 163 HELP_NAME : 'prod', | 156 help_name='prod', |
| 164 # List of help name aliases. | 157 help_name_aliases=[ |
| 165 HELP_NAME_ALIASES : ['production', 'resumable', 'resumable upload', | 158 'production', 'resumable', 'resumable upload', 'resumable transfer', |
| 166 'resumable transfer', 'resumable download', | 159 'resumable download', 'scripts', 'scripting'], |
| 167 'scripts', 'scripting'], | 160 help_type='additional_help', |
| 168 # Type of help: | 161 help_one_line_summary='Scripting Production Transfers', |
| 169 HELP_TYPE : HelpType.ADDITIONAL_HELP, | 162 help_text=_DETAILED_HELP_TEXT, |
| 170 # One line summary of this help. | 163 subcommand_help_text={}, |
| 171 HELP_ONE_LINE_SUMMARY : 'Scripting Production Transfers', | 164 ) |
| 172 # The full help text. | |
| 173 HELP_TEXT : _detailed_help_text, | |
| 174 } | |
| OLD | NEW |