| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env python | |
| 2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 3 # Use of this source code is governed by a BSD-style license that can be | |
| 4 # found in the LICENSE file. | |
| 5 | |
| 6 """This module contains utilities related to Google Storage manipulations. | |
| 7 | |
| 8 TODO(epoger): Replace this old gs_utils.py with a new one, within the common | |
| 9 repo, that uses google-api-python-client rather than the gsutil tool. | |
| 10 See http://skbug.com/2618 ('buildbot code: use google-api-python-client instead | |
| 11 of gsutil tool') | |
| 12 """ | |
| 13 | |
| 14 import hashlib | |
| 15 import os | |
| 16 import posixpath | |
| 17 import re | |
| 18 import shutil | |
| 19 import tempfile | |
| 20 import time | |
| 21 | |
| 22 from py.utils import shell_utils | |
| 23 from slave import slave_utils | |
| 24 | |
| 25 import file_utils | |
| 26 | |
| 27 | |
# Google Storage base URL; presumably the default upload destination for
# callers of this module (not referenced within this file) -- TODO confirm.
DEFAULT_DEST_GSBASE = 'gs://chromium-skia-gm'

# Names of the marker files stored alongside uploaded directory contents.
# The STARTED/COMPLETED pair is written by
# upload_directory_contents_if_changed(); the COMPLETED file is what
# _are_timestamps_equal() compares between local disk and Google Storage.
TIMESTAMP_STARTED_FILENAME = 'TIMESTAMP_LAST_UPLOAD_STARTED'
TIMESTAMP_COMPLETED_FILENAME = 'TIMESTAMP_LAST_UPLOAD_COMPLETED'
LAST_REBASELINED_BY_FILENAME = 'LAST_REBASELINED_BY'

# Maximum number of local files handed to a single "gsutil cp" invocation when
# uploading in chunks.
FILES_CHUNK = 500
# Number of bytes read per iteration when hashing a local file.
BUFSIZE = 64 * 1024

# Captures the ETag value from the output of "gsutil ls -L".
ETAG_REGEX = re.compile(r'ETag:\s*(\S+)')
| 37 | |
| 38 | |
def delete_storage_object(object_name):
  """Remove an object from Google Storage by running "gsutil rm -R".

  params:
    object_name: GS URL (gs://BUCKETNAME/PATH) of the object to remove; it is
        passed to gsutil unchanged.
  """
  rm_command = [slave_utils.GSUtilSetup(), 'rm', '-R', object_name]
  print('Running command: %s' % rm_command)
  shell_utils.run(rm_command)
| 46 | |
| 47 | |
def upload_file(local_src_path, remote_dest_path, gs_acl='private',
                http_header_lines=None, only_if_modified=False):
  """Copy a single local file up to Google Storage.

  params:
    local_src_path: path to file on local disk
    remote_dest_path: GS URL (gs://BUCKETNAME/PATH)
    gs_acl: which predefined ACL to apply to the file on Google Storage; see
        https://developers.google.com/storage/docs/accesscontrol#extension
    http_header_lines: a list of HTTP header strings to add, if any
    only_if_modified: if True, skip the upload when the ETag reported by
        "gsutil ls -L" for remote_dest_path equals the MD5 of local_src_path.
        The file is still uploaded if remote_dest_path cannot be listed, has
        no ETag, or has a different hash. Note that this may take longer than
        just uploading the file without checking first, due to the extra
        round-trip!

  TODO(epoger): Consider adding a do_compress parameter that would compress
  the file using gzip before upload, and add a "Content-Encoding:gzip" header
  so that HTTP downloads of the file would be unzipped automatically.
  See https://developers.google.com/storage/docs/gsutil/addlhelp/
      WorkingWithObjectMetadata#content-encoding
  """
  gsutil = slave_utils.GSUtilSetup()

  if only_if_modified:
    try:
      listing = shell_utils.run([gsutil, 'ls', '-L', remote_dest_path])
    except shell_utils.CommandFailedException:
      # remote_dest_path probably does not exist. Go ahead and do the upload.
      pass
    else:
      etag_match = ETAG_REGEX.search(listing)
      if etag_match:
        # TODO(epoger): In testing, the ETag has always been an MD5 hash that
        # is comparable to local_md5 below. But according to
        # https://developers.google.com/storage/docs/hashes-etags , that is
        # not something we can always rely on ("composite objects don't
        # support MD5 hashes"). It would be good to find a more reliable hash,
        # but there is no known way to get one out of gsutil yet.
        #
        # For now: if the remote hash is not found, or is computed in such a
        # way that it differs from local_md5, we re-upload the file even if it
        # did not change.
        remote_md5 = etag_match.group(1)
        digest = hashlib.md5()
        with open(local_src_path, 'rb') as src_file:
          # Hash in BUFSIZE pieces so large files are never fully in memory.
          for piece in iter(lambda: src_file.read(BUFSIZE), b''):
            digest.update(piece)
        local_md5 = digest.hexdigest()
        if local_md5 == remote_md5:
          print('local_src_path %s and remote_dest_path %s have same hash %s' %
                (local_src_path, remote_dest_path, local_md5))
          return

  upload_command = [gsutil]
  for header in http_header_lines or []:
    upload_command.extend(['-h', header])
  upload_command.extend(['cp', '-a', gs_acl, local_src_path, remote_dest_path])
  print('Running command: %s' % upload_command)
  shell_utils.run(upload_command)
| 114 | |
| 115 | |
def upload_dir_contents(local_src_dir, remote_dest_dir, gs_acl='private',
                        http_header_lines=None):
  """Upload every file under a local directory to Google Storage.

  params:
    local_src_dir: directory on local disk to upload contents of
    remote_dest_dir: GS URL (gs://BUCKETNAME/PATH)
    gs_acl: which predefined ACL to apply to the files on Google Storage; see
        https://developers.google.com/storage/docs/accesscontrol#extension
    http_header_lines: a list of HTTP header strings to add, if any

  The copy operates as a "merge with overwrite": any files in src_dir will be
  "overlaid" on top of the existing content in dest_dir.  Existing files with
  the same names will be overwritten.

  Each file is uploaded with its own gsutil call.  This takes longer than
  calling "gsutil -m cp -R <source> <dest>", which can perform the uploads in
  parallel... but in http://skbug.com/2618 ('The Case of the Missing
  Mandrills') we figured out that was silently failing in some cases!

  TODO(epoger): Use the google-api-python-client API, like we do in
  https://skia.googlesource.com/skia/+/master/tools/pyutils/gs_utils.py ,
  rather than calling out to the gsutil tool.  See http://skbug.com/2618

  TODO(epoger): Upload multiple files simultaneously to reduce latency.

  TODO(epoger): Add a "noclobber" mode that will not upload any files that
  would overwrite existing files in Google Storage.

  TODO(epoger): Consider adding a do_compress parameter that would compress
  the file using gzip before upload, and add a "Content-Encoding:gzip" header
  so that HTTP downloads of the file would be unzipped automatically.
  See https://developers.google.com/storage/docs/gsutil/addlhelp/
      WorkingWithObjectMetadata#content-encoding
  """
  # Everything up to (but excluding) the per-file source and destination.
  base_command = [slave_utils.GSUtilSetup()]
  for header in http_header_lines or []:
    base_command.extend(['-h', header])
  base_command.extend(['cp', '-a', gs_acl])

  src_root = os.path.abspath(local_src_dir)
  for src_dirpath, _, filenames in os.walk(src_root):
    # Files directly inside local_src_dir go straight into remote_dest_dir;
    # files in subdirs get the (posix-style) relative subdir appended.
    dest_dirpath = remote_dest_dir
    if src_dirpath != src_root:
      relative_subdir = os.path.relpath(src_dirpath, src_root)
      dest_dirpath = posixpath.join(
          remote_dest_dir, _convert_to_posixpath(relative_subdir))
    for filename in sorted(filenames):
      shell_utils.run(base_command + [
          os.path.join(src_dirpath, filename),
          posixpath.join(dest_dirpath, filename)])
| 174 | |
| 175 | |
def download_dir_contents(remote_src_dir, local_dest_dir, multi=True):
  """Recursively copy a Google Storage directory down to local disk.

  params:
    remote_src_dir: GS URL (gs://BUCKETNAME/PATH)
    local_dest_dir: directory on local disk to write the contents into
    multi: boolean; whether to perform the copy in multithreaded mode
        (adds gsutil's "-m" flag).

  The copy operates as a "merge with overwrite": any files in src_dir will be
  "overlaid" on top of the existing content in dest_dir.  Existing files with
  the same names will be overwritten.
  """
  global_flags = ['-m'] if multi else []
  cp_command = ([slave_utils.GSUtilSetup()] + global_flags +
                ['cp', '-R', remote_src_dir, local_dest_dir])
  print('Running command: %s' % cp_command)
  shell_utils.run(cp_command)
| 195 | |
| 196 | |
def copy_dir_contents(remote_src_dir, remote_dest_dir, gs_acl='private',
                      http_header_lines=None):
  """Recursively copy one Google Storage directory into another.

  params:
    remote_src_dir: source GS URL (gs://BUCKETNAME/PATH)
    remote_dest_dir: dest GS URL (gs://BUCKETNAME/PATH)
    gs_acl: which predefined ACL to apply to the new files; see
        https://developers.google.com/storage/docs/accesscontrol#extension
    http_header_lines: a list of HTTP header strings to add, if any

  The copy operates as a "merge with overwrite": any files in src_dir will be
  "overlaid" on top of the existing content in dest_dir.  Existing files with
  the same names will be overwritten.

  The copy always runs in multithreaded mode ("-m"), in case there are a large
  number of files.
  """
  cp_command = [slave_utils.GSUtilSetup(), '-m']
  for header in http_header_lines or []:
    cp_command.extend(['-h', header])
  cp_command.extend(
      ['cp', '-a', gs_acl, '-R', remote_src_dir, remote_dest_dir])
  print('Running command: %s' % cp_command)
  shell_utils.run(cp_command)
| 223 | |
| 224 | |
def move_storage_directory(src_dir, dest_dir):
  """Move a directory on Google Storage by running "gsutil mv -p".

  params:
    src_dir: GS URL of the directory to move.
    dest_dir: GS URL to move it to.
  """
  mv_command = [slave_utils.GSUtilSetup(), 'mv', '-p', src_dir, dest_dir]
  print('Running command: %s' % mv_command)
  shell_utils.run(mv_command)
| 232 | |
| 233 | |
def list_storage_directory(dest_gsbase, subdir):
  """List the contents of the specified Storage directory.

  params:
    dest_gsbase: Google Storage base URL (e.g. gs://BUCKETNAME).
    subdir: path, relative to dest_gsbase, of the directory to list.

  Returns a sorted list of the distinct listing lines that start with (but are
  not equal to) the joined directory URL.  Sorting keeps the result
  deterministic from run to run; de-duplicating through a set alone yields an
  arbitrary order.

  Raises Exception if the listing command reports a nonzero status.
  """
  gsbase_subdir = posixpath.join(dest_gsbase, subdir)
  status, output_gsutil_ls = slave_utils.GSUtilListBucket(gsbase_subdir, [])
  if status != 0:
    raise Exception(
        'Could not list contents of %s in Google Storage!' % gsbase_subdir)

  # Keep only real entries: lines that do not begin with the directory URL are
  # warnings and status messages, and the directory URL itself is not an
  # entry.
  entries = set(
      line for line in output_gsutil_ls.splitlines()
      if line and line.startswith(gsbase_subdir) and line != gsbase_subdir)
  return sorted(entries)
| 248 | |
| 249 | |
def does_storage_object_exist(object_name):
  """Report whether an object exists on Google Storage.

  Runs "gsutil ls" on object_name; returns True if the command succeeds and
  False if it raises shell_utils.CommandFailedException.
  """
  ls_command = [slave_utils.GSUtilSetup(), 'ls', object_name]
  print('Running command: %s' % ls_command)
  try:
    shell_utils.run(ls_command)
  except shell_utils.CommandFailedException:
    return False
  return True
| 264 | |
| 265 | |
def download_directory_contents_if_changed(gs_base, gs_relative_dir, local_dir):
  """Download a GS directory unless TIMESTAMP_LAST_UPLOAD_COMPLETED matches.

  If the local and remote timestamps already agree, nothing is downloaded.
  Otherwise local_dir is recreated clean, the remote directory contents are
  copied into it, and the timestamps are compared again; an Exception is raised
  if they still differ.

  The goal of download_directory_contents_if_changed and
  upload_directory_contents_if_changed is to attempt to replicate directory
  level rsync functionality to the Google Storage directories we care about.
  """
  if _are_timestamps_equal(gs_base, gs_relative_dir, local_dir):
    print('\n\n=======Local directory is current=======\n\n')
    return

  file_utils.create_clean_local_dir(local_dir)
  gs_source = posixpath.join(gs_base, gs_relative_dir, '*')
  slave_utils.GSUtilDownloadFile(src=gs_source, dst=local_dir)
  # The download status is not inspected; success is judged by whether the
  # freshly downloaded timestamp file now matches the remote one.
  if not _are_timestamps_equal(gs_base, gs_relative_dir, local_dir):
    raise Exception('Failed to download from GS: %s' % gs_source)
| 281 | |
| 282 | |
| 283 def _get_chunks(seq, n): | |
| 284 """Yield successive n-sized chunks from the specified sequence.""" | |
| 285 for i in xrange(0, len(seq), n): | |
| 286 yield seq[i:i+n] | |
| 287 | |
| 288 | |
def delete_directory_contents(gs_base, gs_relative_dir, files_to_delete):
  """Delete files from (or all of) a Google Storage directory.

  Args:
    gs_base: str - The Google Storage base. Eg: gs://rmistry.
    gs_relative_dir: str - Relative directory to the Google Storage base.
    files_to_delete: Names, relative to the Google Storage directory, of the
        files to delete. They are deleted one at a time. If files_to_delete is
        None or empty then the whole directory is deleted.
  """
  gs_dest = posixpath.join(gs_base, gs_relative_dir)
  if not files_to_delete:
    delete_storage_object(gs_dest)
    return
  for name in files_to_delete:
    delete_storage_object(object_name=posixpath.join(gs_dest, name))
| 305 | |
| 306 | |
def upload_directory_contents_if_changed(gs_base, gs_relative_dir, gs_acl,
                                         local_dir, force_upload=False,
                                         upload_chunks=False,
                                         files_to_upload=None):
  """Upload local_dir to GS unless TIMESTAMP_LAST_UPLOAD_COMPLETED matches.

  Args:
    gs_base: str - The Google Storage base. Eg: gs://rmistry.
    gs_relative_dir: str - Relative directory to the Google Storage base.
    gs_acl: str - ACL to use when uploading to Google Storage.
    local_dir: str - The local directory to upload.
    force_upload: bool - Whether upload should be done regardless of timestamps
        matching or not.
    upload_chunks: bool - Whether upload should be done in chunks or in a single
        command.
    files_to_upload: str seq - Specific files that should be uploaded, if not
        specified then all files in local_dir are uploaded. If upload_chunks is
        True then files will be uploaded in chunks else they will be uploaded
        one at a time. The Google Storage directory is not cleaned before upload
        if files_to_upload is specified.

  The same timestamp value is written to the TIMESTAMP_LAST_UPLOAD_STARTED file
  before the upload and to the TIMESTAMP_LAST_UPLOAD_COMPLETED file after it
  (both locally and on Google Storage).

  The goal of download_directory_contents_if_changed and
  upload_directory_contents_if_changed is to attempt to replicate directory
  level rsync functionality to the Google Storage directories we care about.

  Returns True if contents were uploaded, else returns False.

  Raises Exception if any upload command fails.
  """
  if not force_upload and _are_timestamps_equal(gs_base, gs_relative_dir,
                                                local_dir):
    print('\n\n=======Local directory is current=======\n\n')
    return False

  gs_dest = posixpath.join(gs_base, gs_relative_dir)
  timestamp_value = time.time()

  if not files_to_upload:
    # A full upload replaces the remote directory wholesale.
    print('\n\n=======Delete Storage directory before uploading=======\n\n')
    delete_storage_object(gs_dest)

  print('\n\n=======Writing new TIMESTAMP_LAST_UPLOAD_STARTED=======\n\n')
  write_timestamp_file(
      timestamp_file_name=TIMESTAMP_STARTED_FILENAME,
      timestamp_value=timestamp_value, gs_base=gs_base,
      gs_relative_dir=gs_relative_dir, local_dir=local_dir, gs_acl=gs_acl)

  # NOTE(review): the non-chunked branches below upload by calling
  # slave_utils.GSUtilDownloadFile with a local src and a GS dst; presumably
  # it is a generic copy wrapper despite its name -- confirm.
  if upload_chunks:
    # The directory is listed after the STARTED timestamp file has been
    # written into local_dir, so that file is part of a full chunked upload.
    names = files_to_upload if files_to_upload else os.listdir(local_dir)
    local_files = [os.path.join(local_dir, name) for name in names]
    for files_chunk in _get_chunks(local_files, FILES_CHUNK):
      gsutil = slave_utils.GSUtilSetup()
      try:
        shell_utils.run([gsutil, 'cp'] + files_chunk + [gs_dest])
      except shell_utils.CommandFailedException:
        raise Exception(
            'Could not upload the chunk to Google Storage! The chunk: %s'
            % files_chunk)
  elif files_to_upload:
    for file_to_upload in files_to_upload:
      copy_status = slave_utils.GSUtilDownloadFile(
          src=os.path.join(local_dir, file_to_upload), dst=gs_dest)
      if copy_status != 0:
        raise Exception(
            'Could not upload %s to Google Storage!' % file_to_upload)
  else:
    local_src = os.path.join(local_dir, '*')
    if slave_utils.GSUtilDownloadFile(src=local_src, dst=gs_dest) != 0:
      raise Exception('Could not upload %s to Google Storage!' % local_src)

  print('\n\n=======Writing new TIMESTAMP_LAST_UPLOAD_COMPLETED=======\n\n')
  write_timestamp_file(
      timestamp_file_name=TIMESTAMP_COMPLETED_FILENAME,
      timestamp_value=timestamp_value, gs_base=gs_base,
      gs_relative_dir=gs_relative_dir, local_dir=local_dir, gs_acl=gs_acl)
  return True
| 388 | |
| 389 | |
def _are_timestamps_equal(gs_base, gs_relative_dir, local_dir):
  """Compares the local TIMESTAMP with the TIMESTAMP from Google Storage.

  params:
    gs_base: Google Storage base URL.
    gs_relative_dir: directory, relative to gs_base, holding the remote
        TIMESTAMP_LAST_UPLOAD_COMPLETED file.
    local_dir: local directory holding the local copy of that file.

  Returns False if the local timestamp file does not exist; otherwise returns
  whether the whitespace-stripped contents of the local and remote timestamp
  files are identical.
  """
  local_timestamp_file = os.path.join(local_dir, TIMESTAMP_COMPLETED_FILENAME)
  # Make sure that the local TIMESTAMP file exists.
  if not os.path.exists(local_timestamp_file):
    return False

  # Get the timestamp file from Google Storage.
  src = posixpath.join(gs_base, gs_relative_dir, TIMESTAMP_COMPLETED_FILENAME)
  # mkstemp() returns an open OS-level descriptor along with the path.  Close
  # it right away (we only need the path as a download target) so that it is
  # not leaked, and delete the temp file once we are done with it.
  temp_fd, temp_file = tempfile.mkstemp()
  os.close(temp_fd)
  try:
    slave_utils.GSUtilDownloadFile(src=src, dst=temp_file)
    with open(local_timestamp_file, 'r') as local_file_obj:
      local_timestamp = local_file_obj.read().strip()
    with open(temp_file, 'r') as storage_file_obj:
      storage_timestamp = storage_file_obj.read().strip()
  finally:
    if os.path.exists(temp_file):
      os.remove(temp_file)
  return local_timestamp == storage_timestamp
| 412 | |
| 413 | |
def read_timestamp_file(timestamp_file_name, gs_base, gs_relative_dir):
  """Reads the specified TIMESTAMP file from the specified GS dir.

  params:
    timestamp_file_name: name of the timestamp file within the GS dir.
    gs_base: Google Storage base URL.
    gs_relative_dir: directory, relative to gs_base, holding the file.

  Returns the whitespace-stripped contents of the file as a string, or the
  string "0" if the downloaded file is empty (the download's status is not
  checked, so a missing remote file presumably also ends up here).
  """
  src = posixpath.join(gs_base, gs_relative_dir, timestamp_file_name)
  # mkstemp() returns an open OS-level descriptor along with the path.  Close
  # it right away (we only need the path as a download target) so that it is
  # not leaked, and delete the temp file once we are done with it.
  temp_fd, temp_file = tempfile.mkstemp()
  os.close(temp_fd)
  try:
    slave_utils.GSUtilDownloadFile(src=src, dst=temp_file)
    with open(temp_file, 'r') as storage_file_obj:
      timestamp_value = storage_file_obj.read().strip()
  finally:
    if os.path.exists(temp_file):
      os.remove(temp_file)
  return timestamp_value if timestamp_value else "0"
| 429 | |
| 430 | |
def write_timestamp_file(timestamp_file_name, timestamp_value, gs_base=None,
                         gs_relative_dir=None, gs_acl=None, local_dir=None):
  """Write a timestamp file to Google Storage and/or a local directory.

  The value is first written, as str(timestamp_value), to a file named
  timestamp_file_name inside the system temp directory.  That file is then
  copied into local_dir if local_dir is provided, and copied to Google Storage
  if gs_base, gs_relative_dir and gs_acl are all provided.
  """
  staged_path = os.path.join(tempfile.gettempdir(), timestamp_file_name)
  with open(staged_path, 'w') as staged_file:
    staged_file.write(str(timestamp_value))

  if local_dir:
    shutil.copyfile(staged_path, os.path.join(local_dir, timestamp_file_name))
  if gs_base and gs_relative_dir and gs_acl:
    slave_utils.GSUtilCopyFile(
        filename=staged_path, gs_base=gs_base, subdir=gs_relative_dir,
        gs_acl=gs_acl)
| 451 | |
| 452 | |
| 453 def _convert_to_posixpath(localpath): | |
| 454 """Convert localpath to posix format.""" | |
| 455 if os.sep == '/': | |
| 456 return localpath | |
| 457 else: | |
| 458 return '/'.join(localpath.split(os.sep)) | |
| OLD | NEW |