# Copyright (c) 2013 The WebM project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
# This simple script pulls test files from the webm homepage.
# It is intelligent enough to only pull a file if:
#   1) the file / test_data folder does not exist, or
#   2) its SHA does not match the expected one.
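#
# Typical invocation (this mirrors the usage string printed below):
#   python get_files.py -u <url> -i <input_csv> -o <output_dir>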

import pycurl
import csv
import hashlib
import re
import os.path
import time
import itertools
import sys
import getopt

#globals
url = ''
file_list_path = ''
local_resource_path = ''

# Helper functions:
# A simple function which returns the sha hash of a file in hex
def get_file_sha(filename):
    try:
        sha_hash = hashlib.sha1()
        with open(filename, 'rb') as file:
            buf = file.read(HASH_CHUNK)
            while len(buf) > 0:
                sha_hash.update(buf)
                buf = file.read(HASH_CHUNK)
        return sha_hash.hexdigest()
    except IOError:
        print "Error reading " + filename

# Downloads a file from a url, and then checks the sha against the passed
# in sha
def download_and_check_sha(url, filename, sha):
    path = os.path.join(local_resource_path, filename)
    fp = open(path, "wb")
    curl = pycurl.Curl()
    curl.setopt(pycurl.URL, url + "/" + filename)
    curl.setopt(pycurl.WRITEDATA, fp)
    curl.perform()
    curl.close()
    fp.close()
    return get_file_sha(path) == sha

#constants
ftp_retries = 3     # download attempts per file

SHA_COL = 0         # column holding the expected sha1 of a file
NAME_COL = 1        # column holding the file name
EXPECTED_COL = 2    # number of columns a valid row must have
HASH_CHUNK = 65536  # bytes read per iteration when hashing

# Main script
try:
    opts, args = \
        getopt.getopt(sys.argv[1:], \
                      "u:i:o:", ["url=", "input_csv=", "output_dir="])
except getopt.GetoptError:
    print 'get_files.py -u <url> -i <input_csv> -o <output_dir>'
    sys.exit(2)

for opt, arg in opts:
    if opt in ("-u", "--url"):
        url = arg
    elif opt in ("-i", "--input_csv"):
        file_list_path = os.path.join(arg)
    elif opt in ("-o", "--output_dir"):
        local_resource_path = os.path.join(arg)

if len(sys.argv) != 7:
    print "Expects two paths and a url!"
    sys.exit(1)

if not os.path.isdir(local_resource_path):
    os.makedirs(local_resource_path)

file_list_csv = open(file_list_path, "rb")

# Our 'csv' file uses multiple spaces as a delimiter. Python's csv class
# only supports single-character delimiters, so we collapse the runs of
# spaces below before handing each line to the reader.
file_list_reader = csv.reader((re.sub(' +', ' ', line) \
                               for line in file_list_csv), delimiter=' ')
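# For example, a row like the following (purely illustrative, not a real
# entry from the file list):
#   'da39a3ee5e6b4b0d3255bfef95601890afd80709    some_clip.yuv'
# is collapsed to single spaces and split by the reader into
#   ['da39a3ee5e6b4b0d3255bfef95601890afd80709', 'some_clip.yuv']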

file_shas = []
file_names = []

for row in file_list_reader:
    if len(row) != EXPECTED_COL:
        continue
    file_shas.append(row[SHA_COL])
    file_names.append(row[NAME_COL])

file_list_csv.close()

# Download a file only if it doesn't already exist locally with the
# correct sha.
for filename, sha in itertools.izip(file_names, file_shas):
    path = os.path.join(local_resource_path, filename)
    if os.path.isfile(path) \
            and get_file_sha(path) == sha:
        print path + ' exists, skipping'
        continue
    for retry in range(0, ftp_retries):
        print "Downloading " + path
        if not download_and_check_sha(url, filename, sha):
            print "Sha does not match, retrying..."
        else:
            break
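
The download loop above consumes a plain file list in which each row holds a sha1 and a file name separated by runs of spaces. As a rough companion sketch, not part of the script itself, such a list could be produced from a local directory along these lines (the 'test_data' directory name and the two-space separator are illustrative assumptions):

import hashlib
import os

def print_sha_list(directory):
    # Emit "<sha1>  <filename>" rows in the layout the reader above expects.
    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        sha = hashlib.sha1()
        with open(path, 'rb') as f:
            buf = f.read(65536)
            while len(buf) > 0:
                sha.update(buf)
                buf = f.read(65536)
        print sha.hexdigest() + '  ' + name

print_sha_list('test_data')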