| Index: py/utils/anyfile_utils.py | 
| diff --git a/py/utils/anyfile_utils.py b/py/utils/anyfile_utils.py | 
| new file mode 100644 | 
| index 0000000000000000000000000000000000000000..23c6c51ca2042893d04d904b5bc95ff4605d327e | 
| --- /dev/null | 
| +++ b/py/utils/anyfile_utils.py | 
| @@ -0,0 +1,151 @@ | 
| +#!/usr/bin/python | 
| + | 
| +""" | 
| +Copyright 2014 Google Inc. | 
| + | 
| +Use of this source code is governed by a BSD-style license that can be | 
| +found in the LICENSE file. | 
| + | 
| +Utilities for uniformly handling files whether they are local or remote. | 
| + | 
| +EPOGER: Add unittests | 
| +""" | 
| + | 
| +# System-level imports | 
| +import contextlib | 
| +import os | 
| +import shutil | 
| +import urllib | 
| +import urlparse | 
| + | 
| +# Local imports | 
| +import gs_utils | 
| + | 
| + | 
class AnyFileUtils(object):
  """Utilities for uniformly handling files whether they are local or remote.

  A file may be referred to by an http:/https: URL, a gs: URL, a file: URL,
  or a plain local path.  create_file_object() wraps any of those in the
  matching BaseFileObject subclass; copy_file() copies between them.
  """

  class BaseFileObject(object):
    """Base object used for all file types."""
    pass

  class GsFile(BaseFileObject):
    """A file referred to by a gs: URL.

    The bucket, path, and HTTP URL are computed on first access and cached.
    """

    def __init__(self, gs_url):
      """Params:
        gs_url: URL of the file, in the form accepted by
            gs_utils.GSUtils.split_gs_url().
      """
      self._gs_url = gs_url
      self._bucket = None
      self._path = None
      self._url = None

    def _split_gs_url(self):
      """Fills in self._bucket and self._path from self._gs_url."""
      (self._bucket, self._path) = gs_utils.GSUtils.split_gs_url(self._gs_url)

    @property
    def bucket(self):
      """Bucket component, as returned by gs_utils.GSUtils.split_gs_url()."""
      if self._bucket is None:
        self._split_gs_url()
      return self._bucket

    @property
    def path(self):
      """Path component, as returned by gs_utils.GSUtils.split_gs_url()."""
      if self._path is None:
        self._split_gs_url()
      return self._path

    @property
    def url(self):
      """HTTP URL for this file on <bucket>.commondatastorage.googleapis.com.

      The path is appended only if it is non-empty.
      """
      if self._url is None:
        self._url = 'http://%s.commondatastorage.googleapis.com' % self.bucket
        if self.path:
          self._url += '/' + self.path
      return self._url

  class HttpFile(BaseFileObject):
    """A file referred to by an http: or https: URL."""

    def __init__(self, url):
      self._url = url

    @property
    def url(self):
      """The URL this object was constructed with, unmodified."""
      return self._url

  class LocalFile(BaseFileObject):
    """A file on local disk, referred to by a file: URL or a filesystem path."""

    def __init__(self, url=None, path=None):
      """Must be constructed with EITHER url OR path, but not both."""
      assert (url is None) != (path is None), (
          'exactly one of url or path must be specified')
      self._url = url
      if path is None:
        # Derived lazily from the URL the first time abspath is read.
        self._abspath = None
      else:
        self._abspath = os.path.abspath(path)

    @property
    def abspath(self):
      """Absolute local path; derived from the file: URL if none was given."""
      if self._abspath is None:
        self._abspath = os.path.abspath(
            urllib.url2pathname(urlparse.urlparse(self._url).path))
      return self._abspath

    @property
    def url(self):
      """file: URL for this file; derived from the path if none was given."""
      if self._url is None:
        # No URL means the constructor was given a path, so _abspath is set.
        self._url = urlparse.urljoin(
            'file:', urllib.pathname2url(self._abspath))
      return self._url

  def __init__(self, boto_file_path=None):
    """Constructor.

    Params:
      boto_file_path: full path (local-OS-style) on local disk where .boto
          credentials file can be found.  If None, then the AnyFileUtils object
          created will be able to access only public files in Google Storage.

    Raises an exception if no file is found at boto_file_path, or if the file
    found there is malformed.
    """
    self._gs = gs_utils.GSUtils(boto_file_path=boto_file_path)

  def copy_file(self, source, dest):
    """Copy a single file from one place to another.

    Currently the destination must be a local file, and the source must be
    an HTTP(S) or Google Storage file.

    Args:
      source: The file to make a copy of.
          May be a URL, filepath, or BaseFileObject subclass instance.
      dest: Where to write the copy to.
          May be a URL, filepath, or BaseFileObject subclass instance.

    Raises:
      Exception: if the source or dest is of a type that is not supported.
    """
    source_object = self.create_file_object(source)
    dest_object = self.create_file_object(dest)

    if not isinstance(dest_object, self.LocalFile):
      raise Exception('unsupported dest_object type %s' % type(dest_object))

    if isinstance(source_object, self.HttpFile):
      # Stream the response straight to disk rather than buffering it all.
      with contextlib.closing(urllib.urlopen(source_object.url)) as fsrc:
        with open(dest_object.abspath, 'wb') as fdst:
          shutil.copyfileobj(fsrc=fsrc, fdst=fdst)
    elif isinstance(source_object, self.GsFile):
      # The GsFile object splits its own URL into bucket and path on demand.
      self._gs.download_file(
          source_bucket=source_object.bucket, source_path=source_object.path,
          dest_path=dest_object.abspath)
    else:
      raise Exception('unsupported source_object type %s' % type(source_object))

  @staticmethod
  def create_file_object(url_or_filepath):
    """Returns a subclass instance of BaseFileObject referring to a file or url.

    Args:
      url_or_filepath: URL referring to the file object, or a path to file on
          local disk.  If it is already a BaseFileObject instance, it is
          returned unchanged.

    The scheme prefix is matched case-insensitively; anything without a
    recognized scheme (http:, https:, gs:, file:) is treated as a local path.
    """
    if isinstance(url_or_filepath, AnyFileUtils.BaseFileObject):
      return url_or_filepath

    lowercase_url = url_or_filepath.lower()
    if lowercase_url.startswith(('http:', 'https:')):
      return AnyFileUtils.HttpFile(url=url_or_filepath)
    if lowercase_url.startswith('gs:'):
      return AnyFileUtils.GsFile(gs_url=url_or_filepath)
    if lowercase_url.startswith('file:'):
      return AnyFileUtils.LocalFile(url=url_or_filepath)
    return AnyFileUtils.LocalFile(path=url_or_filepath)
|  |