Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(34)

Unified Diff: client/isolateserver.py

Issue 2414543003: isolateserver: DiskCache format v2 (Closed)
Patch Set: typo and nit Created 4 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | client/tests/isolateserver_test.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: client/isolateserver.py
diff --git a/client/isolateserver.py b/client/isolateserver.py
index 8a8bed6763c2ba98dbe89e007a77131fd99fedd8..b41ea3f2fa1c9457b2fa555d67b69595290b7c9a 100755
--- a/client/isolateserver.py
+++ b/client/isolateserver.py
@@ -5,11 +5,13 @@
"""Archives a set of files or directories to an Isolate Server."""
-__version__ = '0.6.0'
+__version__ = '0.7.0'
import base64
+import collections
import errno
import functools
+import json
import io
import logging
import optparse
@@ -32,7 +34,6 @@ from libs import arfile
from utils import file_path
from utils import fs
from utils import logging_utils
-from utils import lru
from utils import net
from utils import on_error
from utils import subprocess42
@@ -1481,28 +1482,35 @@ class CachePolicies(object):
class DiskCache(LocalCache):
- """Stateful LRU cache in a flat hash table in a directory.
+ """Stateful LRU cache in a semi-flat hash table in a directory.
Saves its state as json file.
"""
STATE_FILE = u'state.json'
- def __init__(self, cache_dir, policies, hash_algo):
+ # All protected methods (starting with '_') except _path should be called
+ # with self._lock held.
+
+ def __init__(self, cache_dir, policies, hash_algo, time_fn=None):
"""
Arguments:
cache_dir: directory where to place the cache.
policies: cache retention policies.
algo: hashing algorithm used.
+ time_fn: function to take current timestamp when adding new items.
+ Defaults to time.time.
"""
- # All protected methods (starting with '_') except _path should be called
- # with self._lock held.
super(DiskCache, self).__init__()
+ self.time_fn = time_fn or time.time
self.cache_dir = cache_dir
self.policies = policies
self.hash_algo = hash_algo
self.state_file = os.path.join(cache_dir, self.STATE_FILE)
- # Items in a LRU lookup dict(digest: size).
- self._lru = lru.LRUDict()
+ # Items in a LRU lookup dict(digest: [size, timestamp]).
+ # We use lists instead of tuples because JSON arrays are parsed to lists.
+ # Mutations must be followed by `self._dirty = True`.
+ self._lru = collections.OrderedDict()
+ self._dirty = False # if True, self._lru was modified since loading/saving.
# Current cached free disk space. It is updated by self._trim().
self._free_disk = 0
# The first item in the LRU cache that must not be evicted during this run
@@ -1512,9 +1520,11 @@ class DiskCache(LocalCache):
self._protected = None
# Cleanup operations done by self._load(), if any.
self._operations = []
+
+ self._free_disk = file_path.get_free_space(self.cache_dir)
+
with tools.Profiler('Setup'):
with self._lock:
- # self._load() calls self._trim() which initializes self._free_disk.
self._load()
def __contains__(self, digest):
@@ -1535,7 +1545,7 @@ class DiskCache(LocalCache):
logging.info(
'%5d (%8dkb) current',
len(self._lru),
- sum(self._lru.itervalues()) / 1024)
+ self._cache_disk_size() / 1024)
logging.info(
'%5d (%8dkb) evicted',
len(self._evicted), sum(self._evicted) / 1024)
@@ -1544,9 +1554,15 @@ class DiskCache(LocalCache):
self._free_disk / 1024)
return False
+ def _sizes(self):
+ """Returns an iterator of pairs (digest, size)."""
+ return (
+ (digest, size)
+ for digest, (size, _) in self._lru.iteritems())
+
def cached_set(self):
with self._lock:
- return self._lru.keys_set()
+ return set(self._lru)
def cleanup(self):
"""Cleans up the cache directory.
@@ -1557,36 +1573,44 @@ class DiskCache(LocalCache):
At that point, the cache was already loaded, trimmed to respect cache
policies.
"""
- fs.chmod(self.cache_dir, 0700)
- # Ensure that all files listed in the state still exist and add new ones.
- previous = self._lru.keys_set()
- # It'd be faster if there were a readdir() function.
- for filename in fs.listdir(self.cache_dir):
- if filename == self.STATE_FILE:
- fs.chmod(os.path.join(self.cache_dir, filename), 0600)
- continue
- if filename in previous:
- fs.chmod(os.path.join(self.cache_dir, filename), 0400)
- previous.remove(filename)
- continue
-
- # An untracked file. Delete it.
+ def try_remove(filename):
+ assert os.path.isabs(filename)
logging.warning('Removing unknown file %s from cache', filename)
- p = self._path(filename)
- if fs.isdir(p):
+ if fs.isdir(filename):
try:
- file_path.rmtree(p)
+ file_path.rmtree(filename)
except OSError:
pass
else:
- file_path.try_remove(p)
- continue
+ file_path.try_remove(filename)
+
+ fs.chmod(self.cache_dir, 0700)
+ # Ensure that all files listed in the state still exist and add new ones.
+ previous = set(self._lru)
M-A Ruel 2016/10/13 20:51:12 self.cached_set() ? just for the form. :)
+ # It'd be faster if there were a readdir() function.
+ for filename_l1 in fs.listdir(self.cache_dir):
+ full_name_l1 = os.path.join(self.cache_dir, filename_l1)
+ if filename_l1 == self.STATE_FILE:
+ fs.chmod(full_name_l1, 0600)
+ continue
+ if len(filename_l1) != 2:
+ try_remove(full_name_l1)
+ continue
+ for filename_l2 in fs.listdir(full_name_l1):
+ digest = filename_l1 + filename_l2
+ full_name_l2 = os.path.join(full_name_l1, filename_l2)
+ if digest in previous:
+ fs.chmod(full_name_l2, 0400)
+ previous.remove(digest)
+ else:
+ try_remove(full_name_l2)
if previous:
# Filter out entries that were not found.
logging.warning('Removed %d lost files', len(previous))
- for filename in previous:
- self._lru.pop(filename)
+ for digest in previous:
+ self._lru.pop(digest)
+ self._dirty = True
# What remains to be done is to hash every single item to
# detect corruption, then save to ensure state.json is up to date.
@@ -1616,9 +1640,10 @@ class DiskCache(LocalCache):
# Update it's LRU position.
with self._lock:
- if digest not in self._lru:
+ if self._lru.pop(digest, None) is None:
return False
- self._lru.touch(digest)
+ self._lru[digest] = [size, self.time_fn()]
+ self._dirty = True
self._protected = self._protected or digest
return True
@@ -1626,14 +1651,15 @@ class DiskCache(LocalCache):
with self._lock:
# Do not check for 'digest == self._protected' since it could be because
# the object is corrupted.
- self._lru.pop(digest)
+ self._lru.pop(digest, None)
+ self._dirty = True
self._delete_file(digest, UNKNOWN_FILE_SIZE)
def getfileobj(self, digest):
try:
f = fs.open(self._path(digest), 'rb')
with self._lock:
- self._used.append(self._lru[digest])
+ self._used.append(self._lru[digest][0])
return f
except IOError:
raise CacheMiss(digest)
@@ -1665,10 +1691,46 @@ class DiskCache(LocalCache):
self._add(digest, size)
return digest
+ def _cache_disk_size(self):
M-A Ruel 2016/10/13 20:51:12 can you move it aside cache_set? I feel it'd make [… comment truncated in page extraction]
+ """Returns number of bytes that cache files take."""
+ return sum(size for (size, _) in self._lru.itervalues())
+
+ @staticmethod
+ def _validate_state(state):
+ """Raises ValueError if the given serialized state is invalid."""
+ if not isinstance(state, dict):
+ raise ValueError('not a JSON object')
+
+ version = state.get('version')
+ if not isinstance(version, int):
+ raise ValueError('version %r is not an integer' % (version,))
+
+ entries = state.get('entries')
+ if not isinstance(entries, list):
+ raise ValueError('entries is not an array')
+ for entry in entries:
+ try:
+ if not isinstance(entry, list) or len(entry) != 2:
+ raise ValueError('not a pair')
+ if not isinstance(entry[0], basestring):
+ raise ValueError('first item is not a string')
+
+ pair_of_numbers = (
+ isinstance(entry[1], list) and
+ len(entry[1]) == 2 and
+ all(isinstance(x, (int, float)) for x in entry[1])
+ )
+ if not pair_of_numbers:
+ raise ValueError('second item is not a pair of numbers')
+ except ValueError as ex:
+ raise ValueError('invalid entry %r: %s' % (entry, ex))
+
def _load(self):
- """Loads state of the cache from json file.
+ """Loads state of the cache from json file if it exists.
If cache_dir does not exist on disk, it is created.
+
+ Raises ValueError if the state file is broken.
"""
self._lock.assert_locked()
@@ -1678,32 +1740,54 @@ class DiskCache(LocalCache):
else:
# Load state of the cache.
try:
- self._lru = lru.LRUDict.load(self.state_file)
- except ValueError as err:
- logging.error('Failed to load cache state: %s' % (err,))
- # Don't want to keep broken state file.
- file_path.try_remove(self.state_file)
+ with open(self.state_file, 'r') as f:
M-A Ruel 2016/10/13 20:51:12 'rb'
+ state = json.load(f)
+ if not isinstance(state, dict):
+ # Possibly old format.
M-A Ruel 2016/10/13 20:51:12 # Possibly old format. Hard reset with a clear cache [… comment truncated in page extraction]
+ file_path.rmtree(self.cache_dir)
+ fs.makedirs(self.cache_dir)
+ self._lru = collections.OrderedDict()
+ else:
+ self._validate_state(state)
+ if state['version'] != 2:
+ raise ValueError(
+ 'unsupported version %s, supported is 2' % (state['version']))
+ self._lru = collections.OrderedDict(state['entries'])
+ self._dirty = False
+ except (IOError, ValueError) as e:
+ raise ValueError('Cannot load state file %s: %s' % (self.state_file, e))
+
self._trim()
# We want the initial cache size after trimming, i.e. what is readily
# avaiable.
self._initial_number_items = len(self._lru)
- self._initial_size = sum(self._lru.itervalues())
+ self._initial_size = self._cache_disk_size()
if self._evicted:
logging.info(
'Trimming evicted items with the following sizes: %s',
sorted(self._evicted))
def _save(self):
- """Saves the LRU ordering."""
+ """Saves the LRU ordering if changed."""
self._lock.assert_locked()
+ if not self._dirty:
+ return
+
if sys.platform != 'win32':
d = os.path.dirname(self.state_file)
if fs.isdir(d):
# Necessary otherwise the file can't be created.
file_path.set_read_only(d, False)
+
if fs.isfile(self.state_file):
file_path.set_read_only(self.state_file, False)
- self._lru.save(self.state_file)
+ with open(self.state_file, 'w') as f:
M-A Ruel 2016/10/13 20:51:12 'wb'
+ contents = {
+ 'version': 2,
+ 'entries': self._lru.items(),
+ }
+ json.dump(contents, f)
M-A Ruel 2016/10/13 20:51:12 separators=(',',':') will save a few valuable bytes.
+ self._dirty = False
def _trim(self):
"""Trims anything we don't know, make sure enough free space exists."""
@@ -1711,7 +1795,7 @@ class DiskCache(LocalCache):
# Ensure maximum cache size.
if self.policies.max_cache_size:
- total_size = sum(self._lru.itervalues())
+ total_size = self._cache_disk_size()
while total_size > self.policies.max_cache_size:
total_size -= self._remove_lru_file(True)
@@ -1731,7 +1815,7 @@ class DiskCache(LocalCache):
self._remove_lru_file(True)
if trimmed_due_to_space:
- total_usage = sum(self._lru.itervalues())
+ total_usage = self._cache_disk_size()
usage_percent = 0.
if total_usage:
usage_percent = 100. * float(total_usage) / self.policies.max_cache_size
@@ -1748,29 +1832,41 @@ class DiskCache(LocalCache):
def _path(self, digest):
"""Returns the path to one item."""
- return os.path.join(self.cache_dir, digest)
+ assert len(digest) > 2
+ return os.path.join(self.cache_dir, digest[:2], digest[2:])
def _remove_lru_file(self, allow_protected):
"""Removes the lastest recently used file and returns its size."""
M-A Ruel 2016/10/13 20:51:12 Probably a typo of mine; latest?
self._lock.assert_locked()
- try:
- digest, size = self._lru.get_oldest()
- if not allow_protected and digest == self._protected:
- raise Error('Not enough space to map the whole isolated tree')
- except KeyError:
+
+ digest = None
+ for key in self._lru:
+ digest = key
+ break
+ if digest is None:
raise Error('Nothing to remove')
- digest, size = self._lru.pop_oldest()
+
+ if not allow_protected and digest == self._protected:
+ raise Error('Not enough space to map the whole isolated tree')
+
+ digest, (size, _) = self._lru.popitem(last=False)
+ self._dirty = True
logging.debug("Removing LRU file %s", digest)
self._delete_file(digest, size)
return size
def _add(self, digest, size=UNKNOWN_FILE_SIZE):
- """Adds an item into LRU cache marking it as a newest one."""
+ """Adds an item into LRU cache marking it as a newest one.
+
+ Asumes the file exists.
M-A Ruel 2016/10/13 20:51:12 Assumes
+ """
self._lock.assert_locked()
if size == UNKNOWN_FILE_SIZE:
size = fs.stat(self._path(digest)).st_size
self._added.append(size)
- self._lru.add(digest, size)
+ self._lru.pop(digest, None)
+ self._lru[digest] = [size, self.time_fn()]
M-A Ruel 2016/10/13 20:51:12 self._lru[digest] = (size, self.time_fn()) it's s[… comment truncated in page extraction]
+ self._dirty = True
self._free_disk -= size
# Do a quicker version of self._trim(). It only enforces free disk space,
# not cache size limits. It doesn't actually look at real free disk space,
@@ -1790,9 +1886,14 @@ class DiskCache(LocalCache):
try:
if size == UNKNOWN_FILE_SIZE:
size = fs.stat(self._path(digest)).st_size
- file_path.try_remove(self._path(digest))
+ path = self._path(digest)
+ file_path.try_remove(path)
self._evicted.append(size)
self._free_disk += size
+
+ parent = os.path.dirname(path)
+ if len(os.listdir(parent)) == 0:
M-A Ruel 2016/10/13 20:51:12 I don't think it should be done. It's better to leave [… comment truncated in page extraction]
+ fs.rmtree(parent)
M-A Ruel 2016/10/13 20:51:12 I don't see a corresponding call to mkdir() to create [… comment truncated in page extraction]
except OSError as e:
logging.error('Error attempting to delete a file %s:\n%s' % (digest, e))
« no previous file with comments | « no previous file | client/tests/isolateserver_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698