Skip to content
Snippets Groups Projects
Verified Commit 639262db authored by Antoine R. Dumont
Browse files

Add simple cache with size

parent cdd9199d
Branches with-simple-cache
No related tags found
No related merge requests found
# Copyright (C) 2015-2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import deque
class SimpleCache():
    """Bounded cache of already-seen elements with batch FIFO eviction.

    Elements are kept both in a set (for membership tests) and in a deque
    (to remember insertion order). As soon as the number of cached
    elements reaches ``max_size``, the oldest ``nb_elements_to_purge``
    elements are dropped in one go.

    """
    def __init__(self, max_size=10000, eviction_percent=0.2):
        """Initialize cache of max_size elements.

        Args:
            - max_size: the max number of elements to cache.
            - eviction_percent: Fraction (between 0 and 1) of max_size
            to evict from cache when max_size is reached. The eviction
            removes the first (oldest) elements from the cache. At
            least one element is always evicted so that the cache
            stays bounded.

        Raises:
            AssertionError: if eviction_percent is not within [0, 1].

        """
        # Explicit raise instead of an `assert` statement: asserts are
        # stripped when Python runs with -O, which would silently accept
        # an invalid percentage. The exception type is kept as
        # AssertionError for callers already catching it.
        if not 0 <= eviction_percent <= 1:
            raise AssertionError(
                'eviction_percent must be between 0 and 1, got %r'
                % (eviction_percent,))

        self.max_size = max_size
        # int() truncates, so small max_size/eviction_percent values could
        # give 0 here; evicting nothing would let the set grow forever.
        self.nb_elements_to_purge = max(1, int(max_size * eviction_percent))
        self.s = set()
        self.stack = deque([], maxlen=max_size)
        self.count = 0

    def __str__(self):
        return ('set: %s, stack: %s, count: %s, max-size: %s, nb-purge: %s' % (
            self.s,
            self.stack,
            self.count,
            self.max_size,
            self.nb_elements_to_purge))

    def __contains__(self, e):
        """Support `e in cache`, like the underlying set does."""
        return e in self.s

    def _evict(self):
        """Remove the self.nb_elements_to_purge oldest elements from cache.

        """
        # Never pop more than the deque holds (popleft on an empty deque
        # raises IndexError).
        nb_to_purge = min(self.nb_elements_to_purge, len(self.stack))
        for _ in range(nb_to_purge):
            self.s.discard(self.stack.popleft())

        # With max_size == 0 the deque retains nothing, so there is no
        # order left to evict by: drop whatever the set still holds.
        if not self.stack:
            self.s.clear()

        self.count = len(self.s)

    def add(self, e):
        """Cache element e; elements already cached are ignored.

        Triggers an eviction once the number of cached elements reaches
        max_size.

        """
        if e in self.s:
            return

        self.s.add(e)
        self.stack.append(e)
        self.count += 1
        if self.count >= self.max_size:
            self._evict()

    def set(self):
        """Return the (live) set of elements currently cached."""
        return self.s
......@@ -17,6 +17,7 @@ from . import converters
from swh.model.git import GitType
from swh.storage import get_storage
from .cache import SimpleCache
from .queue import QueuePerSizeAndNbUniqueElements
from .queue import QueuePerNbUniqueElements
from .queue import QueuePerNbElements
......@@ -65,30 +66,32 @@ class SWHLoader(config.SWHConfig):
self.log = logging.getLogger(logging_class)
self.max_content_size = config['content_packet_size_bytes']
self.contents = QueuePerSizeAndNbUniqueElements(
key='sha1',
max_nb_elements=self.config['content_packet_size'],
max_size=self.config['content_packet_block_size_bytes'])
self.contents_seen = set()
self.contents_seen = SimpleCache()
self.directories = QueuePerNbUniqueElements(
key='id',
max_nb_elements=self.config['directory_packet_size'])
self.directories_seen = set()
self.directories_seen = SimpleCache()
self.revisions = QueuePerNbUniqueElements(
key='id',
max_nb_elements=self.config['revision_packet_size'])
self.revisions_seen = set()
self.revisions_seen = SimpleCache()
self.releases = QueuePerNbUniqueElements(
key='id',
max_nb_elements=self.config['release_packet_size'])
self.releases_seen = set()
self.releases_seen = SimpleCache()
self.occurrences = QueuePerNbElements(
self.config['occurrence_packet_size'])
......@@ -210,20 +213,22 @@ class SWHLoader(config.SWHConfig):
"""Filter missing blob from swh.
"""
max_content_size = self.config['content_packet_size_bytes']
blobs_per_sha1 = {}
shallow_blobs = []
for key, blob in ((b['sha1'], b) for b in blobs
if b['sha1'] not in self.contents_seen):
for blob in blobs:
key = blob['sha1']
if key in self.contents_seen.set():
continue
blobs_per_sha1[key] = blob
shallow_blobs.append(converters.shallow_blob(blob))
self.contents_seen.add(key)
for sha1 in self.storage.content_missing(shallow_blobs,
key_hash='sha1'):
yield converters.blob_to_content(blobs_per_sha1[sha1],
max_content_size=max_content_size,
origin_id=self.origin_id)
yield converters.blob_to_content(
blobs_per_sha1[sha1],
max_content_size=self.max_content_size,
origin_id=self.origin_id)
def bulk_send_blobs(self, blobs):
"""Format blobs as swh contents and send them to the database"""
......@@ -238,8 +243,11 @@ class SWHLoader(config.SWHConfig):
"""
trees_per_sha1 = {}
shallow_trees = []
for key, tree in ((t['sha1_git'], t) for t in trees
if t['sha1_git'] not in self.directories_seen):
for tree in trees:
key = tree['sha1_git']
if key in self.directories_seen.set():
continue
trees_per_sha1[key] = tree
shallow_trees.append(converters.shallow_tree(tree))
self.directories_seen.add(key)
......@@ -261,8 +269,11 @@ class SWHLoader(config.SWHConfig):
"""
commits_per_sha1 = {}
shallow_commits = []
for key, commit in ((c['id'], c) for c in commits
if c['id'] not in self.revisions_seen):
for commit in commits:
key = commit['id']
if key in self.revisions_seen.set():
continue
commits_per_sha1[key] = commit
shallow_commits.append(converters.shallow_commit(commit))
self.revisions_seen.add(key)
......@@ -287,8 +298,11 @@ class SWHLoader(config.SWHConfig):
"""
tags_per_sha1 = {}
shallow_tags = []
for key, tag in ((t['id'], t) for t in tags
if t['id'] not in self.releases_seen):
for tag in tags:
key = tag['id']
if key in self.releases_seen.set():
continue
tags_per_sha1[key] = tag
shallow_tags.append(converters.shallow_tag(tag))
self.releases_seen.add(key)
......
# Copyright (C) 2015-2016 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import unittest
from nose.tools import istest
from swh.loader.vcs.cache import SimpleCache
class TestSimpleCache(unittest.TestCase):
    """Unit tests for SimpleCache construction and eviction behavior."""

    @istest
    def simple_cache_behavior_fails_to_init(self):
        # An eviction_percent outside [0, 1] must be rejected.
        # assertRaises makes the test fail when no exception is raised;
        # a bare try/except would pass silently in that case.
        with self.assertRaises(AssertionError):
            SimpleCache(max_size=6, eviction_percent=10)

    @istest
    def simple_cache_behavior(self):
        # given
        cache = SimpleCache(max_size=6, eviction_percent=0.5)

        cache.add(3)
        cache.add(2)
        cache.add(1)
        cache.add(1)  # duplicate elements are dismissed

        # when/then
        self.assertEqual(cache.set(), {1, 2, 3})
        self.assertEqual(cache.count, 3)

        cache.add(4)
        cache.add(5)

        self.assertEqual(cache.set(), {1, 2, 3, 4, 5})
        self.assertEqual(cache.count, 5)

        cache.add(6)  # we hit max-size, 50% of elements (here 3) are evicted

        # the oldest elements (3, 2, 1) are the ones evicted
        self.assertEqual(cache.set(), {4, 5, 6})
        self.assertEqual(cache.count, 3)

        cache.add(7)
        cache.add(8)

        self.assertEqual(cache.set(), {4, 5, 6, 7, 8})
        self.assertEqual(cache.count, 5)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment