From 8e8577e8d49194c79e730bd7978253f98e4e0176 Mon Sep 17 00:00:00 2001 From: Valentin Lorentz <vlorentz@softwareheritage.org> Date: Wed, 8 Apr 2020 10:30:28 +0200 Subject: [PATCH] Prevent erroneous HashCollisions by using the same ctime for all rows. 'swh_content_add' tries to avoid this issue with a DISTINCT clause on the entire row; but it is useless because 'ctime' cells differ by a few microseconds. This commit ensures all ctime values are exactly the same, so they are filtered out. An alternative would be to change 'swh_content_add' to do: ``` select distinct on (sha1, sha1_git, sha256, blake2s256, length, status) sha1, sha1_git, sha256, blake2s256, length, status, ctime from tmp_content ``` instead of: ``` select distinct sha1, sha1_git, sha256, blake2s256, length, status, ctime from tmp_content ``` but this is more verbose and there's no good reason to call 'now()' for every row. --- swh/storage/storage.py | 6 ++++-- swh/storage/tests/test_storage.py | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/swh/storage/storage.py b/swh/storage/storage.py index 7739431d0..772cd58ae 100644 --- a/swh/storage/storage.py +++ b/swh/storage/storage.py @@ -188,7 +188,8 @@ class Storage(): @process_metrics def content_add( self, content: Iterable[Content]) -> Dict: - contents = [attr.evolve(c, ctime=now()) for c in content] + ctime = now() + contents = [attr.evolve(c, ctime=ctime) for c in content] objstorage_summary = self.objstorage.content_add(contents) @@ -398,7 +399,8 @@ class Storage(): @db_transaction() def skipped_content_add(self, content: Iterable[SkippedContent], db=None, cur=None) -> Dict: - content = [attr.evolve(c, ctime=now()) for c in content] + ctime = now() + content = [attr.evolve(c, ctime=ctime) for c in content] missing_contents = self.skipped_content_missing( (c.to_dict() for c in content), diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py index 50ae7b253..246825894 100644 --- a/swh/storage/tests/test_storage.py +++ b/swh/storage/tests/test_storage.py @@ -330,6 +330,12 @@ class TestStorage: Content.from_dict(cont1b).hashes() ] + def test_content_add_duplicate(self, swh_storage): + swh_storage.content_add([data.cont, data.cont]) + + assert list(swh_storage.content_get([data.cont['sha1']])) == \ + [{'sha1': data.cont['sha1'], 'data': data.cont['data']}] + def test_content_update(self, swh_storage): if hasattr(swh_storage, 'storage'): swh_storage.journal_writer.journal = None # TODO, not supported -- GitLab