From 8e8577e8d49194c79e730bd7978253f98e4e0176 Mon Sep 17 00:00:00 2001
From: Valentin Lorentz <vlorentz@softwareheritage.org>
Date: Wed, 8 Apr 2020 10:30:28 +0200
Subject: [PATCH] Prevent erroneous HashCollisions by using the same ctime for
 all rows.

'swh_content_add' tries to avoid this issue with a DISTINCT clause
on the entire row; but it is useless because 'ctime' cells differ by
a few microseconds.
This commit ensures all ctime values are exactly the same, so they
are filtered out.

An alternative would be to change 'swh_content_add' to do:

```
select distinct on (sha1, sha1_git, sha256, blake2s256, length, status) sha1, sha1_git, sha256, blake2s256, length, status, ctime from tmp_content
```

instead of:

```
select distinct sha1, sha1_git, sha256, blake2s256, length, status, ctime from tmp_content
```

but this is more verbose and there's no good reason to call 'now()' for
every row.
---
 swh/storage/storage.py            | 6 ++++--
 swh/storage/tests/test_storage.py | 6 ++++++
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/swh/storage/storage.py b/swh/storage/storage.py
index 7739431d0..772cd58ae 100644
--- a/swh/storage/storage.py
+++ b/swh/storage/storage.py
@@ -188,7 +188,8 @@ class Storage():
     @process_metrics
     def content_add(
             self, content: Iterable[Content]) -> Dict:
-        contents = [attr.evolve(c, ctime=now()) for c in content]
+        ctime = now()
+        contents = [attr.evolve(c, ctime=ctime) for c in content]
 
         objstorage_summary = self.objstorage.content_add(contents)
 
@@ -398,7 +399,8 @@ class Storage():
     @db_transaction()
     def skipped_content_add(self, content: Iterable[SkippedContent],
                             db=None, cur=None) -> Dict:
-        content = [attr.evolve(c, ctime=now()) for c in content]
+        ctime = now()
+        content = [attr.evolve(c, ctime=ctime) for c in content]
 
         missing_contents = self.skipped_content_missing(
             (c.to_dict() for c in content),
diff --git a/swh/storage/tests/test_storage.py b/swh/storage/tests/test_storage.py
index 50ae7b253..246825894 100644
--- a/swh/storage/tests/test_storage.py
+++ b/swh/storage/tests/test_storage.py
@@ -330,6 +330,12 @@ class TestStorage:
             Content.from_dict(cont1b).hashes()
         ]
 
+    def test_content_add_duplicate(self, swh_storage):
+        swh_storage.content_add([data.cont, data.cont])
+
+        assert list(swh_storage.content_get([data.cont['sha1']])) == \
+            [{'sha1': data.cont['sha1'], 'data': data.cont['data']}]
+
     def test_content_update(self, swh_storage):
         if hasattr(swh_storage, 'storage'):
             swh_storage.journal_writer.journal = None  # TODO, not supported
-- 
GitLab