From ed908fef71edd7c10572cefd40d912acd297f8d0 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <antoine.romain.dumont@gmail.com>
Date: Fri, 23 Sep 2016 11:47:02 +0200
Subject: [PATCH] sql/archiver/schema: Filter unknown sha1s from
 content_archive endpoint

---
 sql/swh-archiver-func.sql | 17 +++++++++++++++++
 sql/upgrades/005.sql      | 24 ++++++++++++++++++++++++
 swh/archiver/db.py        |  8 ++++++++
 swh/archiver/storage.py   | 24 ++++++++++++++++++++++--
 4 files changed, 71 insertions(+), 2 deletions(-)
 create mode 100644 sql/upgrades/005.sql

diff --git a/sql/swh-archiver-func.sql b/sql/swh-archiver-func.sql
index 5510262..5113056 100644
--- a/sql/swh-archiver-func.sql
+++ b/sql/swh-archiver-func.sql
@@ -29,3 +29,20 @@ end
 $$;
 
 COMMENT ON FUNCTION swh_content_archive_missing(text) IS 'Filter missing data from a specific backend';
+
+create or replace function swh_content_archive_unknown()  -- expects the caller to have created and filled tmp_content_archive
+    returns setof sha1
+    language plpgsql
+as $$
+begin
+    return query
+        select content_id  -- candidate ids come from the temporary table
+        from tmp_content_archive tmp where not exists (  -- keep only ids with no matching content_archive row
+            select 1
+            from content_archive c
+            where tmp.content_id = c.content_id
+        );
+end
+$$;
+
+COMMENT ON FUNCTION swh_content_archive_unknown() IS 'Retrieve list of unknown sha1s';
diff --git a/sql/upgrades/005.sql b/sql/upgrades/005.sql
new file mode 100644
index 0000000..bc50631
--- /dev/null
+++ b/sql/upgrades/005.sql
@@ -0,0 +1,24 @@
+-- SWH DB schema upgrade
+-- from_version: 4
+-- to_version: 5
+-- description: List unknown sha1s from content_archive
+
+INSERT INTO dbversion(version, release, description)
+VALUES(5, now(), 'Work In Progress');  -- record the upgrade to schema version 5
+
+create or replace function swh_content_archive_unknown()  -- expects the caller to have created and filled tmp_content_archive
+    returns setof sha1
+    language plpgsql
+as $$
+begin
+    return query
+        select content_id  -- candidate ids come from the temporary table
+        from tmp_content_archive tmp where not exists (  -- keep only ids with no matching content_archive row
+            select 1
+            from content_archive c
+            where tmp.content_id = c.content_id
+        );
+end
+$$;
+
+COMMENT ON FUNCTION swh_content_archive_unknown() IS 'Retrieve list of unknown sha1s';
diff --git a/swh/archiver/db.py b/swh/archiver/db.py
index a4611d9..b1156ae 100644
--- a/swh/archiver/db.py
+++ b/swh/archiver/db.py
@@ -196,6 +196,14 @@ class ArchiverDb(BaseDb):
                     (backend_name,))
         yield from cursor_to_bytes(cur)
 
+    def content_archive_get_unknown(self, cur=None):
+        """Yield rows of tmp_content_archive sha1s unknown to content_archive.
+
+        """
+        cur = self._cursor(cur)
+        cur.execute('select * from swh_content_archive_unknown()')
+        yield from cursor_to_bytes(cur)  # one row per unknown sha1
+
     def content_archive_insert(self, content_id, source, status, cur=None):
         """Insert a new entry in the db for the content_id.
 
diff --git a/swh/archiver/storage.py b/swh/archiver/storage.py
index 1336c17..b207a70 100644
--- a/swh/archiver/storage.py
+++ b/swh/archiver/storage.py
@@ -98,14 +98,14 @@ class ArchiverStorage():
 
     @db_transaction_generator
     def content_archive_get_missing(self, content_ids, backend_name, cur=None):
-        """Retrieve the list of missing copies from source_name.
+        """Retrieve missing sha1s from backend_name.
 
         Args:
             content_ids ([sha1s]): list of sha1s to test
             source_name (str): Name of the backend to check for content
 
         Yields:
-            List of ids effectively missing from backend_name
+            missing sha1s from backend_name
 
         """
         db = self.db
@@ -117,6 +117,26 @@ class ArchiverStorage():
         for content_id in db.content_archive_get_missing(backend_name, cur):
             yield content_id[0]
 
+    @db_transaction_generator
+    def content_archive_get_unknown(self, content_ids, cur=None):
+        """Yield the given sha1s that are unknown to content_archive.
+
+        Args:
+            content_ids ([sha1s]): list of sha1s to test
+
+        Yields:
+            sha1s from content_ids with no entry in content_archive
+
+        """
+        db = self.db
+
+        db.mktemp_content_archive()  # NOTE(review): cur not passed; confirm
+
+        db.copy_to(content_ids, 'tmp_content_archive', ['content_id'], cur)
+
+        for content_id in db.content_archive_get_unknown(cur):
+            yield content_id[0]  # first (only) column is the sha1
+
     @db_transaction
     def content_archive_update(self, content_id, archive_id,
                                new_status=None, cur=None):
-- 
GitLab