diff --git a/sql/swh-archiver-func.sql b/sql/swh-archiver-func.sql
index 551026278c5be5e5a0117406d372d1b4bcd5204a..5113056375d19a5b1ba4f08e2d51ec57222a4a41 100644
--- a/sql/swh-archiver-func.sql
+++ b/sql/swh-archiver-func.sql
@@ -29,3 +29,20 @@ end
 $$;
 
 COMMENT ON FUNCTION swh_content_archive_missing(text) IS 'Filter missing data from a specific backend';
+
+create or replace function swh_content_archive_unknown()
+    returns setof sha1
+    language plpgsql
+as $$
+begin
+    return query
+        select content_id
+        from tmp_content_archive tmp where not exists (
+            select 1
+            from content_archive c
+            where tmp.content_id = c.content_id
+        );
+end
+$$;
+
+COMMENT ON FUNCTION swh_content_archive_unknown() IS 'Retrieve list of unknown sha1s';
diff --git a/sql/upgrades/005.sql b/sql/upgrades/005.sql
new file mode 100644
index 0000000000000000000000000000000000000000..bc50631c13de461564783e5bf3db9a61998332d6
--- /dev/null
+++ b/sql/upgrades/005.sql
@@ -0,0 +1,24 @@
+-- SWH DB schema upgrade
+-- from_version: 4
+-- to_version: 5
+-- description: List unknown sha1s from content_archive
+
+INSERT INTO dbversion(version, release, description)
+VALUES(5, now(), 'Work In Progress');
+
+create or replace function swh_content_archive_unknown()
+    returns setof sha1
+    language plpgsql
+as $$
+begin
+    return query
+        select content_id
+        from tmp_content_archive tmp where not exists (
+            select 1
+            from content_archive c
+            where tmp.content_id = c.content_id
+        );
+end
+$$;
+
+COMMENT ON FUNCTION swh_content_archive_unknown() IS 'Retrieve list of unknown sha1s';
diff --git a/swh/archiver/db.py b/swh/archiver/db.py
index a4611d96b6d20b9dea68f0502396bcfd0e972cd9..b1156aef054144864a3a7598603800f50008ab78 100644
--- a/swh/archiver/db.py
+++ b/swh/archiver/db.py
@@ -196,6 +196,21 @@ class ArchiverDb(BaseDb):
                     (backend_name,))
         yield from cursor_to_bytes(cur)
 
+    def content_archive_get_unknown(self, cur=None):
+        """Retrieve unknown sha1s from archiver db.
+
+        The sha1s to check are read from the tmp_content_archive
+        table, which the caller must have filled beforehand.
+
+        Yields:
+            rows whose first column is a sha1 with no entry in
+            content_archive
+
+        """
+        cur = self._cursor(cur)
+        cur.execute('select * from swh_content_archive_unknown()')
+        yield from cursor_to_bytes(cur)
+
     def content_archive_insert(self, content_id, source, status, cur=None):
         """Insert a new entry in the db for the content_id.
 
diff --git a/swh/archiver/storage.py b/swh/archiver/storage.py
index 1336c17afcd9cd71c06b7b4dc459d00801d8e84e..b207a704721f623bc858cb163ca8f835a18617de 100644
--- a/swh/archiver/storage.py
+++ b/swh/archiver/storage.py
@@ -98,14 +98,14 @@ class ArchiverStorage():
 
     @db_transaction_generator
     def content_archive_get_missing(self, content_ids, backend_name, cur=None):
-        """Retrieve the list of missing copies from source_name.
+        """Retrieve missing sha1s from backend_name.
 
         Args:
             content_ids ([sha1s]): list of sha1s to test
-            source_name (str): Name of the backend to check for content
+            backend_name (str): Name of the backend to check for content
 
         Yields:
-            List of ids effectively missing from backend_name
+            missing sha1s from backend_name
 
         """
         db = self.db
@@ -117,6 +117,26 @@ class ArchiverStorage():
         for content_id in db.content_archive_get_missing(backend_name, cur):
             yield content_id[0]
 
+    @db_transaction_generator
+    def content_archive_get_unknown(self, content_ids, cur=None):
+        """Retrieve unknown sha1s from content_archive.
+
+        Args:
+            content_ids ([sha1s]): list of sha1s to test
+
+        Yields:
+            Unknown sha1s from content_archive
+
+        """
+        db = self.db
+
+        db.mktemp_content_archive()
+
+        db.copy_to(content_ids, 'tmp_content_archive', ['content_id'], cur)
+
+        for content_id in db.content_archive_get_unknown(cur):
+            yield content_id[0]
+
     @db_transaction
     def content_archive_update(self, content_id, archive_id,
                                new_status=None, cur=None):