From ed908fef71edd7c10572cefd40d912acd297f8d0 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <antoine.romain.dumont@gmail.com>
Date: Fri, 23 Sep 2016 11:47:02 +0200
Subject: [PATCH] sql/archiver/schema: Filter unknown sha1s from content_archive endpoint

---
Review notes (free-text area of the patch, not part of the commit):
  * Line structure was rebuilt from a whitespace-collapsed copy. Hunk line
    counts match the hunk headers, but the indentation of context and
    removed lines is reconstructed and must be confirmed against the tree
    (or applied with whitespace tolerance).
  * content_archive_get_missing: the docstring now names the real
    parameter, backend_name, instead of source_name.
  * sql/upgrades/005.sql: function comment aligned with the one in
    sql/swh-archiver-func.sql ('sha1s').
  * The "index" blob ids of storage.py and 005.sql predate these two
    amendments and are therefore stale.

 sql/swh-archiver-func.sql | 17 +++++++++++++++++
 sql/upgrades/005.sql      | 24 ++++++++++++++++++++++++
 swh/archiver/db.py        |  8 ++++++++
 swh/archiver/storage.py   | 26 +++++++++++++++++++++++---
 4 files changed, 72 insertions(+), 3 deletions(-)
 create mode 100644 sql/upgrades/005.sql

diff --git a/sql/swh-archiver-func.sql b/sql/swh-archiver-func.sql
index 5510262..5113056 100644
--- a/sql/swh-archiver-func.sql
+++ b/sql/swh-archiver-func.sql
@@ -29,3 +29,20 @@ end
 $$;
 
 COMMENT ON FUNCTION swh_content_archive_missing(text) IS 'Filter missing data from a specific backend';
+
+create or replace function swh_content_archive_unknown()
+    returns setof sha1
+    language plpgsql
+as $$
+begin
+    return query
+        select content_id
+        from tmp_content_archive tmp where not exists (
+            select 1
+            from content_archive c
+            where tmp.content_id = c.content_id
+        );
+end
+$$;
+
+COMMENT ON FUNCTION swh_content_archive_unknown() IS 'Retrieve list of unknown sha1s';
diff --git a/sql/upgrades/005.sql b/sql/upgrades/005.sql
new file mode 100644
index 0000000..bc50631
--- /dev/null
+++ b/sql/upgrades/005.sql
@@ -0,0 +1,24 @@
+-- SWH DB schema upgrade
+-- from_version: 4
+-- to_version: 5
+-- description: List unknown sha1s from content_archive
+
+INSERT INTO dbversion(version, release, description)
+VALUES(5, now(), 'Work In Progress');
+
+create or replace function swh_content_archive_unknown()
+    returns setof sha1
+    language plpgsql
+as $$
+begin
+    return query
+        select content_id
+        from tmp_content_archive tmp where not exists (
+            select 1
+            from content_archive c
+            where tmp.content_id = c.content_id
+        );
+end
+$$;
+
+COMMENT ON FUNCTION swh_content_archive_unknown() IS 'Retrieve list of unknown sha1s';
diff --git a/swh/archiver/db.py b/swh/archiver/db.py
index a4611d9..b1156ae 100644
--- a/swh/archiver/db.py
+++ b/swh/archiver/db.py
@@ -196,6 +196,14 @@ class ArchiverDb(BaseDb):
                     (backend_name,))
         yield from cursor_to_bytes(cur)
 
+    def content_archive_get_unknown(self, cur=None):
+        """Retrieve unknown sha1 from archiver db.
+
+        """
+        cur = self._cursor(cur)
+        cur.execute('select * from swh_content_archive_unknown()')
+        yield from cursor_to_bytes(cur)
+
     def content_archive_insert(self, content_id, source, status, cur=None):
         """Insert a new entry in the db for the content_id.
 
diff --git a/swh/archiver/storage.py b/swh/archiver/storage.py
index 1336c17..b207a70 100644
--- a/swh/archiver/storage.py
+++ b/swh/archiver/storage.py
@@ -98,14 +98,14 @@ class ArchiverStorage():
 
     @db_transaction_generator
     def content_archive_get_missing(self, content_ids, backend_name, cur=None):
-        """Retrieve the list of missing copies from source_name.
+        """Retrieve missing sha1s from backend_name.
 
         Args:
             content_ids ([sha1s]): list of sha1s to test
-            source_name (str): Name of the backend to check for content
+            backend_name (str): Name of the backend to check for content
 
         Yields:
-            List of ids effectively missing from backend_name
+            missing sha1s from backend_name
 
         """
         db = self.db
@@ -117,6 +117,26 @@ class ArchiverStorage():
         for content_id in db.content_archive_get_missing(backend_name, cur):
             yield content_id[0]
 
+    @db_transaction_generator
+    def content_archive_get_unknown(self, content_ids, cur=None):
+        """Retrieve unknown sha1s from content_archive.
+
+        Args:
+            content_ids ([sha1s]): list of sha1s to test
+
+        Yields:
+            Unknown sha1s from content_archive
+
+        """
+        db = self.db
+
+        db.mktemp_content_archive()
+
+        db.copy_to(content_ids, 'tmp_content_archive', ['content_id'], cur)
+
+        for content_id in db.content_archive_get_unknown(cur):
+            yield content_id[0]
+
     @db_transaction
     def content_archive_update(self, content_id, archive_id,
                                new_status=None, cur=None):
-- 
GitLab