From 64a9bc691d37fd46a309dcac2e3397fc0158df93 Mon Sep 17 00:00:00 2001
From: "Antoine R. Dumont (@ardumont)" <antoine.romain.dumont@gmail.com>
Date: Thu, 13 Jun 2019 11:00:19 +0200
Subject: [PATCH] lister.core: Stop creating origins when scheduling tasks

Prior to this commit, lister did create origins as well in the archive. Now, we
only schedule new origins for ingestion.
---
 swh/lister/core/lister_base.py       | 53 ++++++++--------------------
 swh/lister/core/simple_lister.py     |  2 +-
 swh/lister/core/tests/test_lister.py | 12 +++----
 swh/lister/debian/lister.py          |  4 +--
 swh/lister/phabricator/lister.py     |  2 +-
 5 files changed, 24 insertions(+), 49 deletions(-)

diff --git a/swh/lister/core/lister_base.py b/swh/lister/core/lister_base.py
index 22a35f89..93b78b81 100644
--- a/swh/lister/core/lister_base.py
+++ b/swh/lister/core/lister_base.py
@@ -384,20 +384,6 @@ class SWHListerBase(abc.ABC, config.SWHConfig):
 
         return sql_repo
 
-    def origin_dict(self, origin_type, origin_url, **kwargs):
-        """Return special dict format for the origins list
-
-        Args:
-            origin_type (string)
-            origin_url (string)
-        Returns:
-            the same information in a different form
-        """
-        return {
-            'type': origin_type,
-            'url': origin_url,
-        }
-
     def task_dict(self, origin_type, origin_url, **kwargs):
         """Return special dict format for the tasks list
 
@@ -452,45 +438,34 @@ class SWHListerBase(abc.ABC, config.SWHConfig):
             injected_repos[m['uid']] = self.db_inject_repo(m)
         return injected_repos
 
-    def create_missing_origins_and_tasks(self, models_list, injected_repos):
-        """Find any newly created db entries that don't yet have tasks or
-            origin objects assigned.
+    def schedule_missing_tasks(self, models_list, injected_repos):
+        """Find any newly created db entries that do not have been scheduled
+           yet.
 
         Args:
-            models_list: a list of dicts mapping keys in the db model for
-                each repo
-            injected_repos: dict of uid:sql_repo pairs that have just
-                been created
+            models_list ([Model]): List of dicts mapping keys in the db model
+                                   for each repo
+            injected_repos ([dict]): Dict of uid:sql_repo pairs that have just
+                            been created
+
         Returns:
             Nothing. Modifies injected_repos.
+
         """
-        origins = {}
         tasks = {}
 
-        def _origin_key(m):
-            _type = m.get('origin_type', m.get('type'))
-            _url = m.get('origin_url', m.get('url'))
-            return '%s-%s' % (_type, _url)
-
         def _task_key(m):
-            return '%s-%s' % (m['type'],
-                              json.dumps(m['arguments'], sort_keys=True))
+            return '%s-%s' % (
+                m['type'],
+                json.dumps(m['arguments'], sort_keys=True)
+            )
 
         for m in models_list:
             ir = injected_repos[m['uid']]
-            if not ir.origin_id:
-                origin_dict = self.origin_dict(**m)
-                origins[_origin_key(m)] = (ir, m, origin_dict)
             if not ir.task_id:
                 task_dict = self.task_dict(**m)
                 tasks[_task_key(task_dict)] = (ir, m, task_dict)
 
-        new_origins = self.storage.origin_add(
-            (origin_dicts for (_, _, origin_dicts) in origins.values()))
-        for origin in new_origins:
-            ir, m, _ = origins[_origin_key(origin)]
-            ir.origin_id = origin['id']
-
         new_tasks = self.scheduler.create_tasks(
             (task_dicts for (_, _, task_dicts) in tasks.values()))
         for task in new_tasks:
@@ -519,7 +494,7 @@ class SWHListerBase(abc.ABC, config.SWHConfig):
         # inject into local db
         injected = self.inject_repo_data_into_db(models_list)
         # queue workers
-        self.create_missing_origins_and_tasks(models_list, injected)
+        self.schedule_missing_tasks(models_list, injected)
         return response, injected
 
     def save_response(self, response):
diff --git a/swh/lister/core/simple_lister.py b/swh/lister/core/simple_lister.py
index 11060eb5..40c47b29 100644
--- a/swh/lister/core/simple_lister.py
+++ b/swh/lister/core/simple_lister.py
@@ -51,7 +51,7 @@ class SimpleLister(SWHListerBase):
             # inject into local db
             injected = self.inject_repo_data_into_db(models)
             # queue workers
-            self.create_missing_origins_and_tasks(models, injected)
+            self.schedule_missing_tasks(models, injected)
             all_injected.append(injected)
             # flush
             self.db_session.commit()
diff --git a/swh/lister/core/tests/test_lister.py b/swh/lister/core/tests/test_lister.py
index 29dcd2a9..5b93b644 100644
--- a/swh/lister/core/tests/test_lister.py
+++ b/swh/lister/core/tests/test_lister.py
@@ -164,8 +164,8 @@ class HttpListerTesterBase(abc.ABC):
             if k not in ['last_seen', 'task_id', 'origin_id', 'id']:
                 self.assertIn(k, di)
 
-    def disable_storage_and_scheduler(self, fl):
-        fl.create_missing_origins_and_tasks = Mock(return_value=None)
+    def disable_scheduler(self, fl):
+        fl.schedule_missing_tasks = Mock(return_value=None)
 
     def disable_db(self, fl):
         fl.winnow_models = Mock(return_value=[])
@@ -176,7 +176,7 @@ class HttpListerTesterBase(abc.ABC):
         http_mocker.get(self.test_re, text=self.mock_response)
         fl = self.get_fl()
 
-        self.disable_storage_and_scheduler(fl)
+        self.disable_scheduler(fl)
         self.disable_db(fl)
 
         fl.run(min_bound=1, max_bound=1)  # stores no results
@@ -185,7 +185,7 @@ class HttpListerTesterBase(abc.ABC):
         http_mocker.get(self.test_re, text=self.mock_response)
         fl = self.get_fl()
 
-        self.disable_storage_and_scheduler(fl)
+        self.disable_scheduler(fl)
         self.disable_db(fl)
 
         fl.run(min_bound=self.first_index, max_bound=self.first_index)
@@ -194,7 +194,7 @@ class HttpListerTesterBase(abc.ABC):
         http_mocker.get(self.test_re, text=self.mock_response)
         fl = self.get_fl()
 
-        self.disable_storage_and_scheduler(fl)
+        self.disable_scheduler(fl)
         self.disable_db(fl)
 
         fl.run(min_bound=self.first_index)
@@ -222,7 +222,7 @@ class HttpListerTester(HttpListerTesterBase, abc.ABC):
             })
         self.init_db(db, fl.MODEL)
 
-        self.disable_storage_and_scheduler(fl)
+        self.disable_scheduler(fl)
 
         fl.run(min_bound=self.first_index)
 
diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py
index 5b72c6f2..a0dc11af 100644
--- a/swh/lister/debian/lister.py
+++ b/swh/lister/debian/lister.py
@@ -119,7 +119,7 @@ class DebianLister(SWHListerHttpTransport, SWHListerBase):
         """Generate the Package entries that didn't previously exist.
 
         Contrary to SWHListerBase, we don't actually insert the data in
-        database. `create_missing_origins_and_tasks` does it once we have the
+        database. `schedule_missing_tasks` does it once we have the
         origin and task identifiers.
         """
         by_name_version = {}
@@ -173,7 +173,7 @@ class DebianLister(SWHListerHttpTransport, SWHListerBase):
         self.db_session.add_all(added_packages)
         return added_packages
 
-    def create_missing_origins_and_tasks(self, models_list, added_packages):
+    def schedule_missing_tasks(self, models_list, added_packages):
         """We create tasks at the end of the full snapshot processing"""
         return
 
diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py
index d6e062e4..c02103d3 100644
--- a/swh/lister/phabricator/lister.py
+++ b/swh/lister/phabricator/lister.py
@@ -97,7 +97,7 @@ class PhabricatorLister(SWHIndexingHttpLister):
         self.max_index = models_list[0]['indexable']
         models_list = self.filter_before_inject(models_list)
         injected = self.inject_repo_data_into_db(models_list)
-        self.create_missing_origins_and_tasks(models_list, injected)
+        self.schedule_missing_tasks(models_list, injected)
         return self.max_index
 
     def run(self, min_bound=None, max_bound=None):
-- 
GitLab