Skip to content
Snippets Groups Projects
Verified Commit d30d574d authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

cran.lister: Refactor and fix cran lister

Before this commit, the lister's code duplicated an older version of the
logic, and that older version did not work.

Related D1492#41287
parent 85d00106
No related branches found
No related tags found
1 merge request: !372 cran.lister: Refactor and fix cran lister
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import subprocess
import json
import logging
import pkg_resources
import subprocess
from collections import defaultdict
from typing import List, Dict
from swh.lister.cran.models import CRANModel
from swh.scheduler.utils import create_task_dict
from swh.core import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.scheduler.utils import create_task_dict
logger = logging.getLogger(__name__)
class CRANLister(SimpleLister):
......@@ -32,15 +37,17 @@ class CRANLister(SimpleLister):
kwargs.get('name'), origin_url, kwargs.get('version'),
project_metadata=self.descriptions[kwargs.get('name')])
def safely_issue_request(self, identifier: str) -> List[Dict]:
    """Run the bundled R script and return its decoded JSON output.

    The script ``list_all_packages.R`` shipped in the ``swh.lister.cran``
    package is executed as a subprocess; whatever it prints on stdout is
    parsed as JSON and returned.

    Args:
        identifier: Not used by this implementation.

    Returns:
        List of Dict about r packages.

        Sample:

            [
                {
                    'Package': 'A3',
                    'Version': '1.0.0',
                    'Title': 'Accurate, Adaptable, and Accessible Error '
                             'Metrics for ...',
                    'Description': 'Supplies tools for tabulating and '
                                   'analyzing the results of predictive '
                                   'models. The methods employed are ... '
                },
                {
                    'Package': 'abbyyR',
                    'Version': '0.5.4',
                    'Title': 'Access to Abbyy Optical Character '
                             'Recognition (OCR) API',
                    'Description': 'Get text from images of text using '
                                   'Abbyy Cloud Optical Character ...'
                },
                ...
            ]

    """
    filepath = pkg_resources.resource_filename('swh.lister.cran',
                                               'list_all_packages.R')
    logger.debug('script list-all-packages.R path: %s', filepath)
    # The script is executed directly (no shell); only stdout is captured.
    response = subprocess.run(
        filepath, stdout=subprocess.PIPE, shell=False)
    data = json.loads(response.stdout)
    logger.debug('r-script-request: %s', data)
    return data

def r_script_request(self):
    """Previous name of :meth:`safely_issue_request`.

    Kept so that callers still using the old name keep working; it simply
    delegates. The identifier argument is unused by the implementation,
    hence ``None``.

    Returns:
        List of Dict about r packages.

    """
    return self.safely_issue_request(None)
def get_model_from_repo(self, repo):
"""Transform from repository representation to model
......@@ -87,36 +99,3 @@ class CRANLister(SimpleLister):
"""
return [self.get_model_from_repo(repo) for repo in response]
def ingest_data(self, identifier, checks=False):
    """Rework the base ingest_data.

    Request server endpoint which gives all in one go.

    Simplify and filter response list of repositories. Inject
    repo information into local db. Queue loader tasks for
    linked repositories.

    Args:
        identifier: Resource identifier (unused)
        checks (bool): Additional checks required (unused)

    Returns:
        A pair ``(response, all_injected)``: the raw response of the R
        script and the list of per-batch injection results. When the
        response is empty, it is returned together with an empty list.

    """
    response = self.r_script_request()
    if not response:
        return response, []

    models_list = self.transport_response_simplified(response)
    models_list = self.filter_before_inject(models_list)

    all_injected = []
    # Work in batches of 10000 models so each db transaction stays bounded.
    for models in utils.grouper(models_list, n=10000):
        models = list(models)
        # Use the module logger with lazy arguments, like the rest of the
        # module, rather than the root logger with eager formatting.
        logger.debug('models: %s', len(models))
        # inject into local db
        injected = self.inject_repo_data_into_db(models)
        # queue workers
        self.create_missing_origins_and_tasks(models, injected)
        all_injected.append(injected)
        # flush: commit this batch, then continue on a fresh session
        self.db_session.commit()
        self.db_session = self.mk_session()

    return response, all_injected
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment