Skip to content
Snippets Groups Projects
Commit 151f6cd2 authored by Archit Agrawal's avatar Archit Agrawal
Browse files

swh.lister.gnu

Implement first pass of gnu lister to list all the
packages present in https://ftp.gnu.org/
Add GNU lister in README and cli.py

Closes T1722
parent f8a2ae86
No related branches found
No related tags found
1 merge request!369GNU Lister
......@@ -177,6 +177,18 @@ logging.basicConfig(level=logging.DEBUG)
incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX')
```
## lister-gnu
Once configured, you can execute a GNU lister using the following instructions in a `python3` script:
```lang=python
import logging
from swh.lister.gnu.tasks import gnu_lister
logging.basicConfig(level=logging.DEBUG)
gnu_lister()
```
Licensing
---------
......
......@@ -12,7 +12,7 @@ from swh.core.cli import CONTEXT_SETTINGS
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
'npm', 'phabricator']
'npm', 'phabricator', 'gnu']
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
......@@ -115,6 +115,11 @@ def cli(ctx, db_url, listers, drop_tables):
api_token='',
override_config=override_conf)
elif lister == 'gnu':
from .gnu.models import ModelBase
from .gnu.lister import GNULister
_lister = GNULister(override_config=override_conf)
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
......
......@@ -12,4 +12,5 @@ def celery_includes():
'swh.lister.npm.tasks',
'swh.lister.pypi.tasks',
'swh.lister.phabricator.tasks',
'swh.lister.gnu.tasks'
]
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
import gzip
import json
import os
import requests
from urllib.parse import urlparse
from .models import GNUModel
from swh.scheduler import utils
from swh.lister.core.simple_lister import SimpleLister
from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
class LocalResponse:
    """File-backed stand-in for an HTTP response.

    Only the ``iter_content`` part of the response api is provided, so a
    local file can be consumed the same way as a streamed download.

    Args:
        path (str): Path of the local file to read

    """
    def __init__(self, path):
        self.path = path

    def iter_content(self, chunk_size=None):
        """Yield the file content as successive chunks of bytes.

        Args:
            chunk_size (int): Maximum size of each chunk; with None the
                whole file is read in a single chunk

        Yields:
            Non-empty bytes chunks, in file order

        """
        with open(self.path, 'rb') as stream:
            # iter() with a sentinel stops at the first empty read (EOF)
            yield from iter(lambda: stream.read(chunk_size), b'')
class ArchiveFetcher:
    """Http/Local client in charge of downloading archives from a
    remote/local server.

    Args:
        temp_directory (str): Path to the temporary disk location used
            for downloading the release artifacts. Defaults to the
            current working directory when not provided.

    """
    def __init__(self, temp_directory=None):
        # Honour the caller-provided directory; only fall back to the
        # current working directory when none is given.
        self.temp_directory = temp_directory or os.getcwd()
        self.session = requests.session()
        self.params = {
            'headers': {
                'User-Agent': 'Software Heritage Lister ( __devl__)'
            }
        }

    def download(self, url):
        """Download the remote tarball url locally.

        The file is written in ``self.temp_directory`` under the basename
        of the url, and hashed while it is being written.

        Args:
            url (str): Url (file or http*)

        Raises:
            ValueError in case of failing to query, when the server does
            not announce a content length, or when the size of the
            written file differs from the announced length

        Returns:
            Tuple of local (filepath, hashes of filepath)

        """
        url_parsed = urlparse(url)
        if url_parsed.scheme == 'file':
            path = url_parsed.path
            response = LocalResponse(path)
            length = os.path.getsize(path)
        else:
            response = self.session.get(url, **self.params, stream=True)
            if response.status_code != 200:
                raise ValueError("Fail to query '%s'. Reason: %s" % (
                    url, response.status_code))
            content_length = response.headers.get('content-length')
            if content_length is None:
                # The expected length is needed both for hashing and for
                # the size check below; fail with the documented error
                # type rather than a bare KeyError.
                raise ValueError(
                    "Fail to query '%s'. Reason: missing content-length"
                    % url)
            length = int(content_length)

        filepath = os.path.join(self.temp_directory, os.path.basename(url))

        h = MultiHash(length=length)
        with open(filepath, 'wb') as f:
            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
                h.update(chunk)
                f.write(chunk)

        actual_length = os.path.getsize(filepath)
        if length != actual_length:
            raise ValueError('Error when checking size: %s != %s' % (
                length, actual_length))

        hashes = {
            'length': length,
            **h.hexdigest()
        }
        return filepath, hashes
class GNULister(SimpleLister, ArchiveFetcher):
    """Lister for the packages published on https://ftp.gnu.org/.

    The listing is built from the gzipped JSON description of the ftp
    file tree (``TREE_URL``), downloaded through ArchiveFetcher.

    """
    MODEL = GNUModel
    LISTER_NAME = 'gnu'
    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'

    def __init__(self, override_config=None):
        SimpleLister.__init__(self, override_config=override_config)
        # ArchiveFetcher only accepts ``temp_directory``; forwarding
        # ``override_config`` to it raises a TypeError.
        ArchiveFetcher.__init__(self)

    def task_dict(self, origin_type, origin_url, **kwargs):
        """(Override)
        Return task format dict

        This is overridden from the lister_base as more information is
        needed for the ingestion task creation.

        Args:
            origin_type (str): Type of the origin, used to build the
                'load-<origin_type>' task type
            origin_url (str): Url of the origin
            **kwargs: 'name' and 'html_url' are read from it

        """
        _type = 'load-%s' % origin_type
        _policy = 'recurring'
        project_name = kwargs.get('name')
        project_metadata_url = kwargs.get('html_url')
        return utils.create_task_dict(
            _type, _policy, project_name, origin_url,
            project_metadata_url=project_metadata_url)

    def download_file(self):
        '''
        Downloads tree.json file and returns its location

        Returns
        File path of the downloaded file
        '''
        # download() is documented to return (filepath, hashes); only the
        # path is needed here.
        file_path, _ = self.download(self.TREE_URL)
        return file_path

    def read_downloaded_file(self, file_path):
        '''
        Reads the downloaded file content and convert it into json format

        Args:
            file_path: path of the gzip-compressed json file

        Returns
        File content in json format
        '''
        with gzip.GzipFile(file_path, 'r') as fin:
            response = json.loads(fin.read().decode('utf-8'))
        return response

    def safely_issue_request(self, identifier):
        '''(Override) Make the network request to download the file which
        describes the file structure of the GNU website.

        Args:
            identifier: resource identifier (not used here)

        Returns:
            server response, as the decoded json content
        '''
        file_path = self.download_file()
        response = self.read_downloaded_file(file_path)
        return response

    def list_packages(self, response):
        """(Override) List the actual gnu origins with their names and
        time last updated from the response.

        Args:
            response: decoded json description of the file tree

        Returns:
            List of dicts with keys 'name', 'url' and 'time_modified',
            one per sub-directory of the directories of interest, in
            random order

        """
        response = clean_up_response(response)
        _packages = []
        for directory in response:
            content = directory['contents']
            for repo in content:
                # Only directories are packages; other entries are skipped
                if repo['type'] == 'directory':
                    repo_details = {
                        'name': repo['name'],
                        'url': self._get_project_url(directory['name'],
                                                     repo['name']),
                        'time_modified': repo['time']
                    }
                    _packages.append(repo_details)
        random.shuffle(_packages)
        return _packages

    def _get_project_url(self, dir_name, package_name):
        """Returns project_url

        Args:
            dir_name (str): Name of the top-level directory
            package_name (str): Name of the package directory

        """
        return 'https://ftp.gnu.org/%s/%s/' % (dir_name, package_name)

    def get_model_from_repo(self, repo):
        """(Override) Transform from repository representation to model

        Args:
            repo (dict): Entry as built by list_packages

        """
        return {
            'uid': repo['name'],
            'name': repo['name'],
            'full_name': repo['name'],
            'html_url': repo['url'],
            'origin_url': repo['url'],
            # Key spelling matches the GNUModel column name
            'time_last_upated': repo['time_modified'],
            'origin_type': 'gnu',
            'description': None,
        }

    def transport_response_simplified(self, response):
        """(Override) Transform response to list for model manipulation

        """
        return [self.get_model_from_repo(repo) for repo in response]

    def transport_request(self):
        """No-op in this lister."""
        pass

    def transport_response_to_string(self):
        """No-op in this lister."""
        pass

    def transport_quota_check(self):
        """No-op in this lister."""
        pass
def clean_up_response(response):
    """Keep only the top-level directories of interest from the file tree.

    Args:
        response (list): Decoded json description of the file tree; its
            first element is the root node, whose children are listed
            under the 'contents' key

    Returns:
        List of the root's children named 'gnu', 'mirrors' or 'old-gnu',
        in their original order (empty when the response is empty)

    """
    if not response:
        return []
    wanted = ('gnu', 'mirrors', 'old-gnu')
    # Tree nodes list their children under 'contents' (the same key
    # list_packages reads on the returned directories), not 'content'.
    file_system = response[0]['contents']
    return [directory for directory in file_system
            if directory['name'] in wanted]
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String, Integer
from ..core.models import ModelBase
class GNUModel(ModelBase):
    """a GNU repository representation

    One row per package listed by the GNU lister, stored in the
    ``gnu_repo`` table.

    """
    __tablename__ = 'gnu_repo'

    # Primary key; the GNU lister fills it with the package name.
    uid = Column(String, primary_key=True)
    # Modification time of the package directory, as provided by the
    # lister. NOTE(review): "upated" is presumably a typo for "updated";
    # the lister uses the same spelling as dict key, so both must be
    # renamed together (with a schema change) if this gets fixed.
    time_last_upated = Column(Integer)
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.scheduler.celery_backend.config import app
from .lister import GNULister
@app.task(name=__name__ + '.GNUListerTask')
def gnu_lister(**lister_args):
    """Celery task: instantiate a GNULister with the given keyword
    arguments and run it.

    """
    lister = GNULister(**lister_args)
    lister.run()
@app.task(name=__name__ + '.ping')
def ping():
    """Celery task answering with a constant 'OK'."""
    reply = 'OK'
    return reply
from swh.lister.core.tests.conftest import * # noqa
from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
    """The gnu ping task can be sent, completes and answers 'OK'."""
    task_name = 'swh.lister.gnu.tasks.ping'
    result = swh_app.send_task(task_name)
    assert result

    # Block until the worker has processed the task
    result.wait()
    assert result.successful()
    assert result.result == 'OK'
@patch('swh.lister.gnu.tasks.GNULister')
def test_lister(lister, swh_app, celery_session_worker):
    """The GNUListerTask builds a GNULister without arguments and runs it
    exactly once.

    """
    # setup the mocked GNULister: instantiating it gives back the mock
    # itself, so calls on the instance are recorded on ``lister``
    lister.return_value = lister
    lister.run.return_value = None

    task_name = 'swh.lister.gnu.tasks.GNUListerTask'
    result = swh_app.send_task(task_name)
    assert result

    # Block until the worker has processed the task
    result.wait()
    assert result.successful()

    lister.assert_called_once_with()
    lister.db_last_index.assert_not_called()
    lister.run.assert_called_once_with()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment