From 151f6cd2235cedd420174ffad01fc8030ee4a787 Mon Sep 17 00:00:00 2001
From: Archit Agrawal <archit18221@iiit.ac.in>
Date: Fri, 17 May 2019 15:54:20 +0530
Subject: [PATCH] swh.lister.gnu

Implement first pass of gnu lister to list all the
packages present in https://ftp.gnu.org/
Add GNU lister in README and cli.py

Closes T1722
---
 README.md                          |  12 ++
 swh/lister/cli.py                  |   7 +-
 swh/lister/core/tests/conftest.py  |   1 +
 swh/lister/gnu/__init__.py         |   0
 swh/lister/gnu/lister.py           | 217 +++++++++++++++++++++++++++++
 swh/lister/gnu/models.py           |  17 +++
 swh/lister/gnu/tasks.py            |  17 +++
 swh/lister/gnu/tests/__init__.py   |   0
 swh/lister/gnu/tests/conftest.py   |   1 +
 swh/lister/gnu/tests/test_tasks.py |  27 ++++
 10 files changed, 298 insertions(+), 1 deletion(-)
 create mode 100644 swh/lister/gnu/__init__.py
 create mode 100644 swh/lister/gnu/lister.py
 create mode 100644 swh/lister/gnu/models.py
 create mode 100644 swh/lister/gnu/tasks.py
 create mode 100644 swh/lister/gnu/tests/__init__.py
 create mode 100644 swh/lister/gnu/tests/conftest.py
 create mode 100644 swh/lister/gnu/tests/test_tasks.py

diff --git a/README.md b/README.md
index 887b5995..4d56957d 100644
--- a/README.md
+++ b/README.md
@@ -177,6 +177,18 @@ logging.basicConfig(level=logging.DEBUG)
 incremental_phabricator_lister(forge_url='https://forge.softwareheritage.org', api_token='XXXX')
 ```
 
+## lister-gnu
+
+Once configured, you can execute a GNU lister using the following instructions in a `python3` script:
+
+```lang=python
+import logging
+from swh.lister.gnu.tasks import gnu_lister
+
+logging.basicConfig(level=logging.DEBUG)
+gnu_lister()
+```
+
 Licensing
 ---------
 
diff --git a/swh/lister/cli.py b/swh/lister/cli.py
index e6563c96..22b520cd 100644
--- a/swh/lister/cli.py
+++ b/swh/lister/cli.py
@@ -12,7 +12,7 @@ from swh.core.cli import CONTEXT_SETTINGS
 logger = logging.getLogger(__name__)
 
 SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
-                     'npm', 'phabricator']
+                     'npm', 'phabricator', 'gnu']
 
 
 @click.group(name='lister', context_settings=CONTEXT_SETTINGS)
@@ -115,6 +115,11 @@ def cli(ctx, db_url, listers, drop_tables):
                 api_token='',
                 override_config=override_conf)
 
+        elif lister == 'gnu':
+            from .gnu.models import ModelBase
+            from .gnu.lister import GNULister
+            _lister = GNULister(override_config=override_conf)
+
         else:
             raise ValueError(
                 'Invalid lister %s: only supported listers are %s' %
diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py
index 17ce8f22..16a9a07c 100644
--- a/swh/lister/core/tests/conftest.py
+++ b/swh/lister/core/tests/conftest.py
@@ -12,4 +12,5 @@ def celery_includes():
         'swh.lister.npm.tasks',
         'swh.lister.pypi.tasks',
         'swh.lister.phabricator.tasks',
+        'swh.lister.gnu.tasks'
     ]
diff --git a/swh/lister/gnu/__init__.py b/swh/lister/gnu/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py
new file mode 100644
index 00000000..bd821d49
--- /dev/null
+++ b/swh/lister/gnu/lister.py
@@ -0,0 +1,217 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+import random
+import gzip
+import json
+import os
+import requests
+from urllib.parse import urlparse
+
+from .models import GNUModel
+
+from swh.scheduler import utils
+from swh.lister.core.simple_lister import SimpleLister
+from swh.model.hashutil import MultiHash, HASH_BLOCK_SIZE
+
+
+class LocalResponse:
+    """File-backed stand-in for a response, exposing iter_content.
+
+    """
+    def __init__(self, path):
+        self.path = path  # local file streamed by iter_content()
+
+    def iter_content(self, chunk_size=None):
+        """Yield the file as byte chunks; one read of chunk_size each."""
+        with open(self.path, 'rb') as stream:
+            # Two-argument iter() stops at the first empty read (EOF);
+            # chunk_size=None makes read() return the whole file at once.
+            for piece in iter(lambda: stream.read(chunk_size), b''):
+                yield piece
+
+
+class ArchiveFetcher:
+    """Http/Local client in charge of downloading archives from a
+       remote/local server.
+
+    Args:
+        temp_directory (str): Directory the downloaded files are written
+                              to; defaults to the current working directory
+
+    """
+    def __init__(self, temp_directory=None):
+        self.temp_directory = temp_directory or os.getcwd()
+        self.session = requests.session()
+        self.params = {
+            'headers': {
+                'User-Agent': 'Software Heritage Lister ( __devl__)'
+            }
+        }
+
+    def download(self, url):
+        """Download the archive at url into self.temp_directory.
+
+        Args:
+            url (str): Url (file or http*)
+
+        Raises:
+            ValueError: on a non-200 response or a size mismatch
+
+        Returns:
+            str: path of the local copy of the file
+
+        """
+        url_parsed = urlparse(url)
+        if url_parsed.scheme == 'file':
+            path = url_parsed.path
+            response = LocalResponse(path)
+            length = os.path.getsize(path)
+        else:
+            response = self.session.get(url, **self.params, stream=True)
+            if response.status_code != 200:
+                raise ValueError("Fail to query '%s'. Reason: %s" % (
+                    url, response.status_code))
+            length = int(response.headers['content-length'])
+
+        filepath = os.path.join(self.temp_directory, os.path.basename(url))
+
+        h = MultiHash(length=length)  # NOTE: digests are not returned yet
+        with open(filepath, 'wb') as f:
+            for chunk in response.iter_content(chunk_size=HASH_BLOCK_SIZE):
+                h.update(chunk)
+                f.write(chunk)
+
+        actual_length = os.path.getsize(filepath)
+        if length != actual_length:  # truncated or otherwise partial copy
+            raise ValueError('Error when checking size: %s != %s' % (
+                length, actual_length))
+
+        return filepath
+
+
+class GNULister(SimpleLister, ArchiveFetcher):
+    """List the package directories found in ftp.gnu.org's tree.json.gz."""
+    MODEL = GNUModel
+    LISTER_NAME = 'gnu'
+    TREE_URL = 'https://ftp.gnu.org/tree.json.gz'
+
+    def __init__(self, override_config=None):
+        SimpleLister.__init__(self, override_config=override_config)
+        ArchiveFetcher.__init__(self)  # takes temp_directory, not a config
+
+    def task_dict(self, origin_type, origin_url, **kwargs):
+        """(Override) Return task format dict
+
+        This is overridden from the lister_base as more information is
+        needed for the ingestion task creation.
+
+        """
+        _type = 'load-%s' % origin_type
+        _policy = 'recurring'
+        project_name = kwargs.get('name')
+        project_metadata_url = kwargs.get('html_url')
+        return utils.create_task_dict(
+            _type, _policy, project_name, origin_url,
+            project_metadata_url=project_metadata_url)
+
+    def download_file(self):
+        """Download the gzipped tree.json listing found at TREE_URL.
+
+        Returns:
+            str: path of the downloaded file
+
+        """
+        # download() returns only the local path: there is nothing to unpack.
+        return self.download(self.TREE_URL)
+
+    def read_downloaded_file(self, file_path):
+        """Decompress the downloaded gzip file and parse it as JSON.
+
+        Args:
+            file_path (str): path of the gzip-compressed JSON file
+        Returns:
+            the decoded JSON document
+        """
+        with gzip.GzipFile(file_path, 'r') as fin:
+            return json.loads(fin.read().decode('utf-8'))
+
+    def safely_issue_request(self, identifier):
+        """(Override) Download the file describing the file structure of
+        the GNU website and return its parsed content.
+
+        Args:
+            identifier: resource identifier (not used by this lister)
+        Returns:
+            the JSON content of the downloaded tree file
+        """
+        file_path = self.download_file()
+        response = self.read_downloaded_file(file_path)
+        return response
+
+    def list_packages(self, response):
+        """(Override) List the gnu origins (name, url and time last
+        modified) found in the response, in shuffled order.
+
+        """
+        response = clean_up_response(response)
+        _packages = []
+        for directory in response:
+            content = directory['contents']
+            for repo in content:
+                if repo['type'] == 'directory':
+                    repo_details = {
+                        'name': repo['name'],
+                        'url': self._get_project_url(directory['name'],
+                                                     repo['name']),
+                        'time_modified': repo['time']
+                    }
+                    _packages.append(repo_details)
+        random.shuffle(_packages)
+        return _packages
+
+    def _get_project_url(self, dir_name, package_name):
+        """Return the https://ftp.gnu.org/ url of a package directory
+
+        """
+        return 'https://ftp.gnu.org/%s/%s/' % (dir_name, package_name)
+
+    def get_model_from_repo(self, repo):
+        """(Override) Transform from repository representation to model
+
+        """
+        return {
+            'uid': repo['name'],
+            'name': repo['name'],
+            'full_name': repo['name'],
+            'html_url': repo['url'],
+            'origin_url': repo['url'],
+            'time_last_upated': repo['time_modified'],  # sic: model column
+            'origin_type': 'gnu',
+            'description': None,
+        }
+
+    def transport_response_simplified(self, response):
+        """(Override) Transform response to list for model manipulation
+
+        """
+        return [self.get_model_from_repo(repo) for repo in response]
+
+    def transport_request(self):
+        pass
+
+    def transport_response_to_string(self):
+        pass
+
+    def transport_quota_check(self):
+        pass
+
+
+def clean_up_response(response):
+    """Keep only the top-level 'gnu', 'mirrors' and 'old-gnu' directories.
+
+    Children live under the 'contents' key, the one list_packages reads.
+    """
+    return [directory for directory in response[0]['contents']
+            if directory['name'] in ('gnu', 'mirrors', 'old-gnu')]
diff --git a/swh/lister/gnu/models.py b/swh/lister/gnu/models.py
new file mode 100644
index 00000000..ebad0396
--- /dev/null
+++ b/swh/lister/gnu/models.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from sqlalchemy import Column, String, Integer
+
+from ..core.models import ModelBase
+
+
+class GNUModel(ModelBase):
+    """Model of one GNU repository, stored in the 'gnu_repo' table.
+
+    """
+    __tablename__ = 'gnu_repo'
+
+    uid = Column(String, primary_key=True)  # GNULister stores the package name
+    time_last_upated = Column(Integer)  # (sic) key emitted by GNULister
diff --git a/swh/lister/gnu/tasks.py b/swh/lister/gnu/tasks.py
new file mode 100644
index 00000000..251eccf2
--- /dev/null
+++ b/swh/lister/gnu/tasks.py
@@ -0,0 +1,17 @@
+# Copyright (C) 2019 the Software Heritage developers
+# License: GNU General Public License version 3, or any later version
+# See top-level LICENSE file for more information
+
+from swh.scheduler.celery_backend.config import app
+
+from .lister import GNULister
+
+
+@app.task(name='%s.GNUListerTask' % __name__)
+def gnu_lister(**lister_args):
+    GNULister(**lister_args).run()  # build from the task kwargs, then run
+
+
+@app.task(name='%s.ping' % __name__)
+def ping():
+    return 'OK'  # fixed reply asserted by test_ping
diff --git a/swh/lister/gnu/tests/__init__.py b/swh/lister/gnu/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/swh/lister/gnu/tests/conftest.py b/swh/lister/gnu/tests/conftest.py
new file mode 100644
index 00000000..507fef91
--- /dev/null
+++ b/swh/lister/gnu/tests/conftest.py
@@ -0,0 +1 @@
+from swh.lister.core.tests.conftest import *  # noqa
diff --git a/swh/lister/gnu/tests/test_tasks.py b/swh/lister/gnu/tests/test_tasks.py
new file mode 100644
index 00000000..4c82f777
--- /dev/null
+++ b/swh/lister/gnu/tests/test_tasks.py
@@ -0,0 +1,27 @@
+from unittest.mock import patch
+
+
+def test_ping(swh_app, celery_session_worker):
+    # Send the ping task by name and check the reply once it completes.
+    task_name = 'swh.lister.gnu.tasks.ping'
+    outcome = swh_app.send_task(task_name)
+    assert outcome
+    outcome.wait()
+    assert outcome.successful() and outcome.result == 'OK'
+
+
+@patch('swh.lister.gnu.tasks.GNULister')
+def test_lister(lister, swh_app, celery_session_worker):
+    # The mock doubles as the instance its own constructor call returns.
+    lister.run.return_value = None
+    lister.return_value = lister
+
+    task_name = 'swh.lister.gnu.tasks.GNUListerTask'
+    outcome = swh_app.send_task(task_name)
+    assert outcome
+    outcome.wait()
+    assert outcome.successful()
+
+    lister.assert_called_once_with()
+    lister.run.assert_called_once_with()
+    lister.db_last_index.assert_not_called()
-- 
GitLab