class CGitLister(ListerOnePageApiTransport, SimpleLister):
    """Lister for the git repositories published by one cgit instance.

    ``PAGE`` holds the URL of the instance's index page.  The index HTML is
    scraped by :meth:`list_packages`; for every repository row the
    repository's own page is then fetched to discover its clone URL.

    NOTE(review): fetching ``PAGE`` and persisting the models is presumably
    done by the two base classes, whose code is not visible here.
    """
    MODEL = CGitModel
    LISTER_NAME = 'cgit'
    PAGE = ''

    def __init__(self, base_url, instance=None, override_config=None):
        """Configure the lister for the cgit instance rooted at ``base_url``.

        Args:
            base_url: URL of the cgit index page.  A trailing ``/`` is
                appended when it is missing.
            instance: name identifying the instance.  Defaults to the
                hostname of ``base_url``.
            override_config: passed through unchanged to ``SimpleLister``.
        """
        if not base_url.endswith('/'):
            base_url = base_url + '/'
        self.PAGE = base_url

        # Repository links are appended to ``next_url`` (see get_url), so it
        # must contain the scheme and host only: for
        # https://git.kernel.org/pub/scm/ it is https://git.kernel.org.
        # Keeping the path here would duplicate it in every repository URL
        # (e.g. .../cgit//cgit/repo.git/).
        parsed_url = urllib.parse.urlparse(base_url)
        self.next_url = '%s://%s' % (parsed_url.scheme, parsed_url.netloc)

        if not instance:
            instance = parsed_url.hostname
        self.instance = instance
        ListerOnePageApiTransport.__init__(self)
        SimpleLister.__init__(self, override_config=override_config)

    def list_packages(self, response):
        """List the actual cgit instance origins from the response.

        Args:
            response: object whose ``text`` attribute is the HTML of the
                index page.

        Returns:
            list of dict: one ``{'name', 'time', 'origin_url'}`` entry per
            repository row for which a clone URL was found, in random
            order.  ``time`` is the ``title`` of the row's first ``<span>``
            or ``None`` when the row has none.
        """
        repos_details = []
        soup = BeautifulSoup(response.text, features="html.parser") \
            .find('div', {"class": "content"})
        repos = soup.find_all("tr", {"class": ""})
        for repo in repos:
            repo_name = repo.a.text
            repo_url = self.get_url(repo)
            # One extra HTTP request per repository: the clone URL is only
            # shown on the repository's own page.
            origin_url = find_origin_url(repo_url)

            try:
                time = repo.span['title']
            except (TypeError, KeyError):
                # TypeError: the row has no <span> (repo.span is None).
                # KeyError: the <span> carries no title attribute.
                time = None

            if origin_url is not None:
                repos_details.append({
                    'name': repo_name,
                    'time': time,
                    'origin_url': origin_url,
                })

        random.shuffle(repos_details)
        return repos_details

    def get_url(self, repo):
        """Finds url of a repo page.

        Finds the url of a repo page by parsing over the html of the row of
        that repo present in the base url.

        Args:
            repo: a beautifulsoup object of the html code of the repo row
                present in base url.

        Returns:
            string: The url of a repo, built from the scheme and host of the
            base url followed by the ``href`` of the row's first link.
        """
        suffix = repo.a['href']
        return self.next_url + suffix

    def get_model_from_repo(self, repo):
        """Transform from repository representation to model.

        Args:
            repo: dict with the keys ``name``, ``time`` and ``origin_url``,
                as produced by :meth:`list_packages`.

        Returns:
            dict: the column values of one model row.  ``uid`` is the base
            url followed by the repository name.
        """
        return {
            'uid': self.PAGE + repo['name'],
            'name': repo['name'],
            'full_name': repo['name'],
            'html_url': repo['origin_url'],
            'origin_url': repo['origin_url'],
            'origin_type': 'git',
            'time_updated': repo['time'],
        }

    def transport_response_simplified(self, response):
        """Transform response to list for model manipulation.

        Args:
            response: iterable of repository dicts (see
                :meth:`get_model_from_repo` for the expected keys).

        Returns:
            list of dict: one model dict per repository.
        """
        return [self.get_model_from_repo(repo) for repo in response]
def find_origin_url(repo_url):
    """Finds origin url for a repo.

    Fetches the page of the repo over HTTP, collects every clone url listed
    on it and returns the preferred one (see ``priority_origin_url``).

    Args:
        repo_url: URL of the repo page.

    Returns:
        string: Origin url for the repo, or None when the page lists no url
        with a known protocol.

    Example (performs a network request)::

        find_origin_url('http://git.savannah.gnu.org/cgit/fbvbconv-py.git/')
        # -> 'https://git.savannah.gnu.org/git/fbvbconv-py.git'

    """
    response = requests.get(repo_url)
    soup = BeautifulSoup(response.text, features="html.parser")

    origin_urls = find_all_origin_url(soup)
    return priority_origin_url(origin_urls)


def find_all_origin_url(soup):
    """Finds all the origin urls of a repo by parsing its page.

    Every table row that follows the row whose text is ``Clone`` is read as
    one clone url; rows before it are ignored.

    Args:
        soup: a beautifulsoup object of the html code of the repo page (any
            object whose ``find_all('tr')`` yields items with a ``text``
            attribute works).

    Returns:
        dict: All possible origin urls with their protocol as key.  Rows
        after ``Clone`` that contain no ``:`` separator are skipped.

    Examples:
        If soup is beautifulsoup object of the html code at
        http://git.savannah.gnu.org/cgit/fbvbconv-py.git/ the result is::

            {'git': 'git://git.savannah.gnu.org/fbvbconv-py.git',
             'https': 'https://git.savannah.gnu.org/git/fbvbconv-py.git',
             'ssh': 'ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git'}
    """
    origin_urls = {}
    found_clone_word = False

    for row in soup.find_all('tr'):
        text = row.text.strip()
        if found_clone_word:
            protocol, separator, _ = text.partition(':')
            # Without a ':' there is no protocol to key the url on; slicing
            # with the -1 returned by str.find would invent a bogus key.
            if separator and protocol:
                origin_urls[protocol] = text
        elif text == 'Clone':
            found_clone_word = True

    return origin_urls


def priority_origin_url(origin_url):
    """Finds the highest priority link for a particular repo.

    Priority order is https>http>git>ssh.

    Args:
        origin_url: A dictionary of origin links with their protocol as key.

    Returns:
        string: URL with the highest priority, or None when none of the
        four protocols is present.

    """
    for protocol in ('https', 'http', 'git', 'ssh'):
        if protocol in origin_url:
            return origin_url[protocol]
    return None
# --- swh/lister/cgit/models.py ---

class CGitModel(ModelBase):
    """Database row describing one repository listed from a cgit instance.

    """
    __tablename__ = 'cgit_repo'

    # Primary key.  The lister fills it with the instance base url followed
    # by the repository name.
    uid = Column(String, primary_key=True)
    # Stored as a plain string rather than a timestamp.
    time_updated = Column(String)


# --- swh/lister/cgit/tasks.py ---

def new_lister(base_url='https://git.savannah.gnu.org/cgit/',
               instance='savannah-gnu', **kw):
    """Build a CGitLister, defaulting to the GNU Savannah instance.

    Any extra keyword argument is forwarded to the CGitLister constructor.
    """
    lister = CGitLister(base_url=base_url, instance=instance, **kw)
    return lister


@app.task(name=__name__ + '.CGitListerTask')
def cgit_lister(**lister_args):
    """Celery task: build a lister from ``lister_args`` and run it."""
    new_lister(**lister_args).run()


@app.task(name=__name__ + '.ping')
def ping():
    """Celery task answering 'OK'."""
    return 'OK'
0000000000000000000000000000000000000000..1d34a4c8e1228fbc06780251ccd495938129431f --- /dev/null +++ b/swh/lister/cgit/tests/api_response.html @@ -0,0 +1,47 @@ +<!DOCTYPE html> +<html lang='en'> +<head> +<title>fbvbconv-py.git - Unnamed repository; edit this file 'description' to name the repository.</title> +<meta name='generator' content='cgit v1.0-41-gc330'/> +<meta name='robots' content='index, nofollow'/> +<link rel='stylesheet' type='text/css' href='/cgit/cgit.css'/> +<link rel='shortcut icon' href='/gitweb/git-favicon.png'/> +<link rel='alternate' title='Atom feed' href='http://git.savannah.gnu.org/cgit/fbvbconv-py.git/atom/?h=master' type='application/atom+xml'/> +<link rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/> +<link rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/> +<link rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/> +</head> +<body> +<div id='cgit'><table id='header'> +<tr> +<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit/cgit.png' alt='cgit logo'/></a></td> +<td class='main'><a href='/cgit/'>index</a> : <a title='fbvbconv-py.git' href='/cgit/fbvbconv-py.git/'>fbvbconv-py.git</a></td><td class='form'><form method='get'> +<select name='h' onchange='this.form.submit();'> +<option value='master' selected='selected'>master</option> +</select> <input type='submit' value='switch'/></form></td></tr> +<tr><td class='sub'>Unnamed repository; edit this file 'description' to name the repository.</td><td class='sub right'></td></tr></table> +<table class='tabs'><tr><td> +<a class='active' href='/cgit/fbvbconv-py.git/'>summary</a><a href='/cgit/fbvbconv-py.git/refs/'>refs</a><a href='/cgit/fbvbconv-py.git/log/'>log</a><a href='/cgit/fbvbconv-py.git/tree/'>tree</a><a href='/cgit/fbvbconv-py.git/commit/'>commit</a><a href='/cgit/fbvbconv-py.git/diff/'>diff</a></td><td 
class='form'><form class='right' method='get' action='/cgit/fbvbconv-py.git/log/'> +<select name='qt'> +<option value='grep'>log msg</option> +<option value='author'>author</option> +<option value='committer'>committer</option> +<option value='range'>range</option> +</select> +<input class='txt' type='text' size='10' name='q' value=''/> +<input type='submit' value='search'/> +</form> +</td></tr></table> +<div class='content'><table summary='repository info' class='list nowrap'><tr class='nohover'><th class='left'>Branch</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left' colspan='2'>Age</th></tr> +<tr><td><a href='/cgit/fbvbconv-py.git/log/'>master</a></td><td><a href='/cgit/fbvbconv-py.git/commit/'>initial import</a></td><td>Johannes Stezenbach</td><td colspan='2'><span class='age-years' title='2017-06-02 09:57:38 +0200'>2 years</span></td></tr> +<tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><th class='left'>Age</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left'>Files</th><th class='left'>Lines</th></tr> +<tr><td><span title='2017-06-02 09:57:38 +0200'>2017-06-02</span></td><td><a href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>initial import</a><span class='decoration'><a class='deco' href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>HEAD</a><a class='branch-deco' href='/cgit/fbvbconv-py.git/log/'>master</a></span></td><td>Johannes Stezenbach</td><td>3</td><td><span class='deletions'>-0</span>/<span class='insertions'>+889</span></td></tr> +<tr class='nohover'><td colspan='5'> </td></tr><tr class='nohover'><th class='left' colspan='5'>Clone</th></tr> +<tr><td colspan='5'><a rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>git://git.savannah.gnu.org/fbvbconv-py.git</a></td></tr> +<tr><td colspan='5'><a rel='vcs-git' 
def test_find_all_origin_url():
    """The three clone urls of the sample repo page are found."""
    import os

    # The fixture lives next to this module; resolving it from __file__
    # keeps the test independent of the directory pytest is started from.
    fixture = os.path.join(os.path.dirname(__file__), 'api_response.html')
    # The context manager closes the file even if parsing fails.
    with open(fixture) as f:
        soup = BeautifulSoup(f.read(), features="html.parser")
    expected_output = {'https': 'https://git.savannah.gnu.org/git/'
                                'fbvbconv-py.git',
                       'ssh': 'ssh://git.savannah.gnu.org/srv/git/'
                              'fbvbconv-py.git',
                       'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}

    output = find_all_origin_url(soup)

    for protocol, url in expected_output.items():
        assert url == output[protocol]


def test_priority_origin_url():
    """https wins over git, git wins over ssh, no url gives None."""
    first_input = {'https': 'https://kernel.googlesource.com/pub/scm/docs/'
                            'man-pages/man-pages.git',
                   'git': 'git://git.kernel.org/pub/scm/docs/man-pages/'
                          'man-pages.git'}
    second_input = {'git': 'git://git.savannah.gnu.org/perl-pesel.git',
                    'ssh': 'ssh://git.savannah.gnu.org/srv/git/perl-pesel.git'}
    third_input = {}

    assert (priority_origin_url(first_input) ==
            'https://kernel.googlesource.com/pub/scm/docs/man-pages/'
            'man-pages.git')
    assert (priority_origin_url(second_input) ==
            'git://git.savannah.gnu.org/perl-pesel.git')
    assert priority_origin_url(third_input) is None
def test_ping(swh_app, celery_session_worker):
    """The ping task of the cgit lister completes and answers 'OK'."""
    task = swh_app.send_task('swh.lister.cgit.tasks.ping')
    assert task
    task.wait()
    assert task.successful()
    assert task.result == 'OK'


@patch('swh.lister.cgit.tasks.CGitLister')
def test_lister(lister, swh_app, celery_session_worker):
    """The lister task builds a default CGitLister and runs it once."""
    # Instantiating the patched class hands back the mock itself, so calls
    # made on the "instance" are recorded on ``lister`` as well.
    lister.return_value = lister
    lister.run.return_value = None

    task = swh_app.send_task('swh.lister.cgit.tasks.CGitListerTask')
    assert task
    task.wait()
    assert task.successful()

    # No argument was sent with the task: the defaults must be used.
    lister.assert_called_once_with(
        base_url='https://git.savannah.gnu.org/cgit/',
        instance='savannah-gnu')
    lister.db_last_index.assert_not_called()
    lister.run.assert_called_once_with()
'phabricator', 'gnu', 'cran', 'cgit'] @click.group(name='lister', context_settings=CONTEXT_SETTINGS) @@ -125,6 +125,13 @@ def cli(ctx, db_url, listers, drop_tables): from .cran.lister import CRANLister _lister = CRANLister(override_config=override_conf) + elif lister == 'cgit': + from .cgit.models import ModelBase + from .cgit.lister import CGitLister + _lister = CGitLister( + base_url='http://git.savannah.gnu.org/cgit/', + override_config=override_conf) + else: raise ValueError( 'Invalid lister %s: only supported listers are %s' % diff --git a/swh/lister/core/tests/conftest.py b/swh/lister/core/tests/conftest.py index e241d274abd680829380b29c94d84a7c8199197e..b8dd868009fdef82e85f838d75a4171fa76872db 100644 --- a/swh/lister/core/tests/conftest.py +++ b/swh/lister/core/tests/conftest.py @@ -6,6 +6,7 @@ from swh.scheduler.tests.conftest import * # noqa def celery_includes(): return [ 'swh.lister.bitbucket.tasks', + 'swh.lister.cgit.tasks', 'swh.lister.cran.tasks', 'swh.lister.debian.tasks', 'swh.lister.github.tasks',