Skip to content
Snippets Groups Projects
Commit b972a2a8 authored by Archit Agrawal's avatar Archit Agrawal
Browse files

swh.lister.cgit

Implemented a lister to list the repos for a given CGit instance.

Closes T1659
parent d85bcdac
No related branches found
No related tags found
1 merge request!72swh.lister.cgit
......@@ -203,6 +203,19 @@ logging.basicConfig(level=logging.DEBUG)
cran_lister()
```
## lister-cgit
Once configured, you can execute a cgit lister using the following instructions
in a `python3` script:
```lang=python
import logging
from swh.lister.cgit.tasks import cgit_lister
logging.basicConfig(level=logging.DEBUG)
cgit_lister(base_url='http://git.savannah.gnu.org/cgit/')
```
Licensing
---------
......
......@@ -5,3 +5,4 @@ requests
setuptools
xmltodict
iso8601
beautifulsoup4
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import random
from bs4 import BeautifulSoup
from collections import defaultdict
import requests
import urllib.parse
from .models import CGitModel
from swh.lister.core.simple_lister import SimpleLister
from swh.lister.core.lister_transports import ListerOnePageApiTransport
class CGitLister(ListerOnePageApiTransport, SimpleLister):
    """Lister for the repositories published by one cgit instance.

    The index page at ``base_url`` is parsed for repository rows; the page of
    each repository is then fetched to find the URL it can be cloned from.
    """
    MODEL = CGitModel
    LISTER_NAME = 'cgit'
    PAGE = ''

    def __init__(self, base_url, instance=None, override_config=None):
        """Set up the lister for the cgit index page at ``base_url``.

        Args:
            base_url: URL of the cgit index page. A trailing ``/`` is added
                when missing.
            instance: name of the listed instance. Defaults to the hostname
                of ``base_url``.
            override_config: forwarded unchanged to ``SimpleLister``.
        """
        if not base_url.endswith('/'):
            base_url = base_url + '/'
        self.PAGE = base_url

        # Keep only the scheme and host of the index URL, e.g.
        # https://git.kernel.org/pub/scm/ becomes https://git.kernel.org.
        # The hrefs found in the index already carry the full path, so
        # prefixing them with the whole base URL would duplicate that path.
        parsed_url = urllib.parse.urlparse(base_url)
        self.next_url = parsed_url.scheme + '://' + parsed_url.netloc

        if not instance:
            instance = parsed_url.hostname
        self.instance = instance
        ListerOnePageApiTransport.__init__(self)
        SimpleLister.__init__(self, override_config=override_config)

    def list_packages(self, response):
        """List the actual cgit instance origins from the response.

        Args:
            response: HTTP response holding the html of the index page.

        Returns:
            list: one dict per repository with the keys ``name``, ``time``
            and ``origin_url``, in random order. Repositories for which no
            origin url could be found are left out.
        """
        repos_details = []
        soup = BeautifulSoup(response.text, features="html.parser") \
            .find('div', {"class": "content"})
        for repo in soup.find_all("tr", {"class": ""}):
            if repo.a is None:
                # A row without any link cannot describe a repository.
                continue
            repo_name = repo.a.text
            origin_url = find_origin_url(self.get_url(repo))

            try:
                updated = repo.span['title']
            except (TypeError, KeyError):
                # No <span> in the row, or a <span> without a title.
                updated = None

            if origin_url is not None:
                repos_details.append({
                    'name': repo_name,
                    'time': updated,
                    'origin_url': origin_url,
                })

        # NOTE(review): the order is randomised on purpose by the original
        # code; presumably to avoid always visiting repos in page order.
        random.shuffle(repos_details)
        return repos_details

    def get_url(self, repo):
        """Finds url of a repo page.

        Finds the url of a repo page by parsing over the html of the row of
        that repo present in the base url.

        Args:
            repo: a beautifulsoup object of the html code of the repo row
                present in base url.

        Returns:
            string: The url of a repo.
        """
        # urljoin resolves both host-absolute hrefs ('/cgit/foo.git/') and
        # relative ones against the index page URL.
        return urllib.parse.urljoin(self.PAGE, repo.a['href'])

    def get_model_from_repo(self, repo):
        """Transform from repository representation to model.

        Args:
            repo: a dict as produced by ``list_packages``.

        Returns:
            dict: the values used to build a ``CGitModel`` row.
        """
        return {
            'uid': self.PAGE + repo['name'],
            'name': repo['name'],
            'full_name': repo['name'],
            'html_url': repo['origin_url'],
            'origin_url': repo['origin_url'],
            'origin_type': 'git',
            'time_updated': repo['time'],
        }

    def transport_response_simplified(self, response):
        """Transform response to list for model manipulation.

        Args:
            response: iterable of repository dicts.

        Returns:
            list: one model dict per repository.
        """
        return [self.get_model_from_repo(repo) for repo in response]
def find_origin_url(repo_url):
    """Return the preferred clone url advertised on a repo page.

    The page at ``repo_url`` is downloaded and parsed; every clone url found
    on it is collected and the one with the highest priority is returned.

    Args:
        repo_url: URL of the repo.

    Returns:
        string: Origin url for the repo, or None when the page lists none
        with a known protocol.

    Examples:
        >>> find_origin_url(
        ...     'http://git.savannah.gnu.org/cgit/fbvbconv-py.git/')
        'https://git.savannah.gnu.org/git/fbvbconv-py.git'
    """
    page_html = requests.get(repo_url).text
    candidates = find_all_origin_url(
        BeautifulSoup(page_html, features="html.parser"))
    return priority_origin_url(candidates)
def find_all_origin_url(soup):
    """Find all the origin urls listed on a repo page.

    The rows of the page are scanned in order. Every row that comes after
    the row whose text is exactly ``Clone`` is expected to hold one clone
    url; the part of the url before the first ``:`` is used as its protocol.
    Rows after ``Clone`` that contain no ``:`` are ignored.

    Args:
        soup: a beautifulsoup object of the html code of the repo.

    Returns:
        dictionary: All possible origin urls with their protocol as key.
        Empty when the page has no ``Clone`` row.

    Examples:
        If soup is beautifulsoup object of the html code at
        http://git.savannah.gnu.org/cgit/fbvbconv-py.git/

        >>> print(find_all_origin_url(soup))
        { 'https': 'https://git.savannah.gnu.org/git/fbvbconv-py.git',
          'ssh': 'ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git',
          'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}
    """
    origin_urls = {}
    found_clone_word = False

    for row in soup.find_all('tr'):
        text = row.text.strip()
        if found_clone_word:
            protocol, separator, _ = text.partition(':')
            # Without a ':' there is no protocol to key the url on.
            if separator and protocol:
                origin_urls[protocol] = text
        elif text == 'Clone':
            found_clone_word = True

    return origin_urls
def priority_origin_url(origin_url):
    """Pick the preferred link among the origin urls of a repo.

    Protocols are tried in the order https, http, git, ssh and the url of
    the first one present is returned.

    Args:
        origin_url: A dictionary of origin links with their protocol as key.

    Returns:
        string: URL with the highest priority, or None when none of the
        four protocols is present.
    """
    preferred = ('https', 'http', 'git', 'ssh')
    return next(
        (origin_url[scheme] for scheme in preferred if scheme in origin_url),
        None)
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from sqlalchemy import Column, String
from ..core.models import ModelBase
class CGitModel(ModelBase):
    """Database model of a repository listed on a cgit instance."""
    __tablename__ = 'cgit_repo'

    # Identifier of the repository; primary key of the table.
    uid = Column(String, primary_key=True)
    # Update time of the repository, stored as a plain string.
    time_updated = Column(String)
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.scheduler.celery_backend.config import app
from .lister import CGitLister
def new_lister(base_url='https://git.savannah.gnu.org/cgit/',
               instance='savannah-gnu', **kw):
    """Build a CGitLister.

    Args:
        base_url: URL of the cgit index page to list.
        instance: name given to the listed instance.
        **kw: any other keyword argument, passed on to ``CGitLister``.

    Returns:
        CGitLister: the configured lister.
    """
    lister_kwargs = dict(kw, base_url=base_url, instance=instance)
    return CGitLister(**lister_kwargs)
@app.task(name=__name__ + '.CGitListerTask')
def cgit_lister(**lister_args):
    """Celery task: build a cgit lister from ``lister_args`` and run it."""
    new_lister(**lister_args).run()
@app.task(name=__name__ + '.ping')
def ping():
    """Celery task that only answers with the string 'OK'."""
    answer = 'OK'
    return answer
<!DOCTYPE html>
<html lang='en'>
<head>
<title>fbvbconv-py.git - Unnamed repository; edit this file 'description' to name the repository.</title>
<meta name='generator' content='cgit v1.0-41-gc330'/>
<meta name='robots' content='index, nofollow'/>
<link rel='stylesheet' type='text/css' href='/cgit/cgit.css'/>
<link rel='shortcut icon' href='/gitweb/git-favicon.png'/>
<link rel='alternate' title='Atom feed' href='http://git.savannah.gnu.org/cgit/fbvbconv-py.git/atom/?h=master' type='application/atom+xml'/>
<link rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
<link rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
<link rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'/>
</head>
<body>
<div id='cgit'><table id='header'>
<tr>
<td class='logo' rowspan='2'><a href='/cgit/'><img src='/cgit/cgit.png' alt='cgit logo'/></a></td>
<td class='main'><a href='/cgit/'>index</a> : <a title='fbvbconv-py.git' href='/cgit/fbvbconv-py.git/'>fbvbconv-py.git</a></td><td class='form'><form method='get'>
<select name='h' onchange='this.form.submit();'>
<option value='master' selected='selected'>master</option>
</select> <input type='submit' value='switch'/></form></td></tr>
<tr><td class='sub'>Unnamed repository; edit this file 'description' to name the repository.</td><td class='sub right'></td></tr></table>
<table class='tabs'><tr><td>
<a class='active' href='/cgit/fbvbconv-py.git/'>summary</a><a href='/cgit/fbvbconv-py.git/refs/'>refs</a><a href='/cgit/fbvbconv-py.git/log/'>log</a><a href='/cgit/fbvbconv-py.git/tree/'>tree</a><a href='/cgit/fbvbconv-py.git/commit/'>commit</a><a href='/cgit/fbvbconv-py.git/diff/'>diff</a></td><td class='form'><form class='right' method='get' action='/cgit/fbvbconv-py.git/log/'>
<select name='qt'>
<option value='grep'>log msg</option>
<option value='author'>author</option>
<option value='committer'>committer</option>
<option value='range'>range</option>
</select>
<input class='txt' type='text' size='10' name='q' value=''/>
<input type='submit' value='search'/>
</form>
</td></tr></table>
<div class='content'><table summary='repository info' class='list nowrap'><tr class='nohover'><th class='left'>Branch</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left' colspan='2'>Age</th></tr>
<tr><td><a href='/cgit/fbvbconv-py.git/log/'>master</a></td><td><a href='/cgit/fbvbconv-py.git/commit/'>initial import</a></td><td>Johannes Stezenbach</td><td colspan='2'><span class='age-years' title='2017-06-02 09:57:38 +0200'>2 years</span></td></tr>
<tr class='nohover'><td colspan='5'>&nbsp;</td></tr><tr class='nohover'><td colspan='5'>&nbsp;</td></tr><tr class='nohover'><th class='left'>Age</th><th class='left'>Commit message</th><th class='left'>Author</th><th class='left'>Files</th><th class='left'>Lines</th></tr>
<tr><td><span title='2017-06-02 09:57:38 +0200'>2017-06-02</span></td><td><a href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>initial import</a><span class='decoration'><a class='deco' href='/cgit/fbvbconv-py.git/commit/?id=9766bcfae598e3e077f321bade823028eb5553bb'>HEAD</a><a class='branch-deco' href='/cgit/fbvbconv-py.git/log/'>master</a></span></td><td>Johannes Stezenbach</td><td>3</td><td><span class='deletions'>-0</span>/<span class='insertions'>+889</span></td></tr>
<tr class='nohover'><td colspan='5'>&nbsp;</td></tr><tr class='nohover'><th class='left' colspan='5'>Clone</th></tr>
<tr><td colspan='5'><a rel='vcs-git' href='git://git.savannah.gnu.org/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>git://git.savannah.gnu.org/fbvbconv-py.git</a></td></tr>
<tr><td colspan='5'><a rel='vcs-git' href='https://git.savannah.gnu.org/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>https://git.savannah.gnu.org/git/fbvbconv-py.git</a></td></tr>
<tr><td colspan='5'><a rel='vcs-git' href='ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git' title='fbvbconv-py.git Git repository'>ssh://git.savannah.gnu.org/srv/git/fbvbconv-py.git</a></td></tr>
</table></div> <!-- class=content -->
<div class='footer'>generated by <a href='https://git.zx2c4.com/cgit/about/'>cgit v1.0-41-gc330</a> at 2019-06-19 10:51:46 +0000</div>
</div> <!-- id=cgit -->
</body>
</html>
from swh.lister.core.tests.conftest import * # noqa
# Copyright (C) 2019 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from bs4 import BeautifulSoup
from swh.lister.cgit.lister import priority_origin_url, find_all_origin_url
def test_find_all_origin_url():
    """Check the clone urls extracted from the saved repo page."""
    # The context manager closes the fixture file once it has been read.
    with open('swh/lister/cgit/tests/api_response.html') as f:
        soup = BeautifulSoup(f.read(), features="html.parser")
    expected_output = {'https': 'https://git.savannah.gnu.org/git/'
                                'fbvbconv-py.git',
                       'ssh': 'ssh://git.savannah.gnu.org/srv/git/'
                              'fbvbconv-py.git',
                       'git': 'git://git.savannah.gnu.org/fbvbconv-py.git'}

    output = find_all_origin_url(soup)

    for protocol, url in expected_output.items():
        assert url == output[protocol]
def test_priority_origin_url():
    """Check which url is preferred for several sets of protocols."""
    # Each case pairs the available origin urls with the url expected back.
    cases = [
        ({'https': 'https://kernel.googlesource.com/pub/scm/docs/'
                   'man-pages/man-pages.git',
          'git': 'git://git.kernel.org/pub/scm/docs/man-pages/'
                 'man-pages.git'},
         'https://kernel.googlesource.com/pub/scm/docs/man-pages/'
         'man-pages.git'),
        ({'git': 'git://git.savannah.gnu.org/perl-pesel.git',
          'ssh': 'ssh://git.savannah.gnu.org/srv/git/perl-pesel.git'},
         'git://git.savannah.gnu.org/perl-pesel.git'),
    ]
    for available, expected in cases:
        assert (priority_origin_url(available) == expected)

    # Nothing to choose from: no url is returned.
    assert priority_origin_url({}) is None
from unittest.mock import patch
def test_ping(swh_app, celery_session_worker):
    """Send the ping task and check that it answers 'OK'."""
    task_name = 'swh.lister.cgit.tasks.ping'
    outcome = swh_app.send_task(task_name)
    assert outcome

    outcome.wait()
    assert outcome.successful()
    assert outcome.result == 'OK'
@patch('swh.lister.cgit.tasks.CGitLister')
def test_lister(lister, swh_app, celery_session_worker):
    """Run the lister task against a mocked CGitLister."""
    # Calling the mocked class hands back the mock itself, so calls made on
    # the "instance" can be checked on the same object.
    lister.return_value = lister
    lister.run.return_value = None

    outcome = swh_app.send_task('swh.lister.cgit.tasks.CGitListerTask')
    assert outcome
    outcome.wait()
    assert outcome.successful()

    expected_kwargs = {
        'base_url': 'https://git.savannah.gnu.org/cgit/',
        'instance': 'savannah-gnu',
    }
    lister.assert_called_once_with(**expected_kwargs)
    lister.db_last_index.assert_not_called()
    lister.run.assert_called_once_with()
......@@ -12,7 +12,7 @@ from swh.core.cli import CONTEXT_SETTINGS
logger = logging.getLogger(__name__)
SUPPORTED_LISTERS = ['github', 'gitlab', 'bitbucket', 'debian', 'pypi',
'npm', 'phabricator', 'gnu', 'cran']
'npm', 'phabricator', 'gnu', 'cran', 'cgit']
@click.group(name='lister', context_settings=CONTEXT_SETTINGS)
......@@ -125,6 +125,13 @@ def cli(ctx, db_url, listers, drop_tables):
from .cran.lister import CRANLister
_lister = CRANLister(override_config=override_conf)
elif lister == 'cgit':
from .cgit.models import ModelBase
from .cgit.lister import CGitLister
_lister = CGitLister(
base_url='http://git.savannah.gnu.org/cgit/',
override_config=override_conf)
else:
raise ValueError(
'Invalid lister %s: only supported listers are %s' %
......
......@@ -6,6 +6,7 @@ from swh.scheduler.tests.conftest import * # noqa
def celery_includes():
return [
'swh.lister.bitbucket.tasks',
'swh.lister.cgit.tasks',
'swh.lister.cran.tasks',
'swh.lister.debian.tasks',
'swh.lister.github.tasks',
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment