Skip to content
Snippets Groups Projects
Verified Commit c1221671 authored by Antoine R. Dumont's avatar Antoine R. Dumont
Browse files

Add stagit lister

This lister is very close to the cgit & gitweb implementations, but the DOM is once
again structured differently, so this implementation stands on its own.

Refs. swh/meta#5048
parent 3ab85628
No related branches found
No related tags found
No related merge requests found
Pipeline #3587 failed
Showing
with 846 additions and 0 deletions
......@@ -89,6 +89,7 @@ setup(
lister.pypi=swh.lister.pypi:register
lister.rubygems=swh.lister.rubygems:register
lister.sourceforge=swh.lister.sourceforge:register
lister.stagit=swh.lister.stagit:register
lister.tuleap=swh.lister.tuleap:register
lister.maven=swh.lister.maven:register
""",
......
# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the registration data of the stagit lister.

    The lister class is imported inside the function so that importing this
    package does not import the lister module.

    Returns:
        A dict with the lister class under ``"lister"`` and the list of task
        module names under ``"task_modules"``.
    """
    from .lister import StagitLister

    tasks_module = f"{__name__}.tasks"
    return dict(lister=StagitLister, task_modules=[tasks_module])
# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from datetime import datetime, timezone
import logging
import re
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from requests.exceptions import HTTPError
from swh.lister.pattern import CredentialsType, StatelessLister
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Type of one page of results: a list of dicts, one dict per listed repository.
Repositories = List[Dict[str, Any]]
class StagitLister(StatelessLister[Repositories]):
    """Lister class for Stagit forge instances.

    This lister will retrieve the list of published git repositories by
    parsing the HTML page(s) of the index retrieved at `url`.

    """

    LISTER_NAME = "stagit"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: Optional[str] = None,
        instance: Optional[str] = None,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Lister class for Stagit repositories.

        Args:
            url: (Optional) Root URL of the Stagit instance, i.e. url of the index of
                published git repositories on this instance. Defaults to
                :file:`https://{instance}` if unset.
            instance: Name of stagit instance. Defaults to url's network location
                if unset.

        All other arguments are forwarded unchanged to the parent class.
        """
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials=credentials,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        # NOTE(review): "application/html" is not a registered media type;
        # "text/html" may have been intended -- confirm before changing it.
        self.session.headers.update({"Accept": "application/html"})

    def _get_and_parse(self, url: str) -> BeautifulSoup:
        """Get the given url and parse the retrieved HTML using BeautifulSoup"""
        response = self.http_request(url)
        return BeautifulSoup(response.text, features="html.parser")

    def get_pages(self) -> Iterator[Repositories]:
        """Generate git 'project' URLs found on the current Stagit server.

        Yields:
            A single page: one dict per repository row of the index table, with
            the repository page url under ``"url"`` and the raw text of the row's
            last cell under ``"last_update"``. Nothing is yielded when the page
            holds no index table.
        """
        bs_idx = self._get_and_parse(self.url)

        index_table = bs_idx.find("table", {"id": re.compile("index")})
        if index_table is None:
            # Not a stagit index page (or an unexpected layout): nothing to list.
            logger.warning("No repository index table found on %s", self.url)
            return

        # Avoid a double slash when the configured url ends with one.
        base_url = self.url.rstrip("/")

        page_results = []
        for tr in index_table.find_all("tr"):
            link = tr.find("a")
            href = link.get("href") if link else None
            if not href:
                # header row, or a row without a usable link
                continue

            repo_description_url = base_url + "/" + href

            # This retrieves the date in format "%Y-%m-%d %H:%M"
            tds = tr.find_all("td")
            last_update = tds[-1].text if tds else None

            page_results.append(
                {"url": repo_description_url, "last_update": last_update}
            )

        yield page_results

    def get_origins_from_page(
        self, repositories: Repositories
    ) -> Iterator[ListedOrigin]:
        """Convert a page of stagit repositories into a list of ListedOrigins.

        Repositories for which no clone url can be determined are skipped.
        """
        assert self.lister_obj.id is not None

        for repo in repositories:
            origin_url = self._get_origin_from_repository_url(repo["url"])
            if origin_url is None:
                continue

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="git",
                last_update=_parse_date(repo["last_update"]),
            )

    def _get_origin_from_repository_url(self, repository_url: str) -> Optional[str]:
        """Extract the git url from the repository page.

        Args:
            repository_url: url of the HTML page describing one repository.

        Returns:
            The first "git clone" link of the page whose scheme is https, http
            or git, or None when the page cannot be fetched or holds no such
            link.
        """
        try:
            bs = self._get_and_parse(repository_url)
        except HTTPError as e:
            logger.warning(
                "Unexpected HTTP status code %s on %s",
                e.response.status_code,
                e.response.url,
            )
            return None

        # ``find_all`` matches tag names, not CSS selectors: filter the rows on
        # their "url" class, then walk their cells.
        for row in bs.find_all("tr", class_="url"):
            for cell in row.find_all("td"):
                if not cell.text.startswith("git clone"):
                    continue
                anchor = cell.find("a")
                clone_url = anchor.get("href") if anchor else None
                if clone_url and urlparse(clone_url).scheme in (
                    "https",
                    "http",
                    "git",
                ):
                    return clone_url

        return None
def _parse_date(date: Optional[str]) -> Optional[datetime]:
"""Parse the last update date."""
if not date:
return None
parsed_date = None
try:
parsed_date = datetime.strptime(date, "%Y-%m-%d %H:%M").replace(
tzinfo=timezone.utc
)
except Exception:
logger.warning(
"Could not parse last_update date: %s",
date,
)
return parsed_date
# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Dict
from celery import shared_task
from .lister import StagitLister
@shared_task(name=f"{__name__}.StagitListerTask")
def list_stagit(**lister_args) -> Dict[str, str]:
    """Celery task listing a Stagit instance.

    Args:
        **lister_args: keyword arguments handed over to
            ``StagitLister.from_configfile``.

    Returns:
        The value returned by ``.dict()`` on the result of the lister's run.
    """
    stats = StagitLister.from_configfile(**lister_args).run()
    return stats.dict()
These files are a partial dump of https://codemadness.org/git/.
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Repositories</title>
<link rel="icon" type="image/png" href="favicon.png" />
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>
<table>
<tr><td><img src="logo.png" alt="" width="32" height="32" /></td>
<td><span class="desc">Repositories</span></td></tr><tr><td></td><td>
</td></tr>
</table>
<hr/>
<div id="content">
<table id="index"><thead>
<tr><td><b>Name</b></td><td><b>Description</b></td><td><b>Owner</b></td><td><b>Last commit</b></td></tr></thead><tbody>
<tr><td><a href="bmf/log.html">bmf</a></td><td>bmf (Bayesian Mail Filter) 0.9.4 fork + patches
</td><td></td><td>2020-02-04 22:03</td></tr><tr><td><a href="dmenu/log.html">dmenu</a></td><td>my customized version of dmenu (hiltjo branch)
</td><td>Hiltjo Posthuma</td><td>2022-05-01 16:38</td></tr><tr><td><a href="dwm/log.html">dwm</a></td><td>my customized version of dwm (hiltjo branch)
</td><td>Hiltjo Posthuma</td><td>2023-04-10 10:34</td></tr><tr><td><a href="stagit/log.html">stagit</a></td><td>static git page generator
</td><td>Hiltjo Posthuma</td><td>2020-03-03 23:49</td></tr></tbody>
</td><td>Hiltjo Posthuma</td><td>2021-07-20 13:20</td></tr><tr><td><a href="twitch-go/log.html">twitch-go</a></td><td>twitch.tv web application in Go
</td><td>Hiltjo Posthuma</td><td>2019-05-02 18:14</td></tr><tr><td><a href="webdump/log.html">webdump</a></td><td>Text-based web client/page dump (experiment)
</td><td>Hiltjo Posthuma</td><td>2023-03-20 20:32</td></tr><tr><td><a href="www.codemadness.org/log.html">www.codemadness.org</a></td><td>www.codemadness.org saait content files
</td><td>Hiltjo Posthuma</td><td>2023-05-20 09:50</td></tr><tr><td><a href="xmlparser/log.html">xmlparser</a></td><td>XML parser
</td><td>Hiltjo Posthuma</td><td>2023-05-14 21:59</td></tr><tr><td><a href="xscreenshot/log.html">xscreenshot</a></td><td>screen capture tool
</table>
</div>
</body>
</html>
git_bmf_log.html
\ No newline at end of file
git_dmenu_log.html
\ No newline at end of file
git_dwm_log.html
\ No newline at end of file
git_stagit_log.html
\ No newline at end of file
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Repositories</title>
<link rel="icon" type="image/png" href="favicon.png" />
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>
<table>
<tr><td><img src="logo.png" alt="" width="32" height="32" /></td>
<td><span class="desc">Repositories</span></td></tr><tr><td></td><td>
</td></tr>
</table>
<hr/>
<div id="content">
<table id="index"><thead>
<tr><td><b>Name</b></td><td><b>Description</b></td><td><b>Owner</b></td><td><b>Last commit</b></td></tr></thead><tbody>
<tr><td><a href="bmf/log.html">bmf</a></td><td>bmf (Bayesian Mail Filter) 0.9.4 fork + patches
</td><td></td><td>2020-02-04 22:03</td></tr><tr><td><a href="dmenu/log.html">dmenu</a></td><td>my customized version of dmenu (hiltjo branch)
</td><td>Hiltjo Posthuma</td><td>2022-05-01 16:38</td></tr><tr><td><a href="dwm/log.html">dwm</a></td><td>my customized version of dwm (hiltjo branch)
</td><td>Hiltjo Posthuma</td><td>2023-04-10 10:34</td></tr><tr><td><a href="stagit/log.html">stagit</a></td><td>static git page generator
</td><td>Hiltjo Posthuma</td><td>2020-03-03 23:49</td></tr></tbody>
</table>
</div>
</body>
</html>
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Log - bmf - bmf (Bayesian Mail Filter) 0.9.4 fork + patches
</title>
<link rel="icon" type="image/png" href="favicon.png" />
<link rel="alternate" type="application/atom+xml" title="bmf Atom Feed" href="atom.xml" />
<link rel="alternate" type="application/atom+xml" title="bmf Atom Feed (tags)" href="tags.xml" />
<link rel="stylesheet" type="text/css" href="style.css" />
</head>
<body>
<table><tr><td><a href="../"><img src="logo.png" alt="" width="32" height="32" /></a></td><td><h1>bmf</h1><span class="desc">bmf (Bayesian Mail Filter) 0.9.4 fork + patches
</span></td></tr><tr class="url"><td></td><td>git clone <a href="git://git.codemadness.org/bmf">git://git.codemadness.org/bmf</a></td></tr><tr><td></td><td>
<a href="log.html">Log</a> | <a href="files.html">Files</a> | <a href="refs.html">Refs</a> | <a href="file/README.html">README</a> | <a href="file/LICENSE.html">LICENSE</a></td></tr></table>
<hr/>
<div id="content">
<table id="log"><thead>
<tr><td><b>Date</b></td><td><b>Commit message</b></td><td><b>Author</b></td><td class="num" align="right"><b>Files</b></td><td class="num" align="right"><b>+</b></td><td class="num" align="right"><b>-</b></td></tr>
</thead><tbody>
<tr><td>2020-02-04 22:03</td><td><a href="commit/9372645e9887679999d441e106da7bbc572fb2a6.html">update TODO</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+2</td><td class="num" align="right">-0</td></tr>
<tr><td>2020-02-04 21:59</td><td><a href="commit/8a316864887a48a5fd2867b6bde5d5e3b215e288.html">add a bulk test mode option (-b)</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">3</td><td class="num" align="right">+83</td><td class="num" align="right">-3</td></tr>
<tr><td>2019-01-26 19:10</td><td><a href="commit/da5b33ffd35e25649614ac678df293afcffb3f35.html">README: typo applicatios -&gt; applications</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+2</td><td class="num" align="right">-3</td></tr>
<tr><td>2019-01-26 18:39</td><td><a href="commit/2d06b1eeab72bd1e4715d9191ca2b03cd0ab50de.html">fix -d parameter</a></td><td>Julian Schweinsberg</td><td class="num" align="right">2</td><td class="num" align="right">+2</td><td class="num" align="right">-2</td></tr>
<tr><td>2018-11-09 10:18</td><td><a href="commit/40a406768615f5b89a6ba6e802fed5597c769c1a.html">fix statdump call parameter</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+1</td><td class="num" align="right">-1</td></tr>
<tr><td>2018-11-09 10:17</td><td><a href="commit/21257a01a467925aaf99a6dbb0b7604a58762473.html">fix statdump declaration</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+1</td><td class="num" align="right">-1</td></tr>
<tr><td>2018-11-08 17:12</td><td><a href="commit/e39d60975a228c3d1e5b9512e082fb8bb1c28001.html">statdump: use standard I/O functions for buffering</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+7</td><td class="num" align="right">-17</td></tr>
<tr><td>2018-11-08 17:07</td><td><a href="commit/24fa4a0c3c143c6f36f1ca08b41135156c68f9ff.html">fix uninitialized memory when parsing bogofilter header</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+2</td><td class="num" align="right">-1</td></tr>
<tr><td>2018-10-27 18:05</td><td><a href="commit/b627d86afb6118bb029d5601078fe972d576ab3e.html">function declaration: use the same parameter names</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">4</td><td class="num" align="right">+14</td><td class="num" align="right">-15</td></tr>
<tr><td>2018-10-27 18:02</td><td><a href="commit/57c341a511e88733eedf95a443567f27198247e3.html">set rdonly earlier for unveil, make open() error more clear</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">2</td><td class="num" align="right">+3</td><td class="num" align="right">-3</td></tr>
<tr><td>2018-10-27 17:56</td><td><a href="commit/da144ef21a75e5a1f78c1faf2d76d93c68f6180f.html">fix unveil(2) permissions and path name + misc code fixes</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">6</td><td class="num" align="right">+25</td><td class="num" align="right">-29</td></tr>
<tr><td>2018-10-27 17:33</td><td><a href="commit/8c0e2cad22ac8e72666e90b8069cb0b082e38429.html">dbh_open -&gt; dbtext_db_open</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">2</td><td class="num" align="right">+2</td><td class="num" align="right">-2</td></tr>
<tr><td>2018-10-27 17:31</td><td><a href="commit/60b437c6d0bc19fc9f67ca8cfaf6cbfc50d47423.html">merge dbh and dbtext (WIP)</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">7</td><td class="num" align="right">+511</td><td class="num" align="right">-566</td></tr>
<tr><td>2018-10-27 17:14</td><td><a href="commit/4c3c79f49125ef555fba1df7f6cbab2c7b26ea00.html">initial unveil(2) support + some code-cleanup and remove unused functions</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">7</td><td class="num" align="right">+35</td><td class="num" align="right">-35</td></tr>
<tr><td>2018-10-27 17:13</td><td><a href="commit/ea2535f01b1fb73863f7104b0e21719b577620c1.html">whoops, fix regression in opening &quot;database&quot;</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+1</td><td class="num" align="right">-1</td></tr>
<tr><td>2018-10-27 16:37</td><td><a href="commit/f5e56cc70c117352ec5b7a7984065eaa65db162f.html">many improvements</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">17</td><td class="num" align="right">+144</td><td class="num" align="right">-610</td></tr>
<tr><td>2018-10-25 10:41</td><td><a href="commit/20a0f52d5b478e240450fd72fa3bbd3ab5c58c48.html">fix some undefined behaviour with ctype functions</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+16</td><td class="num" align="right">-15</td></tr>
<tr><td>2018-09-29 11:15</td><td><a href="commit/f368a24da9457e4d269ca281bbc07f0eef08751e.html">improve Makefile</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+77</td><td class="num" align="right">-28</td></tr>
<tr><td>2018-09-29 11:14</td><td><a href="commit/486c23d144116c1794e3800c0c0e051b2f3469e3.html">define PACKAGE macro in C file</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+2</td><td class="num" align="right">-0</td></tr>
<tr><td>2018-09-29 10:52</td><td><a href="commit/ed1d073e8b0ab5ad0745d7d9a75ee978b460659a.html">remove unused code, leftover -f flag documentation</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">7</td><td class="num" align="right">+2</td><td class="num" align="right">-91</td></tr>
<tr><td>2018-09-29 10:49</td><td><a href="commit/8c2b855490c62d2b8f20a0dcbb85ed1dc7686155.html">dbg: simplify verbose function</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+3</td><td class="num" align="right">-8</td></tr>
<tr><td>2018-09-23 12:39</td><td><a href="commit/1e2885e37b75f738445d13b6a61caf9786b28fea.html">config.h: disable pledge for non-OpenBSD</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+5</td><td class="num" align="right">-0</td></tr>
<tr><td>2018-09-23 12:37</td><td><a href="commit/c8fb28f55c568c95be709803ef153199f1557035.html">Makefile: order dependencies, remove unneeded [ -d ] check</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+3</td><td class="num" align="right">-3</td></tr>
<tr><td>2018-09-23 12:36</td><td><a href="commit/0f11e5e148314939e59850ef2aaa607f2b06bc90.html">improve code-style</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">7</td><td class="num" align="right">+1531</td><td class="num" align="right">-1663</td></tr>
<tr><td>2018-09-23 12:29</td><td><a href="commit/cd31f403d6c7b3acf4a41365c063c4cefef34e83.html">bmf.c: improve some code-style</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+250</td><td class="num" align="right">-282</td></tr>
<tr><td>2018-09-23 12:19</td><td><a href="commit/d4c3810c7f1e6030166288e0e30224c17dfd5ba5.html">update TODO</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+9</td><td class="num" align="right">-6</td></tr>
<tr><td>2018-09-22 18:05</td><td><a href="commit/79f641da0818fd7a43a970ad425e16300f4e6572.html">remove -i and -f from usage</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+0</td><td class="num" align="right">-2</td></tr>
<tr><td>2018-09-22 16:51</td><td><a href="commit/46b7439476354ab85b37689f5bec97b0231b251a.html">Makefile: install in /usr/local</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+2</td><td class="num" align="right">-4</td></tr>
<tr><td>2018-09-22 16:49</td><td><a href="commit/41e1e6b8e9d49a1300c070f03070db4ce2d0bbbd.html">pledge test mode, no need to reopen goodlist and spamlist for each message in test mode</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">2</td><td class="num" align="right">+52</td><td class="num" align="right">-16</td></tr>
<tr><td>2018-09-22 16:27</td><td><a href="commit/bd5dd52e982fcb2c07eddb303e585a6e9b738508.html">tweak pledge</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+2</td><td class="num" align="right">-2</td></tr>
<tr><td>2018-09-22 16:27</td><td><a href="commit/30cf7b5fcb8028582d3c746e9c72289be77c3f87.html">remove code leftover from -i option</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">1</td><td class="num" align="right">+0</td><td class="num" align="right">-17</td></tr>
<tr><td>2018-09-22 16:12</td><td><a href="commit/13b02490de8ddfe9a9ad66cc2484f7fd3a3b9278.html">remove NDEBUG code</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">3</td><td class="num" align="right">+2</td><td class="num" align="right">-281</td></tr>
<tr><td>2018-09-22 16:05</td><td><a href="commit/107ae911553ca8a5885eecaa2da0c37e030c216d.html">small cleanup, remove DB_USER, DB_PASS</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">2</td><td class="num" align="right">+7</td><td class="num" align="right">-10</td></tr>
<tr><td>2018-09-22 15:57</td><td><a href="commit/e805a804b42a190f0b06d8c495fd0f29011b1329.html">changes</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">15</td><td class="num" align="right">+60</td><td class="num" align="right">-2004</td></tr>
<tr><td>2018-09-22 15:49</td><td><a href="commit/4857ceba2cfedeafd8971a8e6e3db4ce2ea7f1b6.html">add patches from OpenBSD port</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">2</td><td class="num" align="right">+20</td><td class="num" align="right">-1</td></tr>
<tr><td>2018-09-22 15:46</td><td><a href="commit/0983b0f64c3e1bf7fa03f2a4060e6f25e9e79cef.html">import bmf 0.9.4</a></td><td>Hiltjo Posthuma</td><td class="num" align="right">31</td><td class="num" align="right">+5898</td><td class="num" align="right">-0</td></tr>
</tbody></table></div>
</body>
</html>
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
# Copyright (C) 2023 The Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import os
from typing import List
import pytest
from swh.lister import __version__
from swh.lister.pattern import ListerStats
from swh.lister.stagit.lister import StagitLister, _parse_date
# Network location of the stagit instance these tests run against.
MAIN_INSTANCE = "codemadness.org"
# Url of the repository index of that instance, given as `url` to the lister.
MAIN_INSTANCE_URL = f"https://{MAIN_INSTANCE}/git"
def test_lister_stagit_instantiate(swh_scheduler):
    """Build a lister with either an url or an instance is supported."""
    url = MAIN_INSTANCE_URL
    lister = StagitLister(swh_scheduler, url=url)
    assert lister is not None
    assert lister.url == url

    # Check the lister built from the instance name itself, not the one above.
    lister_from_instance = StagitLister(swh_scheduler, instance=MAIN_INSTANCE)
    assert lister_from_instance is not None
    # The StagitLister docstring states that url defaults to https://{instance}.
    assert lister_from_instance.url == f"https://{MAIN_INSTANCE}"
def test_lister_stagit_fail_to_instantiate(swh_scheduler):
    """A lister given neither an url nor an instance must refuse to build."""
    expected_message = "'url' or 'instance'"

    # Only the scheduler is provided: a ValueError naming both options is due.
    with pytest.raises(ValueError, match=expected_message):
        StagitLister(swh_scheduler)
def test_lister_stagit_get_pages(requests_mock_datadir, swh_scheduler):
    """Check the repositories scraped from the index page during a listing."""
    url = MAIN_INSTANCE_URL
    lister_stagit = StagitLister(swh_scheduler, url=url)

    expected_nb_origins = 4

    # Each page is a list of dicts (one per repository), not a list of strings.
    pages: List[List[dict]] = list(lister_stagit.get_pages())
    flattened_repos = [repo for page in pages for repo in page]
    assert len(flattened_repos) == expected_nb_origins

    for repo in flattened_repos:
        assert MAIN_INSTANCE in repo["url"]
def test_lister_stagit_run(requests_mock_datadir, swh_scheduler):
    """Stagit lister nominal listing case."""
    lister_stagit = StagitLister(swh_scheduler, url=MAIN_INSTANCE_URL)
    expected_nb_origins = 4

    stats = lister_stagit.run()
    assert stats == ListerStats(pages=1, origins=expected_nb_origins)

    # What the run recorded in the scheduler
    recorded = swh_scheduler.get_listed_origins(lister_stagit.lister_obj.id).results
    assert len(recorded) == expected_nb_origins

    # Every recorded origin is a dated git origin of the instance
    for origin in recorded:
        assert origin.visit_type == "git"
        assert MAIN_INSTANCE in origin.url
        assert origin.last_update is not None

    # Every request sent carries the lister's user agent
    for request in requests_mock_datadir.request_history:
        headers = request.headers
        assert "User-Agent" in headers
        user_agent = headers["User-Agent"]
        assert "Software Heritage stagit lister" in user_agent
        assert __version__ in user_agent
def test_lister_stagit_get_pages_with_pages_and_retry(
    requests_mock_datadir, requests_mock, datadir, mocker, swh_scheduler
):
    """Rate limited page are tested back after some time so ingestion can proceed."""
    url = MAIN_INSTANCE_URL

    # Read the index fixture up front so the file is closed before listing.
    index_path = os.path.join(datadir, f"https_{MAIN_INSTANCE}/git")
    with open(index_path, "rb") as page:
        index_content = page.read()

    # Two rate-limited answers, then the actual index page.
    requests_mock.get(
        url,
        [
            {"content": None, "status_code": 429},
            {"content": None, "status_code": 429},
            {"content": index_content, "status_code": 200},
        ],
    )

    lister_stagit = StagitLister(swh_scheduler, url=url)

    # Do not actually wait between the retries.
    mocker.patch.object(lister_stagit.http_request.retry, "sleep")

    # Each page is a list of dicts (one per repository), not a list of strings.
    pages: List[List[dict]] = list(lister_stagit.get_pages())
    flattened_repos = [repo for page in pages for repo in page]
    assert len(pages) == 1
    assert len(flattened_repos) == 4
def test_lister_stagit_get_origin_from_repo_failing(
    swh_scheduler, requests_mock_datadir
):
    """Repositories whose page yields no origin are left out of the listing."""
    # NOTE(review): the fixture served for this url is not visible from this
    # module; presumably it lists more repositories than the 4 whose page can
    # be fetched -- verify against the data directory.
    instance_url = f"https://{MAIN_INSTANCE}/foobar"
    lister_stagit = StagitLister(swh_scheduler, url=instance_url)

    expected_stats = ListerStats(pages=1, origins=4)
    assert lister_stagit.run() == expected_stats
def test__parse_date():
    """Missing or unparsable dates give None, well-formed ones an aware date."""
    for unparsable in (None, "No commits"):
        assert _parse_date(unparsable) is None

    parsed = _parse_date("2022-08-26 12:48")
    assert parsed is not None
    assert parsed.tzinfo is not None
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_stagit_lister_task(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """The stagit celery task builds the lister from its kwargs and runs it."""
    # setup the mocked StagitLister
    lister = mocker.patch("swh.lister.stagit.tasks.StagitLister")
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    # Only arguments that StagitLister actually accepts.
    kwargs = dict(
        url="https://git.codemadness.org", instance="codemadness", max_pages=1
    )

    res = swh_scheduler_celery_app.send_task(
        "swh.lister.stagit.tasks.StagitListerTask",
        kwargs=kwargs,
    )
    assert res
    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()
......@@ -48,6 +48,9 @@ lister_args = {
"gitiles": {
"instance": "gerrit.googlesource.com",
},
"stagit": {
"url": "https://git.codemadness.org",
},
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment