Skip to content
Snippets Groups Projects
Commit 6696a842 authored by Franck Bret's avatar Franck Bret
Browse files

Hackage: List origins from hackage.haskell.org, The Haskell Package Repository

Use http api point to get package names and build origin urls.
parent 8ff418fb
No related branches found
Tags v0.2.3
No related merge requests found
Showing
with 358 additions and 0 deletions
......@@ -69,6 +69,7 @@ setup(
lister.gitlab=swh.lister.gitlab:register
lister.gnu=swh.lister.gnu:register
lister.golang=swh.lister.golang:register
lister.hackage=swh.lister.hackage:register
lister.launchpad=swh.lister.launchpad:register
lister.npm=swh.lister.npm:register
lister.opam=swh.lister.opam:register
......
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Hackage lister
==============
The Hackage lister lists origins from `hackage.haskell.org`_, the `Haskell`_ Package
Repository.
The registry provides an `http api`_ from which the lister retrieves package names
and builds origin urls.
As of August 2022 `hackage.haskell.org`_ lists 15536 package names.
Origins retrieving strategy
---------------------------
To get a list of all package names we make a POST call to the
`https://hackage.haskell.org/packages/search` endpoint with some params given as
json data.
Default params::
{
"page": 0,
"sortColumn": "default",
"sortDirection": "ascending",
"searchQuery": "(deprecated:any)",
}
The page size is 50. The lister will make as many http api calls as needed to get
all results.
Page listing
------------
The result is paginated, each page is 50 records long.
Entry data set example::
{
"description": "3D model parsers",
"downloads": 6,
"lastUpload": "2014-11-08T03:55:23.879047Z",
"maintainers": [{"display": "capsjac", "uri": "/user/capsjac"}],
"name": {"display": "3dmodels", "uri": "/package/3dmodels"},
"tags": [
{"display": "graphics", "uri": "/packages/tag/graphics"},
{"display": "lgpl", "uri": "/packages/tag/lgpl"},
{"display": "library", "uri": "/packages/tag/library"},
],
"votes": 1.5,
}
Origins from page
-----------------
The lister yields up to 50 origin urls per page.
Each ListedOrigin has a `last_update` date set.
Running tests
-------------
Activate the virtualenv and run from within swh-lister directory::
pytest -s -vv --log-cli-level=DEBUG swh/lister/hackage/tests
Testing with Docker
-------------------
Change directory to swh/docker then launch the docker environment::
docker compose up -d
Then schedule a Hackage listing task::
docker compose exec swh-scheduler swh scheduler task add -p oneshot list-hackage
You can follow lister execution by displaying logs of swh-lister service::
docker compose logs -f swh-lister
.. _hackage.haskell.org: https://hackage.haskell.org/
.. _Haskell: https://haskell.org/
.. _http api: https://hackage.haskell.org/api
"""
def register():
    """Describe the Hackage lister plugin.

    Returns:
        A dict with the lister class under ``"lister"`` and the name of this
        package's ``tasks`` module under ``"task_modules"``.
    """
    # Imported lazily so that importing the package does not pull in the lister.
    from .lister import HackageLister

    tasks_module = f"{__name__}.tasks"
    return {"lister": HackageLister, "task_modules": [tasks_module]}
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List, Optional
import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
HackageListerPage = List[Dict[str, Any]]
class HackageLister(StatelessLister[HackageListerPage]):
    """List Hackage (The Haskell Package Repository) origins.

    Package entries are fetched page by page from the ``packages/search``
    endpoint of the instance, and one ``ListedOrigin`` is built per entry.
    """

    LISTER_NAME = "hackage"
    VISIT_TYPE = "hackage"
    INSTANCE = "hackage"

    BASE_URL = "https://hackage.haskell.org/"
    # Both patterns append a path right after ``base_url``, so ``base_url``
    # must end with a slash.
    PACKAGE_NAMES_URL_PATTERN = "{base_url}packages/search"
    PACKAGE_INFO_URL_PATTERN = "{base_url}package/{pkgname}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        url: Optional[str] = None,
    ):
        """Set up the lister.

        Args:
            scheduler: scheduler interface forwarded to the base lister.
            credentials: optional credentials forwarded to the base lister.
            url: base url of the Hackage instance; defaults to ``BASE_URL``.
                A missing trailing slash is added.
        """
        base_url = url if url else self.BASE_URL
        # Without the trailing slash the URL patterns above would produce
        # malformed urls such as "https://example.orgpackages/search".
        if not base_url.endswith("/"):
            base_url += "/"

        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=base_url,
        )
        # Ensure to set this with same value as the http api search endpoint use
        # (50 as of august 2022)
        self.page_size: int = 50

    def _fetch_page(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """POST ``params`` as json to the search endpoint and return the decoded
        json response."""
        return self.http_request(
            url=self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url),
            method="POST",
            json=params,
        ).json()

    def get_pages(self) -> Iterator[HackageListerPage]:
        """Yield the ``pageContents`` list of every result page.

        It uses the http api endpoint ``packages/search`` of the instance to
        get package entries from which origin urls are built.

        The first response provides ``numberOfResults``; together with
        ``self.page_size`` it determines how many further pages are requested.
        """
        params: Dict[str, Any] = {
            "page": 0,
            "sortColumn": "default",
            "sortDirection": "ascending",
            "searchQuery": "(deprecated:any)",
        }

        data = self._fetch_page(params)

        # Round the page count up so a trailing partial page is fetched too.
        nb_entries: int = data["numberOfResults"]
        nb_pages, remainder = divmod(nb_entries, self.page_size)
        if remainder:
            nb_pages += 1

        yield data["pageContents"]

        # Page 0 was already yielded above.
        for page in range(1, nb_pages):
            params["page"] = page
            yield self._fetch_page(params)["pageContents"]

    def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
        """Yield one ListedOrigin per entry of ``page``.

        The origin url is built from the entry's ``name.display`` value and
        ``last_update`` is parsed from its ``lastUpload`` value.
        """
        assert self.lister_obj.id is not None

        for entry in page:
            pkgname = entry["name"]["display"]
            last_update = iso8601.parse_date(entry["lastUpload"])
            url = self.PACKAGE_INFO_URL_PATTERN.format(
                base_url=self.url, pkgname=pkgname
            )
            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=url,
                last_update=last_update,
            )
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from swh.lister.hackage.lister import HackageLister
@shared_task(name=__name__ + ".HackageListerTask")
def list_hackage(**lister_args):
    """Lister task for Hackage, the Haskell Package Repository.

    Builds the lister from its configuration file with ``lister_args``, runs
    it and returns the run result converted with ``.dict()``.
    """
    lister = HackageLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
@shared_task(name=__name__ + ".ping")
def _ping():
    """Trivial task that always answers "OK"."""
    return "OK"
This diff is collapsed.
This diff is collapsed.
{"numberOfResults":51,"pageContents":[{"description":"Command-line program for type-checking and compiling Agda programs","downloads":20,"lastUpload":"2012-03-12T11:01:45Z","maintainers":[{"display":"NilsAndersDanielsson","uri":"/user/NilsAndersDanielsson"},{"display":"UlfNorell","uri":"/user/UlfNorell"}],"name":{"display":"Agda-executable","uri":"/package/Agda-executable"},"tags":[{"display":"dependent-types","uri":"/packages/tag/dependent-types"},{"display":"deprecated","uri":"/packages/tag/deprecated"},{"display":"program","uri":"/packages/tag/program"}],"votes":0}]}
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import functools
import json
from pathlib import Path
from urllib.parse import unquote, urlparse
from swh.lister.hackage.lister import HackageLister
def json_callback(request, context, datadir):
    """Callback for requests_mock that loads the json fixture matching the
    page number found in the request body.

    The fixture is read from ``<datadir>/<scheme>_<hostname>/<path>_<page>``
    where ``<path>`` is the url path without its leading slash and without
    one trailing slash, the remaining ``/`` being replaced by ``_``.
    """
    page_number = request.json()["page"]
    parsed = urlparse(unquote(request.url))

    fixture_dir = f"{parsed.scheme}_{parsed.hostname}"

    relative_path = parsed.path[1:]
    if relative_path.endswith("/"):
        relative_path = relative_path[:-1]
    fixture_name = relative_path.replace("/", "_") + f"_{page_number}"

    fixture_text = Path(datadir, fixture_dir, fixture_name).read_text()
    return json.loads(fixture_text)
def test_hackage_lister(swh_scheduler, requests_mock, datadir):
    """Run the lister against the three mocked result pages and compare the
    recorded origins with the urls derived from the fixture files."""
    requests_mock.post(
        url="https://hackage.haskell.org/packages/search",
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir),
    )

    # Build the expected origin urls straight from the fixture files.
    expected_urls = []
    for page_number in [0, 1, 2]:
        fixture = Path(
            datadir, "https_hackage.haskell.org", f"packages_search_{page_number}"
        )
        contents = json.loads(fixture.read_text())["pageContents"]
        for entry in contents:
            pkgname = entry["name"]["display"]
            expected_urls.append(f"https://hackage.haskell.org/package/{pkgname}")

    lister = HackageLister(scheduler=swh_scheduler)
    stats = lister.run()

    assert stats.pages == 3
    assert stats.origins == stats.pages * 50

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(listed) == len(expected_urls)

    actual = {(origin.visit_type, origin.url) for origin in listed}
    expected = {("hackage", url) for url in expected_urls}
    assert actual == expected
def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
    """The fake49 instance must be listed as a single page of 49 entries."""
    requests_mock.post(
        url="https://fake49.haskell.org/packages/search",
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir),
    )

    lister = HackageLister(scheduler=swh_scheduler, url="https://fake49.haskell.org/")
    page_sizes = [len(page) for page in lister.get_pages()]

    # there should be 1 page with 49 entries
    assert page_sizes == [49]
def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
    """The fake51 instance must be listed as a full page plus a one-entry page."""
    requests_mock.post(
        url="https://fake51.haskell.org/packages/search",
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir),
    )

    lister = HackageLister(scheduler=swh_scheduler, url="https://fake51.haskell.org/")
    page_sizes = [len(page) for page in lister.get_pages()]

    # there should be 2 pages with 50 + 1 entries
    assert page_sizes == [50, 1]
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_hackage_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task must complete successfully and answer "OK"."""
    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.hackage.tasks.ping"
    )
    assert async_result

    async_result.wait()
    assert async_result.successful()
    assert async_result.result == "OK"
def test_hackage_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task must build the lister from its config file, run it once
    and return the run statistics as a dict."""
    expected_stats = ListerStats(pages=42, origins=42)

    # setup the mocked HackageLister
    mocked_lister = mocker.patch("swh.lister.hackage.tasks.HackageLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    async_result = swh_scheduler_celery_app.send_task(
        "swh.lister.hackage.tasks.HackageListerTask"
    )
    assert async_result

    async_result.wait()
    assert async_result.successful()
    assert async_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment