Skip to content
Snippets Groups Projects
Commit 065b3f81 authored by Franck Bret's avatar Franck Bret
Browse files

Hackage: Implement incremental mode

Use http api lastUpload argument in search query to retrieve new or
updated origins since last run

Related to T4597
parent 6ad61aec
No related branches found
No related tags found
1 merge request!325Hackage: Implement incremental mode
......@@ -20,7 +20,7 @@ Origins retrieving strategy
---------------------------
To get a list of all package names we make a POST call to
`https://hackage.haskell.org/packages/search` endpoint with some params given as
``https://hackage.haskell.org/packages/search`` endpoint with some params given as
json data.
Default params::
......@@ -35,6 +35,10 @@ Default params::
The page size is 50. The lister will make as many http api calls as needed to get
all results.
For incremental mode we expand the search query with ``lastUpload`` greater than
``state.last_listing_date``; the api will then return all new or updated package names
since the last run.
Page listing
------------
......@@ -60,7 +64,7 @@ Origins from page
-----------------
The lister yields 50 origins url per page.
Each ListedOrigin has a `last_update` date set.
Each ListedOrigin has a ``last_update`` date set.
Running tests
-------------
......
......@@ -3,6 +3,8 @@
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass
from datetime import datetime, timezone
import logging
from typing import Any, Dict, Iterator, List, Optional
......@@ -11,7 +13,7 @@ import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
from ..pattern import CredentialsType, Lister
logger = logging.getLogger(__name__)
......@@ -19,7 +21,15 @@ logger = logging.getLogger(__name__)
HackageListerPage = List[Dict[str, Any]]
class HackageLister(StatelessLister[HackageListerPage]):
@dataclass
class HackageListerState:
    """Store lister state for incremental mode operations."""

    # None until a first successful listing has completed; when set, the next
    # run restricts the search query to packages uploaded since this date.
    last_listing_date: Optional[datetime] = None
    """Last date when Hackage lister was executed"""
class HackageLister(Lister[HackageListerState, HackageListerPage]):
"""List Hackage (The Haskell Package Repository) origins."""
LISTER_NAME = "hackage"
......@@ -45,6 +55,20 @@ class HackageLister(StatelessLister[HackageListerPage]):
# Ensure to set this with same value as the http api search endpoint use
# (50 as of august 2022)
self.page_size: int = 50
self.listing_date = datetime.now().astimezone(tz=timezone.utc)
def state_from_dict(self, d: Dict[str, Any]) -> HackageListerState:
    """Deserialize the scheduler-stored state dict into a HackageListerState.

    The serialized ``last_listing_date`` (an iso8601 string), if present and
    not None, is parsed back into an aware ``datetime``.
    """
    # Work on a shallow copy so the caller's dict is not mutated in place.
    d = dict(d)
    last_listing_date = d.get("last_listing_date")
    if last_listing_date is not None:
        d["last_listing_date"] = iso8601.parse_date(last_listing_date)
    return HackageListerState(**d)
def state_to_dict(self, state: HackageListerState) -> Dict[str, Any]:
    """Serialize the lister state for storage in the scheduler backend.

    ``last_listing_date`` is rendered as an iso8601 string, or kept as
    ``None`` when the lister has never completed a run.
    """
    recorded = state.last_listing_date
    return {
        "last_listing_date": recorded.isoformat() if recorded is not None else None
    }
def get_pages(self) -> Iterator[HackageListerPage]:
    """Yield an iterator which returns 'page'

    Makes a POST request to the package search endpoint; results are
    paginated (``self.page_size`` entries per page).
    """
    # Base search query: list every package, deprecated or not.
    sq = "(deprecated:any)"
    if self.state.last_listing_date:
        last_str = (
            self.state.last_listing_date.astimezone(tz=timezone.utc)
            .date()
            .isoformat()
        )
        # Incremental mode search query. NOTE(review): the filter is at day
        # granularity, so packages updated earlier on the same day may be
        # listed again on the next run — presumably harmless, confirm.
        sq += "(lastUpload >= %s)" % last_str
    params = {
        "page": 0,
        "sortColumn": "default",
        "sortDirection": "ascending",
        "searchQuery": sq,
    }
    # Loop-invariant: same endpoint for every page request.
    search_url = self.PACKAGE_NAMES_URL_PATTERN.format(base_url=self.url)
    data = self.http_request(
        url=search_url,
        method="POST",
        json=params,
    ).json()
    if data.get("pageContents"):
        # Compute the number of pages from the total result count, rounding
        # up when there is a partial last page.
        nb_entries: int = data["numberOfResults"]
        (nb_pages, remainder) = divmod(nb_entries, self.page_size)
        if remainder:
            nb_pages += 1
        # First page
        yield data["pageContents"]
        # Next pages
        for page in range(1, nb_pages):
            params["page"] = page
            data = self.http_request(
                url=search_url,
                method="POST",
                json=params,
            ).json()
            yield data["pageContents"]
def get_origins_from_page(self, page: HackageListerPage) -> Iterator[ListedOrigin]:
"""Iterate on all pages and yield ListedOrigin instances."""
......@@ -92,9 +131,14 @@ class HackageLister(StatelessLister[HackageListerPage]):
url = self.PACKAGE_INFO_URL_PATTERN.format(
base_url=self.url, pkgname=pkgname
)
yield ListedOrigin(
lister_id=self.lister_obj.id,
visit_type=self.VISIT_TYPE,
url=url,
last_update=last_update,
)
def finalize(self) -> None:
    """Persist the incremental-mode cursor after a run.

    NOTE(review): presumably invoked by the base ``Lister`` once a run
    completes — confirm against the pattern module.
    """
    # Record the date captured when the lister was instantiated (not 'now'),
    # so packages updated while the listing was in progress are not skipped
    # by the next incremental run.
    self.state.last_listing_date = self.listing_date
    # Setting this flag marks the state as dirty so it gets written back to
    # the scheduler.
    self.updated = True
{"numberOfResults":3,"pageContents":[{"description":"Translations of classic Truth Maintenance Systems","downloads":14,"lastUpload":"2022-09-13T19:21:15.533437837Z","maintainers":[{"display":"jpmrst","uri":"/user/jpmrst"}],"name":{"display":"BPS","uri":"/package/BPS"},"tags":[{"display":"gpl","uri":"/packages/tag/gpl"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"truth-maintenance","uri":"/packages/tag/truth-maintenance"}],"votes":0},{"description":"C-Structs implementation for Haskell","downloads":25,"lastUpload":"2022-09-30T08:00:34.348551203Z","maintainers":[{"display":"SimonPlakolb","uri":"/user/SimonPlakolb"}],"name":{"display":"C-structs","uri":"/package/C-structs"},"tags":[{"display":"c","uri":"/packages/tag/c"},{"display":"data","uri":"/packages/tag/data"},{"display":"foreign","uri":"/packages/tag/foreign"},{"display":"library","uri":"/packages/tag/library"},{"display":"mit","uri":"/packages/tag/mit"},{"display":"structures","uri":"/packages/tag/structures"}],"votes":2},{"description":"Cluster algorithms, PCA, and chemical conformere analysis","downloads":29,"lastUpload":"2022-09-28T11:54:25.8011197Z","maintainers":[{"display":"phillipseeber","uri":"/user/phillipseeber"}],"name":{"display":"ConClusion","uri":"/package/ConClusion"},"tags":[{"display":"agpl","uri":"/packages/tag/agpl"},{"display":"chemistry","uri":"/packages/tag/chemistry"},{"display":"library","uri":"/packages/tag/library"},{"display":"program","uri":"/packages/tag/program"},{"display":"statistics","uri":"/packages/tag/statistics"}],"votes":2}]}
{"numberOfResults":0,"pageContents":[]}
......@@ -8,25 +8,31 @@ import json
from pathlib import Path
from urllib.parse import unquote, urlparse
from swh.lister.hackage.lister import HackageLister
import iso8601
from swh.lister.hackage.lister import HackageLister, HackageListerState
def json_callback(request, context, datadir, visit=0):
    """Callback for requests_mock that loads a json file regarding a page number.

    Args:
        request: the intercepted request (provides ``url`` and the posted
            json body, from which the requested ``page`` number is read).
        context: requests_mock response context (unused).
        datadir: base directory holding the mocked response files.
        visit: when > 0, load the fixture variant suffixed ``_visit<N>``
            instead, to simulate successive lister runs over evolving data.

    Returns:
        The parsed json content of the matching fixture file.
    """
    unquoted_url = unquote(request.url)
    url = urlparse(unquoted_url)
    page = request.json()["page"]
    # Fixture layout: <datadir>/<scheme>_<hostname>/<path with '/'->'_'>_<page>
    dirname = "%s_%s" % (url.scheme, url.hostname)
    filename = url.path[1:]
    if filename.endswith("/"):
        filename = filename[:-1]
    filename = filename.replace("/", "_")
    filepath = Path(datadir, dirname, f"{filename}_{page}")
    if visit > 0:
        filepath = filepath.parent / f"{filepath.stem}_visit{visit}"
    return json.loads(filepath.read_text())
def test_hackage_lister(swh_scheduler, requests_mock, datadir):
"""Assert a full listing of 3 pages of 50 origins"""
requests_mock.post(
url="https://hackage.haskell.org/packages/search",
......@@ -74,6 +80,10 @@ def test_hackage_lister(swh_scheduler, requests_mock, datadir):
def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
"""Test Pagination
Page size is 50, lister returns 1 page when origins < page size
"""
requests_mock.post(
url="https://fake49.haskell.org/packages/search",
status_code=200,
......@@ -87,6 +97,10 @@ def test_hackage_lister_pagination_49(swh_scheduler, requests_mock, datadir):
def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
"""Test Pagination
Page size is 50, lister returns 2 page when origins > page size
"""
requests_mock.post(
url="https://fake51.haskell.org/packages/search",
status_code=200,
......@@ -98,3 +112,86 @@ def test_hackage_lister_pagination_51(swh_scheduler, requests_mock, datadir):
assert len(pages) == 2
assert len(pages[0]) == 50
assert len(pages[1]) == 1
def test_hackage_lister_incremental(swh_scheduler, requests_mock, datadir):
    """Test incremental lister

    * First run, full listing, 3 pages, 150 origins
    * Second run, 1 page, 3 new or updated origins
    * Third run, nothing new, 0 page, 0 origins
    """
    mock_url = "https://hackage.haskell.org/packages/search"

    # first run
    requests_mock.post(
        url=mock_url,
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir),
    )
    lister = HackageLister(scheduler=swh_scheduler)
    # Force a past last_listing_date (instead of 'now') and check it round-trips
    # through the scheduler state storage.
    lister.state.last_listing_date = iso8601.parse_date("2022-08-26T02:27:45.073759Z")
    lister.set_state_in_scheduler()
    assert lister.get_state_from_scheduler() == HackageListerState(
        last_listing_date=iso8601.parse_date("2022-08-26T02:27:45.073759Z")
    )

    first = lister.run()
    assert first.pages == 3
    assert first.origins == 3 * 50
    # 3 http requests done, each carrying the incremental search query built
    # from the forced state date (day granularity).
    assert len(requests_mock.request_history) == 3
    for rh in requests_mock.request_history:
        assert rh.json()["searchQuery"] == "(deprecated:any)(lastUpload >= 2022-08-26)"

    # second run: the visit=1 fixtures return a single page of 3 entries
    requests_mock.post(
        url=mock_url,
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir, visit=1),
    )
    lister = HackageLister(scheduler=swh_scheduler)
    # force lister.last_listing_date to not being 'now'
    lister.state.last_listing_date = iso8601.parse_date(
        "2022-09-30T08:00:34.348551203Z"
    )
    lister.set_state_in_scheduler()
    assert lister.get_state_from_scheduler() == HackageListerState(
        last_listing_date=iso8601.parse_date("2022-09-30T08:00:34.348551203Z")
    )

    second = lister.run()
    assert second.pages == 1
    assert second.origins == 3
    assert len(requests_mock.request_history) == 3 + 1
    # Check the first three requests: unchanged, same as the first run
    for i in range(3):
        assert (
            requests_mock.request_history[i].json()["searchQuery"]
            == "(deprecated:any)(lastUpload >= 2022-08-26)"
        )
    # Check the last request: lastUpload must reflect the second run's state date
    assert (
        requests_mock.last_request.json()["searchQuery"]
        == "(deprecated:any)(lastUpload >= 2022-09-30)"
    )

    # third run (no update since last run, no new or updated origins but one http
    # request with no results)
    requests_mock.post(
        url=mock_url,
        status_code=200,
        json=functools.partial(json_callback, datadir=datadir, visit=2),
    )
    lister = HackageLister(scheduler=swh_scheduler)
    third = lister.run()
    assert third.pages == 0
    assert third.origins == 0
    # finalize() still ran: the state date was advanced to this run's date
    assert lister.get_state_from_scheduler() == HackageListerState(
        last_listing_date=lister.state.last_listing_date
    )
    assert len(requests_mock.request_history) == 3 + 1 + 1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment