diff --git a/README.md b/README.md index 5432c2bf3baedf561e0979ee8ea5d59694e8757a..e0332a292a542d3065e14184afdc8121032e7cc9 100644 --- a/README.md +++ b/README.md @@ -13,11 +13,14 @@ following Python modules: - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` +- `swh.lister.fedora` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` - `swh.lister.gnu` +- `swh.lister.gogs` - `swh.lister.golang` +- `swh.lister.hex` - `swh.lister.launchpad` - `swh.lister.maven` - `swh.lister.npm` @@ -25,9 +28,6 @@ following Python modules: - `swh.lister.phabricator` - `swh.lister.pypi` - `swh.lister.tuleap` -- `swh.lister.gogs` -- `swh.liser.fedora` -- `swh.lister.hex` Dependencies ------------ diff --git a/setup.py b/setup.py index ac5d5fb045524edb25ddf2c6695dfda845b699e9..90c75ebcea427cb907910a03bff367376fbca72f 100755 --- a/setup.py +++ b/setup.py @@ -65,12 +65,15 @@ setup( lister.cran=swh.lister.cran:register lister.crates=swh.lister.crates:register lister.debian=swh.lister.debian:register + lister.fedora=swh.lister.fedora:register lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register lister.gitlab=swh.lister.gitlab:register lister.gnu=swh.lister.gnu:register lister.golang=swh.lister.golang:register + lister.gogs=swh.lister.gogs:register lister.hackage=swh.lister.hackage:register + lister.hex=swh.lister.hex:register lister.launchpad=swh.lister.launchpad:register lister.nixguix=swh.lister.nixguix:register lister.npm=swh.lister.npm:register @@ -85,9 +88,6 @@ setup( lister.sourceforge=swh.lister.sourceforge:register lister.tuleap=swh.lister.tuleap:register lister.maven=swh.lister.maven:register - lister.gogs=swh.lister.gogs:register - lister.fedora=swh.lister.fedora:register - lister.hex=swh.lister.hex:register """, classifiers=[ "Programming Language :: Python :: 3", diff --git a/swh/lister/hex/lister.py b/swh/lister/hex/lister.py index 0188c7caccba452d46bfc1b1b2f3ed74f08896c1..e06425259452eea90a9eac5a880cd25b1663d164 100644 --- 
a/swh/lister/hex/lister.py +++ b/swh/lister/hex/lister.py @@ -4,6 +4,7 @@ # See top-level LICENSE file for more information from dataclasses import asdict, dataclass +from datetime import datetime import logging from typing import Any, Dict, Iterator, List from urllib.parse import urljoin @@ -32,10 +33,12 @@ class HexListerState: """Id of the last page listed on an incremental pass""" last_pkg_name: str = "" """Name of the last package inserted at on an incremental pass""" + last_updated_at: str = datetime.min.replace(tzinfo=iso8601.UTC).isoformat() + """updated_at value of the last seen package on an incremental pass""" class HexLister(Lister[HexListerState, HexListerPage]): - """List origins from the "Hex" forge.""" + """List origins from the Hex.pm""" LISTER_NAME = "hex" VISIT_TYPE = "hex" @@ -73,12 +76,19 @@ class HexLister(Lister[HexListerState, HexListerPage]): url = urljoin(self.url, self.PACKAGES_PATH) while page_id is not None: + logger.debug( + "Fetching URL %s with page_id = %s and updated_after = %s", + url, + page_id, + self.state.last_updated_at, + ) + body = self.http_request( url, params={ "page": page_id, - "sort": "name", - }, # sort=name is actually the default + "search": f"updated_after:{self.state.last_updated_at}", + }, ).json() yield body @@ -116,15 +126,23 @@ class HexLister(Lister[HexListerState, HexListerPage]): return last_pkg_name = page[-1]["name"] - - # incoming page should have alphabetically greater - # last package name than the one stored in the state - if last_pkg_name > self.state.last_pkg_name: + last_updated_at = page[-1]["updated_at"] + # TODO: Think more about 2nd condition: + if ( + iso8601.parse_date(last_updated_at) + > iso8601.parse_date(self.state.last_updated_at) + and last_pkg_name != self.state.last_pkg_name + and len(page) > 0 + ): self.state.last_pkg_name = last_pkg_name self.state.last_page_id += 1 + self.state.last_updated_at = last_updated_at def finalize(self) -> None: scheduler_state = 
self.get_state_from_scheduler() - if self.state.last_page_id > scheduler_state.last_page_id: + # Mark the lister as updated only if it finds any updated repos + if iso8601.parse_date(self.state.last_updated_at) > iso8601.parse_date( + scheduler_state.last_updated_at + ): self.updated = True diff --git a/swh/lister/hex/tasks.py b/swh/lister/hex/tasks.py index 5e6dafbc2f815930315e423dbc080508590a0cb9..012bc8f778169727d02b718e76e63ee80f9bcfc5 100644 --- a/swh/lister/hex/tasks.py +++ b/swh/lister/hex/tasks.py @@ -13,7 +13,7 @@ from .lister import HexLister def list_hex_full( instance: Optional[str] = None, ) -> Dict[str, int]: - """Full update of a Hex.pm instance""" + """Full listing of Hex.pm""" lister = HexLister.from_configfile(instance=instance) return lister.run().dict() diff --git a/swh/lister/hex/tests/test_lister.py b/swh/lister/hex/tests/test_lister.py index fe1d2aaedba45aa785b74b64df7ae9abd5d7b8c2..bb977a38440dc38cd28257de97387930788d0bfd 100644 --- a/swh/lister/hex/tests/test_lister.py +++ b/swh/lister/hex/tests/test_lister.py @@ -1,8 +1,10 @@ import json from pathlib import Path -from typing import List +from typing import List, Optional +from urllib.parse import quote import pytest +from requests import HTTPError from swh.lister.hex.lister import HexLister, ListedOrigin from swh.scheduler.interface import SchedulerInterface @@ -25,10 +27,27 @@ def check_listed_origins(lister_urls: List[str], scheduler_origins: List[ListedO assert set(lister_urls) == {origin.url for origin in scheduler_origins} +@pytest.fixture +def mock_hexpm_page(requests_mock): + def func( + page_id: int, + updated_after: str, + body: Optional[List[dict]], + status_code: int = 200, + ): + search_query = quote(f"updated_after:{updated_after}") + page_url = f"https://hex.pm/api/packages/?page={page_id}&search={search_query}" + requests_mock.get( + page_url, json=body, complete_qs=True, status_code=status_code + ) + + return func + + def test_full_lister_hex( swh_scheduler: 
SchedulerInterface, - requests_mock, hexpm_page, + mock_hexpm_page, ): """ Simulate a full listing of packages for hex (erlang package manager) @@ -37,10 +56,10 @@ def test_full_lister_hex( p2_origin_urls, p2_json = hexpm_page(2) p3_origin_urls, p3_json = hexpm_page(3) - requests_mock.get("https://hex.pm/api/packages/?page=1", json=p1_json) - requests_mock.get("https://hex.pm/api/packages/?page=2", json=p2_json) - requests_mock.get("https://hex.pm/api/packages/?page=3", json=p3_json) - requests_mock.get("https://hex.pm/api/packages/?page=4", json=[]) + mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json) + mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json) + mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json) + mock_hexpm_page(4, "2022-09-09T21:00:14.993273Z", []) lister = HexLister(swh_scheduler) @@ -60,9 +79,9 @@ def test_full_lister_hex( assert lister.updated -def test_gogs_incremental_lister( +def test_hex_incremental_lister( swh_scheduler, - requests_mock, + mock_hexpm_page, hexpm_page, ): lister = HexLister(swh_scheduler) @@ -71,9 +90,9 @@ def test_gogs_incremental_lister( p1_origin_urls, p1_json = hexpm_page(1) p2_origin_urls, p2_json = hexpm_page(2) - requests_mock.get("https://hex.pm/api/packages/?page=1", json=p1_json) - requests_mock.get("https://hex.pm/api/packages/?page=2", json=p2_json) - requests_mock.get("https://hex.pm/api/packages/?page=3", json=[]) + mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json) + mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", p2_json) + mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", []) stats = lister.run() @@ -94,9 +113,9 @@ def test_gogs_incremental_lister( # Second run: P3 isn't empty anymore p3_origin_urls, p3_json = hexpm_page(3) - requests_mock.get("https://hex.pm/api/packages/?page=3", json=p3_json) - requests_mock.get( - "https://hex.pm/api/packages/?page=4", json=[] + mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json) + mock_hexpm_page( + 4, "2022-09-09T21:00:14.993273Z", [] ) 
# TODO: Try with 40x/50x here? stats = lister.run() @@ -125,9 +144,7 @@ def test_gogs_incremental_lister( stats = lister.run() assert stats.pages == 1 - assert ( - stats.origins == 0 - ) # FIXME: inconsistent with Gogs lister. Either of them could be wrong + assert stats.origins == 0 # FIXME: inconsistent with Gogs lister lister_state = lister.get_state_from_scheduler() assert ( @@ -139,3 +156,24 @@ def test_gogs_incremental_lister( check_listed_origins( p1_origin_urls + p2_origin_urls + p3_origin_urls, scheduler_origins ) + + +@pytest.mark.parametrize("http_code", [400, 500]) +def test_hex_lister_http_error(swh_scheduler, http_code, mock_hexpm_page, hexpm_page): + """Test handling of some HTTP errors commonly encountered""" + lister = HexLister(swh_scheduler) + + # P1 returns origins, P2 fails with http_code, so listing stops before P3 + p1_origin_urls, p1_json = hexpm_page(1) + _, p3_json = hexpm_page(3) + + mock_hexpm_page(1, "0001-01-01T00:00:00+00:00", p1_json) + mock_hexpm_page(2, "2018-01-30T04:56:03.053561Z", None, http_code) + mock_hexpm_page(3, "2019-03-27T00:32:47.822901Z", p3_json) + + with pytest.raises(HTTPError): + lister.run() + + # Only P1 should be listed + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + check_listed_origins(p1_origin_urls, scheduler_origins)