diff --git a/README.md b/README.md index e0332a292a542d3065e14184afdc8121032e7cc9..91d6e79781971d4e403a8a789a5a47574b62d7ce 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ following Python modules: - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` -- `swh.liser.fedora` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` @@ -27,6 +26,7 @@ following Python modules: - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` +- `swh.lister.rpm` - `swh.lister.tuleap` Dependencies diff --git a/setup.py b/setup.py index 983feb03de917bf1b92a78592375095fed7fc162..899543540aaa65eb9d01baea136ffe5090df9e83 100755 --- a/setup.py +++ b/setup.py @@ -65,7 +65,6 @@ setup( lister.cran=swh.lister.cran:register lister.crates=swh.lister.crates:register lister.debian=swh.lister.debian:register - lister.fedora=swh.lister.fedora:register lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register lister.gitiles=swh.lister.gitiles:register @@ -87,6 +86,7 @@ setup( lister.pubdev=swh.lister.pubdev:register lister.puppet=swh.lister.puppet:register lister.pypi=swh.lister.pypi:register + lister.rpm=swh.lister.rpm:register lister.rubygems=swh.lister.rubygems:register lister.sourceforge=swh.lister.sourceforge:register lister.stagit=swh.lister.stagit:register diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py deleted file mode 100644 index 34712b3709917395f8fc790ee20b8e622a2cb08a..0000000000000000000000000000000000000000 --- a/swh/lister/fedora/lister.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright (C) 2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from dataclasses import dataclass, field -from datetime import datetime, timezone -import logging -from typing import Any, Dict, Iterator, List, Optional, Set, Type -from 
urllib.error import HTTPError -from urllib.parse import urljoin - -import repomd - -from swh.scheduler.interface import SchedulerInterface -from swh.scheduler.model import ListedOrigin - -from ..pattern import Lister - -logger = logging.getLogger(__name__) - - -Release = int -Edition = str -PkgName = str -PkgVersion = str -FedoraOrigin = str -FedoraPageType = Type[repomd.Repo] -"""Each page is a list of packages from a given Fedora (release, edition) pair""" - - -def get_editions(release: Release) -> List[Edition]: - """Get list of editions for a given release.""" - # Ignore dirs that don't contain .rpm files: - # Docker,CloudImages,Atomic*,Spins,Live,Cloud_Atomic,Silverblue - - if release < 20: - return ["Everything", "Fedora"] - elif release < 28: - return ["Everything", "Server", "Workstation"] - else: - return ["Everything", "Server", "Workstation", "Modular"] - - -def get_last_modified(pkg: repomd.Package) -> datetime: - """Get timezone aware last modified time in UTC from RPM package metadata.""" - ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build") - return datetime.utcfromtimestamp(int(ts)).replace(tzinfo=timezone.utc) - - -def get_checksums(pkg: repomd.Package) -> Dict[str, str]: - """Get checksums associated to rpm archive.""" - cs = pkg._element.find("common:checksum", namespaces=repomd._ns) - cs_type = cs.get("type") - if cs_type == "sha": - cs_type = "sha1" - return {cs_type: cs.text} - - -@dataclass -class FedoraListerState: - """State of Fedora lister""" - - package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) - """Dictionary mapping a package name to all the versions found during - last listing""" - - -class FedoraLister(Lister[FedoraListerState, FedoraPageType]): - """ - List source packages for given Fedora releases. - - The lister will create a snapshot for each package name from all its - available versions. 
- - If a package snapshot is different from the last listing operation, - it will be sent to the scheduler that will create a loading task - to archive newly found source code. - - Args: - scheduler: instance of SchedulerInterface - url: fedora package archives mirror URL - releases: list of fedora releases to process - """ - - LISTER_NAME = "fedora" - - def __init__( - self, - scheduler: SchedulerInterface, - instance: str = "fedora", - url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", - releases: List[Release] = [34, 35, 36], - max_origins_per_page: Optional[int] = None, - max_pages: Optional[int] = None, - enable_origins: bool = True, - ): - super().__init__( - scheduler=scheduler, - url=url, - instance=instance, - credentials={}, - max_origins_per_page=max_origins_per_page, - max_pages=max_pages, - enable_origins=enable_origins, - ) - - self.releases = releases - - self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {} - "will hold all listed origins info" - self.origins_to_send: Set[FedoraOrigin] = set() - "will hold updated origins since last listing" - self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} - "will contain the lister state after a call to run" - self.last_page = False - - def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState: - return FedoraListerState(package_versions={k: set(v) for k, v in d.items()}) - - def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]: - return {k: list(v) for k, v in state.package_versions.items()} - - def page_request(self, release: Release, edition: Edition) -> FedoraPageType: - """Return parsed packages for a given fedora release.""" - index_url = urljoin( - self.url, - f"{release}/{edition}/source/SRPMS/" - if release < 24 - else f"{release}/{edition}/source/tree/", - ) - - repo = repomd.load(index_url) # throws error if no repomd.xml is not found - self.last_page = ( - release == self.releases[-1] and edition == get_editions(release)[-1] - ) - 
- logger.debug( - "Fetched metadata from url: %s, found %d packages", index_url, len(repo) - ) - # TODO: Extract more fields like "provides" and "requires" from *primary.xml - # as extrinsic metadata using the pkg._element.findtext method - return repo - - def get_pages(self) -> Iterator[FedoraPageType]: - """Return an iterator on parsed fedora packages, one page per (release, edition) pair""" - - for release in self.releases: - for edition in get_editions(release): - logger.debug("Listing fedora release %s edition %s", release, edition) - self.current_release = release - self.current_edition = edition - try: - yield self.page_request(release, edition) - except HTTPError as http_error: - if http_error.getcode() == 404: - logger.debug( - "No packages metadata found for fedora release %s edition %s", - release, - edition, - ) - continue - raise - - def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin: - """Return the origin url for the given package""" - return f"https://src.fedoraproject.org/rpms/{package_name}" - - def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]: - """Convert a page of fedora package sources into an iterator of ListedOrigin.""" - assert self.lister_obj.id is not None - - origins_to_send = set() - - # iterate on each package's metadata - for pkg_metadata in page: - # extract package metadata - package_name = pkg_metadata.name - package_version = pkg_metadata.vr - package_version_split = package_version.split(".") - if package_version_split[-1].startswith("fc"): - # remove trailing ".fcXY" in version for the rpm loader to avoid - # creating multiple releases targeting same directory - package_version = ".".join(package_version_split[:-1]) - - package_build_time = get_last_modified(pkg_metadata) - package_download_path = pkg_metadata.location - - # build origin url - origin_url = self.origin_url_for_package(package_name) - # create package version key as expected by the fedora (rpm) loader - 
package_version_key = ( - f"fedora{self.current_release}/{self.current_edition}/" - f"{package_version}" - ).lower() - - # this is the first time a package is listed - if origin_url not in self.listed_origins: - # create a ListedOrigin object for it that can be later - # updated with new package versions info - self.listed_origins[origin_url] = ListedOrigin( - lister_id=self.lister_obj.id, - url=origin_url, - visit_type="rpm", - extra_loader_arguments={"packages": {}}, - last_update=package_build_time, - ) - - # init set that will contain all listed package versions - self.package_versions[package_name] = set() - - # origin will be yielded at the end of that method - origins_to_send.add(origin_url) - - # update package metadata in parameter that will be provided - # to the rpm loader - self.listed_origins[origin_url].extra_loader_arguments["packages"][ - package_version_key - ] = { - "name": package_name, - "version": package_version, - "url": urljoin(page.baseurl, package_download_path), - "buildTime": package_build_time.isoformat(), - "checksums": get_checksums(pkg_metadata), - } - - last_update = self.listed_origins[origin_url].last_update - if last_update is not None and package_build_time > last_update: - self.listed_origins[origin_url].last_update = package_build_time - - # add package version key to the set of found versions - self.package_versions[package_name].add(package_version_key) - - # package has already been listed during a previous listing process - if package_name in self.state.package_versions: - new_versions = ( - self.package_versions[package_name] - - self.state.package_versions[package_name] - ) - # no new versions so far, no need to send the origin to the scheduler - if not new_versions: - origins_to_send.remove(origin_url) - - logger.debug( - "Found %s packages to update (new ones or packages with new versions).", - len(origins_to_send), - ) - logger.debug( - "Current total number of listed packages is equal to %s.", - 
len(self.listed_origins), - ) - - # yield from origins_to_send.values() - self.origins_to_send.update(origins_to_send) - - if self.last_page: - # yield listed origins when all fedora releases and editions processed - yield from [ - self.listed_origins[origin_url] for origin_url in self.origins_to_send - ] - - def finalize(self): - # set mapping between listed package names and versions as lister state - self.state.package_versions = self.package_versions - self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/fedora/tasks.py b/swh/lister/fedora/tasks.py deleted file mode 100644 index 18c8a605f08c2cb564d1afe7453636df23e66c97..0000000000000000000000000000000000000000 --- a/swh/lister/fedora/tasks.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (C) 2022 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from typing import Dict - -from celery import shared_task - -from .lister import FedoraLister - - -@shared_task(name=__name__ + ".FullFedoraRelister") -def list_fedora_full(**lister_args) -> Dict[str, int]: - """Full update of a Fedora instance""" - lister = FedoraLister.from_configfile(**lister_args) - return lister.run().dict() - - -@shared_task(name=__name__ + ".ping") -def _ping() -> str: - return "OK" diff --git a/swh/lister/fedora/tests/test_lister.py b/swh/lister/fedora/tests/test_lister.py deleted file mode 100644 index dc093597eb548392101b03dc4b71b1adf7d73357..0000000000000000000000000000000000000000 --- a/swh/lister/fedora/tests/test_lister.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (C) 2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from io import StringIO -from pathlib import Path -from typing import List -from unittest.mock import MagicMock -from 
urllib.error import HTTPError - -import pytest - -from swh.lister.fedora.lister import FedoraLister, Release, get_editions -from swh.scheduler.interface import SchedulerInterface - - -def mock_repomd(datadir, mocker, use_altered_fedora36=False): - """Mocks the .xml files fetched by repomd for the next lister run""" - paths = ["repomd26.xml", "primary26.xml.gz", "repomd36.xml", "primary36.xml.gz"] - if use_altered_fedora36: - paths[3] = "primary36-altered.xml.gz" - - cm = MagicMock() - cm.read.side_effect = [ - Path(datadir, "archives.fedoraproject.org", path).read_bytes() for path in paths - ] - cm.__enter__.return_value = cm - mocker.patch("repomd.urllib.request.urlopen").return_value = cm - - -def rpm_url(release, path): - return ( - "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/" - f"{release}/Everything/source/tree/Packages/{path}" - ) - - -@pytest.fixture -def pkg_versions(): - return { - "https://src.fedoraproject.org/rpms/0install": { - "fedora26/everything/2.11-4": { - "name": "0install", - "version": "2.11-4", - "buildTime": "2017-02-10T04:59:31+00:00", - "url": rpm_url(26, "0/0install-2.11-4.fc26.src.rpm"), - "checksums": { - # note: we intentionally altered the original - # primary26.xml file to test sha1 usage - "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b", - }, - } - }, - "https://src.fedoraproject.org/rpms/0xFFFF": { - "fedora26/everything/0.3.9-15": { - "name": "0xFFFF", - "version": "0.3.9-15", - "buildTime": "2017-02-10T05:01:53+00:00", - "url": rpm_url(26, "0/0xFFFF-0.3.9-15.fc26.src.rpm"), - "checksums": { - "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f" - }, - }, - "fedora36/everything/0.9-4": { - "name": "0xFFFF", - "version": "0.9-4", - "buildTime": "2022-01-19T19:13:53+00:00", - "url": rpm_url(36, "0/0xFFFF-0.9-4.fc36.src.rpm"), - "checksums": { - "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" - }, - }, - }, - "https://src.fedoraproject.org/rpms/2ping": 
{ - "fedora36/everything/4.5.1-2": { - "name": "2ping", - "version": "4.5.1-2", - "buildTime": "2022-01-19T19:12:21+00:00", - "url": rpm_url(36, "2/2ping-4.5.1-2.fc36.src.rpm"), - "checksums": { - "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28" - }, - } - }, - } - - -def run_lister( - swh_scheduler: SchedulerInterface, - releases: List[Release], - pkg_versions: dict, - origin_count: int, - updated: bool = True, -): - """Runs the lister and tests that the listed origins are correct.""" - lister = FedoraLister(scheduler=swh_scheduler, releases=releases) - - stats = lister.run() - scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - lister_state = lister.get_state_from_scheduler() - state_pkg_versions = {k.split("/")[-1]: set(v) for k, v in pkg_versions.items()} - - # One edition from each release (we mocked get_editions) - assert stats.pages == (len(releases) if updated else 0) - assert stats.origins == origin_count - - assert { - o.url: o.extra_loader_arguments["packages"] for o in scheduler_origins - } == pkg_versions - - assert lister_state.package_versions == state_pkg_versions - assert lister.updated == updated - - -def test_get_editions(): - assert get_editions(18) == ["Everything", "Fedora"] - assert get_editions(26) == ["Everything", "Server", "Workstation"] - assert get_editions(34) == ["Everything", "Server", "Workstation", "Modular"] - - -@pytest.mark.parametrize("status_code", [400, 404, 500]) -def test_fedora_lister_http_error( - swh_scheduler: SchedulerInterface, mocker: MagicMock, status_code: int -): - """ - Simulates handling of HTTP Errors while fetching of packages for fedora releases. 
- """ - releases = [18] - - is_404 = status_code == 404 - - def side_effect(url): - if is_404: - raise HTTPError( - url, status_code, "Not Found", {"content-type": "text/html"}, StringIO() - ) - else: - raise HTTPError( - url, - status_code, - "Internal server error", - {"content-type": "text/html"}, - StringIO(), - ) - - urlopen_patch = mocker.patch("repomd.urllib.request.urlopen") - urlopen_patch.side_effect = side_effect - - expected_pkgs: dict = {} - - if is_404: - run_lister( - swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False - ) - else: - with pytest.raises(HTTPError): - run_lister( - swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False - ) - - -def test_full_lister_fedora( - swh_scheduler: SchedulerInterface, - mocker: MagicMock, - datadir: Path, - pkg_versions: dict, -): - """ - Simulates a full listing of packages for fedora releases. - """ - releases = [26, 36] - - get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions") - get_editions_patch.return_value = ["Everything"] - - mock_repomd(datadir, mocker) - run_lister(swh_scheduler, releases, pkg_versions, origin_count=3) - - -def test_incremental_lister( - swh_scheduler: SchedulerInterface, - mocker: MagicMock, - datadir: Path, - pkg_versions: dict, -): - """ - Simulates an incremental listing of packages for fedora releases. 
- """ - releases = [26, 36] - - get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions") - get_editions_patch.return_value = ["Everything"] - - # First run - mock_repomd(datadir, mocker) - run_lister(swh_scheduler, releases, pkg_versions, origin_count=3) - # Second run (no updates) - mock_repomd(datadir, mocker) - run_lister(swh_scheduler, releases, pkg_versions, origin_count=0) - - # Use an altered version of primary36.xml in which we updated the version - # of package 0xFFFF to 0.10: - mock_repomd(datadir, mocker, use_altered_fedora36=True) - # Add new version to the set of expected pkg versions: - pkg_versions["https://src.fedoraproject.org/rpms/0xFFFF"].update( - { - "fedora36/everything/0.10-4": { - "name": "0xFFFF", - "version": "0.10-4", - "buildTime": "2022-01-19T19:13:53+00:00", - "url": rpm_url(36, "0/0xFFFF-0.10-4.fc36.src.rpm"), - "checksums": { - "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" - }, - } - } - ) - - # Third run (0xFFFF in fedora36 editions got updated and it needs to be listed) - run_lister(swh_scheduler, releases, pkg_versions, origin_count=1) diff --git a/swh/lister/fedora/tests/test_tasks.py b/swh/lister/fedora/tests/test_tasks.py deleted file mode 100644 index 7fd4236e71f93438b320a209a2974ed914eaad7f..0000000000000000000000000000000000000000 --- a/swh/lister/fedora/tests/test_tasks.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (C) 2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from unittest.mock import patch - -from swh.lister.pattern import ListerStats - - -def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): - res = swh_scheduler_celery_app.send_task("swh.lister.fedora.tasks.ping") - assert res - res.wait() - assert res.successful() - assert res.result == "OK" - - 
-@patch("swh.lister.fedora.tasks.FedoraLister") -def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): - lister.from_configfile.return_value = lister - lister.run.return_value = ListerStats(pages=10, origins=500) - - kwargs = dict( - url="https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/" - ) - res = swh_scheduler_celery_app.send_task( - "swh.lister.fedora.tasks.FullFedoraRelister", - kwargs=kwargs, - ) - assert res - res.wait() - assert res.successful() - - lister.from_configfile.assert_called_once_with(**kwargs) - lister.run.assert_called_once_with() - - -@patch("swh.lister.fedora.tasks.FedoraLister") -def test_full_listing_params( - lister, swh_scheduler_celery_app, swh_scheduler_celery_worker -): - lister.from_configfile.return_value = lister - lister.run.return_value = ListerStats(pages=10, origins=500) - - kwargs = dict( - url="https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", - instance="archives.fedoraproject.org", - releases=["36"], - ) - res = swh_scheduler_celery_app.send_task( - "swh.lister.fedora.tasks.FullFedoraRelister", - kwargs=kwargs, - ) - assert res - res.wait() - assert res.successful() - - lister.from_configfile.assert_called_once_with(**kwargs) - lister.run.assert_called_once_with() diff --git a/swh/lister/fedora/__init__.py b/swh/lister/rpm/__init__.py similarity index 69% rename from swh/lister/fedora/__init__.py rename to swh/lister/rpm/__init__.py index 6fb3a148d8bf4ab1db60ef26225d296aabdff633..54d6da19cf29794a3a34f5c32d2e56592d467832 100644 --- a/swh/lister/fedora/__init__.py +++ b/swh/lister/rpm/__init__.py @@ -1,13 +1,13 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): - from .lister import 
FedoraLister + from .lister import RPMLister return { - "lister": FedoraLister, + "lister": RPMLister, "task_modules": [f"{__name__}.tasks"], } diff --git a/swh/lister/rpm/config/centos.yml b/swh/lister/rpm/config/centos.yml new file mode 100644 index 0000000000000000000000000000000000000000..25c6b52955c88566357f01f941f39ded9cb6e52b --- /dev/null +++ b/swh/lister/rpm/config/centos.yml @@ -0,0 +1,100 @@ +# RPM lister parameters to process CentOS source packages + +url: https://www.centos.org +instance: CentOS +rpm_src_data: + - base_url: https://vault.centos.org/ + releases: + - "3.7" + - "3.8" + - "3.9" + - "4.0" + - "4.1" + - "4.2" + - "4.3" + - "4.4" + - "4.5" + - "4.6" + - "4.7" + - "4.8" + - "4.9" + - "5.0" + - "5.1" + - "5.2" + - "5.3" + - "5.4" + - "5.5" + - "5.6" + - "5.7" + - "5.8" + - "5.9" + - "5.10" + - "5.11" + - "6.0" + - "6.1" + - "6.2" + - "6.3" + - "6.4" + - "6.5" + - "6.6" + - "6.7" + - "6.8" + - "6.9" + - "6.10" + - "7.0.1406" + - "7.1.1503" + - "7.2.1511" + - "7.3.1611" + - "7.4.1708" + - "7.5.1804" + - "7.6.1810" + - "7.7.1908" + - "7.8.2003" + - "7.9.2009" + - "8-stream" + - "8.0.1905" + - "8.1.1911" + - "8.2.2004" + - "8.3.2011" + - "8.4.2105" + - "8.5.2111" + components: + - AppStream + - BaseOS + - HighAvailability + - PowerTools + - SCL + - addons + - centosplus + - contrib + - cr + - csgfs + - dotnet + - extras + - fasttrack + - opstools + - os + - rt + - testing + - updates + - xen4 + + index_url_templates: + - $base_url/$release/$component/Source/ + - $base_url/$release/$component/SRPMS/ + - $base_url/$release/$component/x86_64/ + + - base_url: https://mirror.stream.centos.org + releases: + - 9-stream + components: + - AppStream + - BaseOS + - CRB + - HighAvailability + - NFV + - RT + - ResilientStorage + + index_url_templates: + - $base_url/$release/$component/source/tree/ diff --git a/swh/lister/rpm/config/fedora.yml b/swh/lister/rpm/config/fedora.yml new file mode 100644 index 
0000000000000000000000000000000000000000..382863ea760014615e2588ee6501ffa49d262bc7 --- /dev/null +++ b/swh/lister/rpm/config/fedora.yml @@ -0,0 +1,77 @@ +# RPM lister parameters to process Fedora source packages + +url: https://fedoraproject.org +instance: "Fedora" +rpm_src_data: + - base_url: https://archives.fedoraproject.org/pub/archive/fedora/linux/ + releases: + - "2" + - "3" + - "4" + - "5" + - "6" + components: + - core + - extras + index_url_templates: + - $base_url/$component/$release/SRPMS + - $base_url/$component/$release/source/SRPMS + - $base_url/$component/$release/x86_64/os/ + + - base_url: https://archives.fedoraproject.org/pub/archive/fedora/linux/ + releases: + - "7" + - "8" + - "9" + - "10" + - "11" + - "12" + - "13" + - "14" + - "15" + - "16" + - "17" + - "18" + - "19" + - "20" + - "21" + - "22" + - "23" + - "24" + - "25" + - "26" + - "27" + - "28" + - "29" + - "30" + - "31" + - "32" + - "33" + - "34" + - "35" + components: + - Everything + - Server + - Workstation + - Modular + - Fedora + index_url_templates: + - $base_url/releases/$release/$component/source/tree/ + - $base_url/updates/$release/$component/source/tree/ + - $base_url/releases/$release/$component/source/SRPMS/ + - $base_url/updates/$release/SRPMS/ + + - base_url: https://dl.fedoraproject.org/pub/fedora/linux/ + releases: + - "36" + - "37" + - "38" + components: + - Everything + - Server + - Workstation + - Modular + - Fedora + index_url_templates: + - $base_url/releases/$release/$component/source/tree/ + - $base_url/updates/$release/$component/source/tree/ diff --git a/swh/lister/rpm/config/opensuse.yml b/swh/lister/rpm/config/opensuse.yml new file mode 100644 index 0000000000000000000000000000000000000000..461d3173d70302d137c2108ae9519d60e7859cd9 --- /dev/null +++ b/swh/lister/rpm/config/opensuse.yml @@ -0,0 +1,26 @@ +# RPM lister parameters to process openSUSE source packages + +url: http://opensuse.org +instance: openSUSE +rpm_src_data: + - base_url: 
http://download.opensuse.org/source/ + releases: + - tumbleweed + - jump/15.2 + - leap/15.0-Current + - leap/15.0 + - leap/15.1 + - leap/15.2 + - leap/15.3 + - leap/15.4 + - leap/15.5 + - leap/42.2 + - leap/42.3-Current + - leap/42.3 + components: + - oss + - non-oss + index_url_templates: + - $base_url/distribution/$release/repo/$component/ + - $base_url/distribution/$release/repo/$component/suse/ + - $base_url/$release/repo/$component/ diff --git a/swh/lister/rpm/config/oracle.yml b/swh/lister/rpm/config/oracle.yml new file mode 100644 index 0000000000000000000000000000000000000000..c5c64e6c55feebe118bc3ee9500442c9f738fd64 --- /dev/null +++ b/swh/lister/rpm/config/oracle.yml @@ -0,0 +1,156 @@ +# RPM lister parameters to process Oracle Linux source packages + +url: https://www.oracle.com/linux +instance: OracleLinux +rpm_src_data: + - base_url: https://yum.oracle.com/repo/EnterpriseLinux/ + releases: + - EL5 + components: + - addons + - oracle_addons + - unsupported + - 0/base + - 1/base + - 2/base + - 3/base + - 4/base + - 5/base + + index_url_templates: + - $base_url/$release/$component/x86_64 + + - base_url: https://yum.oracle.com/repo/OracleLinux/ + releases: + - OL5 + - OL6 + - OL7 + - OL8 + - OL9 + components: + - 0/base + - 0/baseos/base + - 1/base + - 1/baseos/base + - 10/base + - 11/base + - 2/base + - 2/baseos/base + - 3/base + - 3/baseos/base + - 4/base + - 4/baseos/base + - 4/security/validation + - 5/base + - 5/baseos/base + - 6/base + - 6/baseos/base + - 7/base + - 7/baseos/base + - 8/base + - 8/baseos/base + - 8/security/validation + - 9/base + - MODRHCK + - MySQL + - MySQL56 + - MySQL57_community + - MySQL80/community + - MySQL80/connectors/community + - MySQL80/tools/community + - MySQL80_community + - RDMA + - SoftwareCollections + - UEK/latest + - UEKR3 + - UEKR3/latest + - UEKR3_OFED20 + - UEKR4 + - UEKR4/OFED + - UEKR4/archive + - UEKR5 + - UEKR5/RDMA + - UEKR5/archive + - UEKR6 + - UEKR6/RDMA + - UEKR7 + - UEKR7/RDMA + - addons + - appstream 
+ - appstream/developer + - automation2 + - baseos/developer + - baseos/latest + - beta + - ceph + - ceph30 + - codeready/builder + - codeready/builder/developer + - developer + - developer/EPEL + - developer/EPEL/modular + - developer/UEKR5 + - developer/UEKR6 + - developer/UEKR7 + - developer/golang117 + - developer/golang118 + - developer/golang119 + - developer/kvm/utils + - developer/nodejs12 + - developer/olcne + - developer/php74 + - developer_EPEL + - developer_gluster310 + - developer_gluster312 + - distro/builder + - gluster/appstream + - gluster312 + - gluster41 + - gluster5 + - gluster6 + - gluster8 + - kvm/appstream + - kvm/utils + - latest + - latest/archive + - leapp + - ofed_UEK + - olcne + - olcne11 + - olcne12 + - olcne13 + - olcne14 + - olcne15 + - olcne16 + - openstack10 + - openstack21 + - openstack30 + - openstack40 + - openstack40_extras + - openstack50 + - openstack50_extras + - optional + - optional/archive + - optional/beta + - oracle/instantclient + - oracle/instantclient21 + - oraclelinuxmanager210/client + - oraclelinuxmanager210/server + - ovirt42 + - ovirt42/extras + - ovirt43 + - ovirt43/extras + - ovirt44 + - ovirt44/extras + - security/validation + - spacewalk210/client + - spacewalk210/server + - spacewalk24/client + - spacewalk24/server + - spacewalk26/client + - spacewalk26/server + - spacewalk27/client + - spacewalk27/server + + index_url_templates: + - $base_url/$release/$component/x86_64 diff --git a/swh/lister/rpm/config/rockylinux.yml b/swh/lister/rpm/config/rockylinux.yml new file mode 100644 index 0000000000000000000000000000000000000000..3f7bf2bdbd99ccc39fcb4df8f43a2aa5afbf424e --- /dev/null +++ b/swh/lister/rpm/config/rockylinux.yml @@ -0,0 +1,38 @@ +# RPM lister parameters to process Rocky Linux source packages + +url: https://rockylinux.org +instance: RockyLinux +rpm_src_data: + - base_url: https://download.rockylinux.org/ + releases: + - "8.3" + - "8.4" + - "8.4-RC1" + - "8.5" + - "8.6" + - "8.7" + - "8.8" + - "9.0" 
+ - "9.1" + - "9.2" + components: + - AppStream + - BaseOS + - Devel + - HighAvailability + - Minimal + - PowerTools + - ResilientStorage + - CRB + - NFV + - RT + - SAP + - SAPHANA + - devel + - extras + - plus + - nfv + - rockyrpi + index_url_templates: + - $base_url/vault/rocky/$release/$component/source/tree/ + - $base_url/pub/rocky/$release/$component/source/tree/ diff --git a/swh/lister/rpm/lister.py b/swh/lister/rpm/lister.py new file mode 100644 index 0000000000000000000000000000000000000000..a6eeb1fa42b9877dabb1b031295b8017e43dd101 --- /dev/null +++ b/swh/lister/rpm/lister.py @@ -0,0 +1,314 @@ +# Copyright (C) 2022-2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import dataclass, field +from datetime import datetime, timezone +from itertools import product +import logging +from string import Template +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple +from urllib.parse import urljoin + +import repomd +from typing_extensions import TypedDict + +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from ..pattern import Lister + +logger = logging.getLogger(__name__) + + +Release = str +Component = str +PkgName = str +PkgVersion = str +RPMOrigin = str + +RPMPageType = Optional[Tuple[Release, Component, repomd.Repo]] +"""Each page is a list of packages for a given (release, component) pair +from a Red Hat based distribution.""" + + +class RPMSourceData(TypedDict): + """Dictionary holding relevant data for listing RPM source packages. + + See content of the lister config directory to get examples of RPM + source data for famous RedHat based distributions. 
+ """ + + base_url: str + """Base URL of a RPM repository""" + releases: List[Release] + """List of release identifiers for a Red Hat based distribution""" + components: List[Component] + """List of components for a Red Hat based distribution""" + index_url_templates: List[str] + """List of URL templates to discover source packages metadata, the + following variables can be substituted in them: ``base_url``, ``release`` + and ``component``, see :class:`string.Template` for more details about the + format. The generated URLs must target directories containing a sub-directory + named ``repodata``, which contains packages metadata, in order to be + successfully processed by the lister.""" + + +def _get_last_modified(pkg: repomd.Package) -> datetime: + """Get timezone aware last modified time in UTC from RPM package metadata.""" + ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build") + return datetime.utcfromtimestamp(int(ts)).replace(tzinfo=timezone.utc) + + +def _get_checksums(pkg: repomd.Package) -> Dict[str, str]: + """Get checksums associated to rpm archive.""" + cs = pkg._element.find("common:checksum", namespaces=repomd._ns) + cs_type = cs.get("type") + if cs_type == "sha": + cs_type = "sha1" + return {cs_type: cs.text} + + +@dataclass +class RPMListerState: + """State of RPM lister""" + + package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) + """Dictionary mapping a package name to all the versions found during + last listing""" + + +class RPMLister(Lister[RPMListerState, RPMPageType]): + """ + List source packages for a Red Hat based linux distribution. + + The lister creates a snapshot for each package from all its available versions. + + In incremental mode, only packages with different snapshot since the last listing + operation will be sent to the scheduler that will create loading tasks to archive + newly found source code. 
+ + Args: + scheduler: instance of SchedulerInterface + url: Red Hat based distribution info URL + instance: name of Red Hat based distribution + rpm_src_data: list of dictionaries holding data required to list RPM source packages, + see examples in the config directory. + incremental: if :const:`True`, only packages with new versions are sent to the + scheduler when relisting + """ + + LISTER_NAME = "rpm" + + def __init__( + self, + scheduler: SchedulerInterface, + url: str, + instance: str, + rpm_src_data: List[RPMSourceData], + incremental: bool = False, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, + ): + super().__init__( + scheduler=scheduler, + url=url, + instance=instance, + credentials={}, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, + ) + + self.rpm_src_data = rpm_src_data + self.incremental = incremental + + self.listed_origins: Dict[RPMOrigin, ListedOrigin] = {} + self.origins_to_send: Set[RPMOrigin] = set() + self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} + + def state_from_dict(self, d: Dict[str, Any]) -> RPMListerState: + return RPMListerState(package_versions={k: set(v) for k, v in d.items()}) + + def state_to_dict(self, state: RPMListerState) -> Dict[str, Any]: + return {k: list(v) for k, v in state.package_versions.items()} + + def repo_request( + self, + index_url_template: Template, + base_url: str, + release: Release, + component: Component, + ) -> Optional[RPMPageType]: + """Return parsed packages for a given distribution release and component.""" + + index_url = index_url_template.substitute( + base_url=base_url.rstrip("/"), release=release, component=component + ) + + try: + repo = repomd.load(index_url) # throws error if no repomd.xml is not found + except Exception: + logger.debug("Repository metadata not found at URL %s", index_url) + return None + else: + logger.debug( + "Fetched metadata from url: %s, 
found %d packages", index_url, len(repo) + ) + return repo + + def get_pages(self) -> Iterator[RPMPageType]: + """Return an iterator on parsed rpm packages, one page per (release, component) pair.""" + for rpm_src_data in self.rpm_src_data: + index_url_templates = [ + Template(index_url_template) + for index_url_template in rpm_src_data["index_url_templates"] + ] + # try all possible package repository URLs for each (release, component) pair + for release, component, index_url_template in product( + rpm_src_data["releases"], + rpm_src_data["components"], + index_url_templates, + ): + repo = self.repo_request( + index_url_template, + rpm_src_data["base_url"], + release, + component, + ) + if repo is not None: + # valid package repository found, yield page + yield (release, component, repo) + + yield None + + def origin_url_for_package(self, package_name: PkgName) -> RPMOrigin: + """Return the origin url for the given package.""" + # TODO: Use a better origin URL before deploying the lister to production + # https://gitlab.softwareheritage.org/swh/devel/swh-model/-/issues/4632 + return f"rpm://{self.instance}/packages/{package_name}" + + def get_origins_from_page(self, page: RPMPageType) -> Iterator[ListedOrigin]: + """Convert a page of rpm package sources into an iterator of ListedOrigin.""" + assert self.lister_obj.id is not None + + if page is None: + # all pages processed, yield listed origins + for origin_url in self.origins_to_send: + yield self.listed_origins[origin_url] + return + + release, component, repo = page + + logger.debug( + "Listing %s release %s component %s from repository metadata located at %s", + self.instance, + release, + component, + repo.baseurl, + ) + + origins_to_send = set() + new_origins_count = 0 + + # iterate on each package's metadata + for pkg_metadata in repo: + + if pkg_metadata.arch != "src": + # not a source package, skip it + continue + + # extract package metadata + package_name = pkg_metadata.name + + # we extract the 
intrinsic version of the package for the rpm loader + # to avoid creating different releases targeting the same directory + # 2.12-10.el8 => 2.12-10 + package_version_split = pkg_metadata.vr.rsplit("-", maxsplit=1) + package_version = "-".join( + [ + package_version_split[0], + package_version_split[1].split(".", maxsplit=1)[0], + ] + ) + + # create package version key as expected by the rpm loader + package_version_key = f"{release}/{component}/{package_version}" + + package_build_time = _get_last_modified(pkg_metadata) + package_download_url = urljoin( + repo.baseurl.rstrip("/") + "/", pkg_metadata.location + ) + checksums = _get_checksums(pkg_metadata) + + # build origin url + origin_url = self.origin_url_for_package(package_name) + + # this is the first time a package is listed + if origin_url not in self.listed_origins: + # create a ListedOrigin object for it that can be later + # updated with new package versions info + self.listed_origins[origin_url] = ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="rpm", + extra_loader_arguments={"packages": {}}, + last_update=package_build_time, + ) + + # init set that will contain all listed package versions + self.package_versions[package_name] = set() + new_origins_count += 1 + + # origins will be yielded when all pages processed + origins_to_send.add(origin_url) + + # update package metadata in parameter that will be provided + # to the rpm loader + self.listed_origins[origin_url].extra_loader_arguments["packages"][ + package_version_key + ] = { + "name": package_name, + "version": package_version, + "url": package_download_url, + "build_time": package_build_time.isoformat(), + "checksums": checksums, + } + + last_update = self.listed_origins[origin_url].last_update + if last_update is not None and package_build_time > last_update: + self.listed_origins[origin_url].last_update = package_build_time + + # add package version key to the set of found versions + 
self.package_versions[package_name].add(package_version_key) + + # package has already been listed during a previous listing process + if self.incremental and package_name in self.state.package_versions: + new_versions = ( + self.package_versions[package_name] + - self.state.package_versions[package_name] + ) + # no new versions so far, no need to send the origin to the scheduler + if not new_versions: + origins_to_send.remove(origin_url) + + logger.debug( + "Found %s packages to update (%s new ones and %s packages with new versions).", + len(origins_to_send), + new_origins_count, + len(origins_to_send) - new_origins_count, + ) + logger.debug( + "Current total number of listed source packages is equal to %s.", + len(self.listed_origins), + ) + + self.origins_to_send.update(origins_to_send) + + def finalize(self): + if self.incremental: + # set mapping between listed package names and versions as lister state + self.state.package_versions = self.package_versions + self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/rpm/tasks.py b/swh/lister/rpm/tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..ef160f774865c28d6b08688c9904e55c8a9ee1c6 --- /dev/null +++ b/swh/lister/rpm/tasks.py @@ -0,0 +1,28 @@ +# Copyright (C) 2022-2023 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Dict + +from celery import shared_task + +from .lister import RPMLister + + +@shared_task(name=__name__ + ".FullRPMLister") +def list_rpm_full(**lister_args) -> Dict[str, int]: + """Full listing of Red Hat based distribution source packages""" + lister = RPMLister.from_configfile(**lister_args) + return lister.run().dict() + + +@shared_task(name=__name__ + ".IncrementalRPMLister") +def list_rpm_incremental(**lister_args) -> Dict[str, int]: + """Incremental listing of Red Hat based distribution source packages""" + lister = 
RPMLister.from_configfile(**lister_args, incremental=True) + return lister.run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping() -> str: + return "OK" diff --git a/swh/lister/fedora/tests/__init__.py b/swh/lister/rpm/tests/__init__.py similarity index 100% rename from swh/lister/fedora/tests/__init__.py rename to swh/lister/rpm/tests/__init__.py diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml.gz b/swh/lister/rpm/tests/data/archives.fedoraproject.org/primary26.xml.gz similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml.gz rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/primary26.xml.gz diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz b/swh/lister/rpm/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml.gz b/swh/lister/rpm/tests/data/archives.fedoraproject.org/primary36.xml.gz similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml.gz rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/primary36.xml.gz diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml b/swh/lister/rpm/tests/data/archives.fedoraproject.org/repomd26.xml similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/repomd26.xml diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml b/swh/lister/rpm/tests/data/archives.fedoraproject.org/repomd36.xml similarity index 100% rename from 
swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/repomd36.xml diff --git a/swh/lister/rpm/tests/test_lister.py b/swh/lister/rpm/tests/test_lister.py new file mode 100644 index 0000000000000000000000000000000000000000..43c6e85e0c0e8ab632977cc7125a5c51cd254d90 --- /dev/null +++ b/swh/lister/rpm/tests/test_lister.py @@ -0,0 +1,283 @@ +# Copyright (C) 2022-2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from pathlib import Path +from string import Template +from typing import List + +import pytest +from urllib3.exceptions import HTTPError + +from swh.lister.rpm.lister import Component, Release, RPMLister +from swh.scheduler.interface import SchedulerInterface + +FEDORA_URL = "https://fedoraproject.org/" +FEDORA_ARCHIVE_URL = "https://archives.fedoraproject.org/pub/archive/fedora/linux" + +FEDORA_INDEX_URL_TEMPLATES = [ + "$base_url/releases/$release/$component/source/tree/", + "$base_url/updates/$release/$component/source/tree/", + "$base_url/releases/$release/$component/source/SRPMS/", + "$base_url/updates/$release/SRPMS/", +] + + +def mock_repomd(mocker, side_effect): + """Mocks the .xml files fetched by repomd for the next lister run""" + cm = mocker.MagicMock() + cm.read.side_effect = side_effect + cm.__enter__.return_value = cm + mocker.patch("repomd.urllib.request.urlopen").return_value = cm + + +def mock_fedora_repomd(datadir, mocker, use_altered_fedora36=False): + repodata = [ + ["repomd26.xml", "primary26.xml.gz"], + ["repomd36.xml", "primary36.xml.gz"], + ] + if use_altered_fedora36: + repodata[1][1] = "primary36-altered.xml.gz" + + side_effect = [] + + for paths in repodata: + side_effect += [ + Path(datadir, "archives.fedoraproject.org", path).read_bytes() + for path in paths + ] + 
side_effect += [HTTPError() for _ in range(len(FEDORA_INDEX_URL_TEMPLATES) - 1)] + + mock_repomd(mocker, side_effect) + + +def rpm_repodata_url(release, component): + return Template(FEDORA_INDEX_URL_TEMPLATES[0]).substitute( + base_url=FEDORA_ARCHIVE_URL, release=release, component=component + ) + + +def rpm_src_package_url(release, component, path): + return f"{rpm_repodata_url(release, component)}Packages/{path}" + + +def rpm_package_origin_url(package_name, instance="Fedora"): + return f"rpm://{instance}/packages/{package_name}" + + +@pytest.fixture +def pkg_versions(): + return { + f"{rpm_package_origin_url('0install')}": { + "26/Everything/2.11-4": { + "name": "0install", + "version": "2.11-4", + "build_time": "2017-02-10T04:59:31+00:00", + "url": rpm_src_package_url( + release="26", + component="Everything", + path="0/0install-2.11-4.fc26.src.rpm", + ), + "checksums": { + # note: we intentionally altered the original + # primary26.xml file to test sha1 usage + "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b", + }, + } + }, + f"{rpm_package_origin_url('0xFFFF')}": { + "26/Everything/0.3.9-15": { + "name": "0xFFFF", + "version": "0.3.9-15", + "build_time": "2017-02-10T05:01:53+00:00", + "url": rpm_src_package_url( + release="26", + component="Everything", + path="0/0xFFFF-0.3.9-15.fc26.src.rpm", + ), + "checksums": { + "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f" + }, + }, + "36/Everything/0.9-4": { + "name": "0xFFFF", + "version": "0.9-4", + "build_time": "2022-01-19T19:13:53+00:00", + "url": rpm_src_package_url( + release="36", + component="Everything", + path="0/0xFFFF-0.9-4.fc36.src.rpm", + ), + "checksums": { + "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" + }, + }, + }, + f"{rpm_package_origin_url('2ping')}": { + "36/Everything/4.5.1-2": { + "name": "2ping", + "version": "4.5.1-2", + "build_time": "2022-01-19T19:12:21+00:00", + "url": rpm_src_package_url( + release="36", + 
component="Everything", + path="2/2ping-4.5.1-2.fc36.src.rpm", + ), + "checksums": { + "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28" + }, + } + }, + } + + +def run_lister( + swh_scheduler: SchedulerInterface, + releases: List[Release], + components: List[Component], + pkg_versions: dict, + origin_count: int, + incremental: bool = False, + updated: bool = True, +): + """Runs the lister and tests that the listed origins are correct.""" + lister = RPMLister( + scheduler=swh_scheduler, + url=FEDORA_URL, + instance="Fedora", + rpm_src_data=[ + { + "base_url": FEDORA_ARCHIVE_URL, + "releases": releases, + "components": components, + "index_url_templates": FEDORA_INDEX_URL_TEMPLATES, + } + ], + incremental=incremental, + ) + + stats = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + state_pkg_versions = {k.split("/")[-1]: set(v) for k, v in pkg_versions.items()} + + # One component from each release plus extra null page to flush origins + assert stats.pages == (len(releases) + 1 if updated else 1) + assert stats.origins == origin_count + + assert { + o.url: o.extra_loader_arguments["packages"] for o in scheduler_origins + } == pkg_versions + + if incremental: + assert lister_state.package_versions == state_pkg_versions + assert lister.updated == updated + + +@pytest.mark.parametrize("status_code", [400, 404, 500]) +def test_fedora_lister_http_error(swh_scheduler, mocker, status_code): + """ + Simulates handling of HTTP Errors while fetching packages for fedora releases. 
+ """ + + release = "18" + component = "Everything" + + mock_repomd( + mocker, + side_effect=[HTTPError() for _ in range(len(FEDORA_INDEX_URL_TEMPLATES))], + ) + + run_lister( + swh_scheduler, + releases=[release], + components=[component], + pkg_versions={}, + origin_count=0, + updated=False, + ) + + +def test_full_rpm_lister( + swh_scheduler, + mocker, + datadir, + pkg_versions, +): + """ + Simulates a full listing of packages for fedora releases. + """ + + mock_fedora_repomd(datadir, mocker) + run_lister( + swh_scheduler, + releases=["26", "36"], + components=["Everything"], + pkg_versions=pkg_versions, + origin_count=3, + ) + + +def test_incremental_rpm_lister( + swh_scheduler, + mocker, + datadir, + pkg_versions, +): + """ + Simulates an incremental listing of packages for fedora releases. + """ + + # First run + mock_fedora_repomd(datadir, mocker) + run_lister( + swh_scheduler, + releases=["26", "36"], + components=["Everything"], + pkg_versions=pkg_versions, + origin_count=3, + incremental=True, + ) + # Second run (no updates) + mock_fedora_repomd(datadir, mocker) + run_lister( + swh_scheduler, + releases=["26", "36"], + components=["Everything"], + pkg_versions=pkg_versions, + origin_count=0, + incremental=True, + ) + + # Use an altered version of primary36.xml in which we updated the version + # of package 0xFFFF to 0.10: + mock_fedora_repomd(datadir, mocker, use_altered_fedora36=True) + # Add new version to the set of expected pkg versions: + pkg_versions[rpm_package_origin_url("0xFFFF")].update( + { + "36/Everything/0.10-4": { + "name": "0xFFFF", + "version": "0.10-4", + "build_time": "2022-01-19T19:13:53+00:00", + "url": rpm_src_package_url( + release="36", + component="Everything", + path="0/0xFFFF-0.10-4.fc36.src.rpm", + ), + "checksums": { + "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" + }, + } + } + ) + + # Third run (0xFFFF in fedora36 component got updated and it needs to be listed) + run_lister( + swh_scheduler, + 
releases=["26", "36"], + components=["Everything"], + pkg_versions=pkg_versions, + origin_count=1, + incremental=True, + ) diff --git a/swh/lister/rpm/tests/test_tasks.py b/swh/lister/rpm/tests/test_tasks.py new file mode 100644 index 0000000000000000000000000000000000000000..5f08f15be114058b71f99aefa7b23b1b05d3dcdc --- /dev/null +++ b/swh/lister/rpm/tests/test_tasks.py @@ -0,0 +1,67 @@ +# Copyright (C) 2022-2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from swh.lister.pattern import ListerStats + +from .test_lister import FEDORA_ARCHIVE_URL, FEDORA_INDEX_URL_TEMPLATES, FEDORA_URL + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.rpm.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +LISTER_KWARGS = dict( + url=FEDORA_URL, + instance="fedora", + rpm_src_data=[ + { + "base_url": FEDORA_ARCHIVE_URL, + "releases": ["36"], + "components": ["Everything"], + "index_url_templates": FEDORA_INDEX_URL_TEMPLATES, + } + ], +) + + +def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + lister = mocker.patch("swh.lister.rpm.tasks.RPMLister") + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + res = swh_scheduler_celery_app.send_task( + "swh.lister.rpm.tasks.FullRPMLister", + kwargs=LISTER_KWARGS, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**LISTER_KWARGS) + lister.run.assert_called_once_with() + + +def test_incremental_listing( + swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker +): + lister = mocker.patch("swh.lister.rpm.tasks.RPMLister") + lister.from_configfile.return_value = lister + 
lister.run.return_value = ListerStats(pages=10, origins=500) + + res = swh_scheduler_celery_app.send_task( + "swh.lister.rpm.tasks.IncrementalRPMLister", + kwargs=LISTER_KWARGS, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**LISTER_KWARGS, incremental=True) + lister.run.assert_called_once_with() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index 00b11f280d52bc52c224d35be0ef8c8f3887860f..a5645a610511371984caa2e892e41235da024790 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -38,9 +38,7 @@ lister_args = { "url": "https://guix.gnu.org/sources.json", "origin_upstream": "https://git.savannah.gnu.org/cgit/guix.git/", }, - "fedora": { - "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", - }, + "rpm": {"url": "http://opensuse.org", "instance": "openSUSE", "rpm_src_data": []}, "pagure": {"instance": "pagure.io"}, "gitweb": { "url": "https://git.distorted.org.uk/~mdw/", @@ -64,8 +62,7 @@ def test_get_lister_wrong_input(): def test_get_lister(swh_scheduler_config): """Instantiating a supported lister should be ok""" - # Drop launchpad lister from the lister to check, its test setup is more involved - # than the other listers and it's not currently done here + for lister_name in SUPPORTED_LISTERS: lst = get_lister( lister_name,