From 95714f6f375947ecfd926ffc2ecc36b5f61d4629 Mon Sep 17 00:00:00 2001 From: Antoine Lambert <antoine.lambert@inria.fr> Date: Wed, 16 Aug 2023 13:25:23 +0000 Subject: [PATCH] rpm: Turn fedora lister into a generic Red Hat based distribution one As Red Hat based linux distributions share the same type of package repository, rework the fedora lister into a generic one to list RPM source packages and their versions from numerous distributions. For a given distribution, the RPM lister will fetch packages metadata from a list of release identifiers and a list of software components. Source packages are then processed and relevant info is extracted to be sent to the RPM loader. Once all releases and components have been processed, the lister has collected all versions for each package name and sends that info to the scheduler, which will create RPM loading tasks afterwards. Nevertheless, as there is no generic way to list all releases and components for a given distribution, nor to guess the right URL to retrieve packages metadata from, that info needs to be manually provided to the lister as input parameters. Some examples of those parameters for various distributions can be found in the config directory of the lister. Regarding the produced origin URLs, as there is no way to find valid HTTP ones for all distributions, the same behavior as with the debian lister is used and they have the following form: rpm://{instance}/packages/{package_name} where the instance variable corresponds to the name of the listed distribution such as Fedora, CentOS, or openSUSE. Related to swh/meta#5011. 
--- README.md | 2 +- setup.py | 2 +- swh/lister/fedora/lister.py | 265 --------------- swh/lister/fedora/tasks.py | 21 -- swh/lister/fedora/tests/test_lister.py | 221 ------------ swh/lister/fedora/tests/test_tasks.py | 60 ---- swh/lister/{fedora => rpm}/__init__.py | 6 +- swh/lister/rpm/config/centos.yml | 100 ++++++ swh/lister/rpm/config/fedora.yml | 77 +++++ swh/lister/rpm/config/opensuse.yml | 26 ++ swh/lister/rpm/config/oracle.yml | 156 +++++++++ swh/lister/rpm/config/rockylinux.yml | 38 +++ swh/lister/rpm/lister.py | 314 ++++++++++++++++++ swh/lister/rpm/tasks.py | 28 ++ swh/lister/{fedora => rpm}/tests/__init__.py | 0 .../primary26.xml.gz | Bin .../primary36-altered.xml.gz | Bin .../primary36.xml.gz | Bin .../archives.fedoraproject.org/repomd26.xml | 0 .../archives.fedoraproject.org/repomd36.xml | 0 swh/lister/rpm/tests/test_lister.py | 283 ++++++++++++++++ swh/lister/rpm/tests/test_tasks.py | 67 ++++ swh/lister/tests/test_cli.py | 7 +- 23 files changed, 1096 insertions(+), 577 deletions(-) delete mode 100644 swh/lister/fedora/lister.py delete mode 100644 swh/lister/fedora/tasks.py delete mode 100644 swh/lister/fedora/tests/test_lister.py delete mode 100644 swh/lister/fedora/tests/test_tasks.py rename swh/lister/{fedora => rpm}/__init__.py (69%) create mode 100644 swh/lister/rpm/config/centos.yml create mode 100644 swh/lister/rpm/config/fedora.yml create mode 100644 swh/lister/rpm/config/opensuse.yml create mode 100644 swh/lister/rpm/config/oracle.yml create mode 100644 swh/lister/rpm/config/rockylinux.yml create mode 100644 swh/lister/rpm/lister.py create mode 100644 swh/lister/rpm/tasks.py rename swh/lister/{fedora => rpm}/tests/__init__.py (100%) rename swh/lister/{fedora => rpm}/tests/data/archives.fedoraproject.org/primary26.xml.gz (100%) rename swh/lister/{fedora => rpm}/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz (100%) rename swh/lister/{fedora => rpm}/tests/data/archives.fedoraproject.org/primary36.xml.gz (100%) rename 
swh/lister/{fedora => rpm}/tests/data/archives.fedoraproject.org/repomd26.xml (100%) rename swh/lister/{fedora => rpm}/tests/data/archives.fedoraproject.org/repomd36.xml (100%) create mode 100644 swh/lister/rpm/tests/test_lister.py create mode 100644 swh/lister/rpm/tests/test_tasks.py diff --git a/README.md b/README.md index e0332a29..91d6e797 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ following Python modules: - `swh.lister.cgit` - `swh.lister.cran` - `swh.lister.debian` -- `swh.liser.fedora` - `swh.lister.gitea` - `swh.lister.github` - `swh.lister.gitlab` @@ -27,6 +26,7 @@ following Python modules: - `swh.lister.packagist` - `swh.lister.phabricator` - `swh.lister.pypi` +- `swh.lister.rpm` - `swh.lister.tuleap` Dependencies diff --git a/setup.py b/setup.py index 983feb03..89954354 100755 --- a/setup.py +++ b/setup.py @@ -65,7 +65,6 @@ setup( lister.cran=swh.lister.cran:register lister.crates=swh.lister.crates:register lister.debian=swh.lister.debian:register - lister.fedora=swh.lister.fedora:register lister.gitea=swh.lister.gitea:register lister.github=swh.lister.github:register lister.gitiles=swh.lister.gitiles:register @@ -87,6 +86,7 @@ setup( lister.pubdev=swh.lister.pubdev:register lister.puppet=swh.lister.puppet:register lister.pypi=swh.lister.pypi:register + lister.rpm=swh.lister.rpm:register lister.rubygems=swh.lister.rubygems:register lister.sourceforge=swh.lister.sourceforge:register lister.stagit=swh.lister.stagit:register diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py deleted file mode 100644 index 34712b37..00000000 --- a/swh/lister/fedora/lister.py +++ /dev/null @@ -1,265 +0,0 @@ -# Copyright (C) 2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from dataclasses import dataclass, field -from datetime import datetime, timezone 
-import logging -from typing import Any, Dict, Iterator, List, Optional, Set, Type -from urllib.error import HTTPError -from urllib.parse import urljoin - -import repomd - -from swh.scheduler.interface import SchedulerInterface -from swh.scheduler.model import ListedOrigin - -from ..pattern import Lister - -logger = logging.getLogger(__name__) - - -Release = int -Edition = str -PkgName = str -PkgVersion = str -FedoraOrigin = str -FedoraPageType = Type[repomd.Repo] -"""Each page is a list of packages from a given Fedora (release, edition) pair""" - - -def get_editions(release: Release) -> List[Edition]: - """Get list of editions for a given release.""" - # Ignore dirs that don't contain .rpm files: - # Docker,CloudImages,Atomic*,Spins,Live,Cloud_Atomic,Silverblue - - if release < 20: - return ["Everything", "Fedora"] - elif release < 28: - return ["Everything", "Server", "Workstation"] - else: - return ["Everything", "Server", "Workstation", "Modular"] - - -def get_last_modified(pkg: repomd.Package) -> datetime: - """Get timezone aware last modified time in UTC from RPM package metadata.""" - ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build") - return datetime.utcfromtimestamp(int(ts)).replace(tzinfo=timezone.utc) - - -def get_checksums(pkg: repomd.Package) -> Dict[str, str]: - """Get checksums associated to rpm archive.""" - cs = pkg._element.find("common:checksum", namespaces=repomd._ns) - cs_type = cs.get("type") - if cs_type == "sha": - cs_type = "sha1" - return {cs_type: cs.text} - - -@dataclass -class FedoraListerState: - """State of Fedora lister""" - - package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) - """Dictionary mapping a package name to all the versions found during - last listing""" - - -class FedoraLister(Lister[FedoraListerState, FedoraPageType]): - """ - List source packages for given Fedora releases. - - The lister will create a snapshot for each package name from all its - available versions. 
- - If a package snapshot is different from the last listing operation, - it will be sent to the scheduler that will create a loading task - to archive newly found source code. - - Args: - scheduler: instance of SchedulerInterface - url: fedora package archives mirror URL - releases: list of fedora releases to process - """ - - LISTER_NAME = "fedora" - - def __init__( - self, - scheduler: SchedulerInterface, - instance: str = "fedora", - url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", - releases: List[Release] = [34, 35, 36], - max_origins_per_page: Optional[int] = None, - max_pages: Optional[int] = None, - enable_origins: bool = True, - ): - super().__init__( - scheduler=scheduler, - url=url, - instance=instance, - credentials={}, - max_origins_per_page=max_origins_per_page, - max_pages=max_pages, - enable_origins=enable_origins, - ) - - self.releases = releases - - self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {} - "will hold all listed origins info" - self.origins_to_send: Set[FedoraOrigin] = set() - "will hold updated origins since last listing" - self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} - "will contain the lister state after a call to run" - self.last_page = False - - def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState: - return FedoraListerState(package_versions={k: set(v) for k, v in d.items()}) - - def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]: - return {k: list(v) for k, v in state.package_versions.items()} - - def page_request(self, release: Release, edition: Edition) -> FedoraPageType: - """Return parsed packages for a given fedora release.""" - index_url = urljoin( - self.url, - f"{release}/{edition}/source/SRPMS/" - if release < 24 - else f"{release}/{edition}/source/tree/", - ) - - repo = repomd.load(index_url) # throws error if no repomd.xml is not found - self.last_page = ( - release == self.releases[-1] and edition == get_editions(release)[-1] - ) - 
- logger.debug( - "Fetched metadata from url: %s, found %d packages", index_url, len(repo) - ) - # TODO: Extract more fields like "provides" and "requires" from *primary.xml - # as extrinsic metadata using the pkg._element.findtext method - return repo - - def get_pages(self) -> Iterator[FedoraPageType]: - """Return an iterator on parsed fedora packages, one page per (release, edition) pair""" - - for release in self.releases: - for edition in get_editions(release): - logger.debug("Listing fedora release %s edition %s", release, edition) - self.current_release = release - self.current_edition = edition - try: - yield self.page_request(release, edition) - except HTTPError as http_error: - if http_error.getcode() == 404: - logger.debug( - "No packages metadata found for fedora release %s edition %s", - release, - edition, - ) - continue - raise - - def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin: - """Return the origin url for the given package""" - return f"https://src.fedoraproject.org/rpms/{package_name}" - - def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]: - """Convert a page of fedora package sources into an iterator of ListedOrigin.""" - assert self.lister_obj.id is not None - - origins_to_send = set() - - # iterate on each package's metadata - for pkg_metadata in page: - # extract package metadata - package_name = pkg_metadata.name - package_version = pkg_metadata.vr - package_version_split = package_version.split(".") - if package_version_split[-1].startswith("fc"): - # remove trailing ".fcXY" in version for the rpm loader to avoid - # creating multiple releases targeting same directory - package_version = ".".join(package_version_split[:-1]) - - package_build_time = get_last_modified(pkg_metadata) - package_download_path = pkg_metadata.location - - # build origin url - origin_url = self.origin_url_for_package(package_name) - # create package version key as expected by the fedora (rpm) loader - 
package_version_key = ( - f"fedora{self.current_release}/{self.current_edition}/" - f"{package_version}" - ).lower() - - # this is the first time a package is listed - if origin_url not in self.listed_origins: - # create a ListedOrigin object for it that can be later - # updated with new package versions info - self.listed_origins[origin_url] = ListedOrigin( - lister_id=self.lister_obj.id, - url=origin_url, - visit_type="rpm", - extra_loader_arguments={"packages": {}}, - last_update=package_build_time, - ) - - # init set that will contain all listed package versions - self.package_versions[package_name] = set() - - # origin will be yielded at the end of that method - origins_to_send.add(origin_url) - - # update package metadata in parameter that will be provided - # to the rpm loader - self.listed_origins[origin_url].extra_loader_arguments["packages"][ - package_version_key - ] = { - "name": package_name, - "version": package_version, - "url": urljoin(page.baseurl, package_download_path), - "buildTime": package_build_time.isoformat(), - "checksums": get_checksums(pkg_metadata), - } - - last_update = self.listed_origins[origin_url].last_update - if last_update is not None and package_build_time > last_update: - self.listed_origins[origin_url].last_update = package_build_time - - # add package version key to the set of found versions - self.package_versions[package_name].add(package_version_key) - - # package has already been listed during a previous listing process - if package_name in self.state.package_versions: - new_versions = ( - self.package_versions[package_name] - - self.state.package_versions[package_name] - ) - # no new versions so far, no need to send the origin to the scheduler - if not new_versions: - origins_to_send.remove(origin_url) - - logger.debug( - "Found %s packages to update (new ones or packages with new versions).", - len(origins_to_send), - ) - logger.debug( - "Current total number of listed packages is equal to %s.", - 
len(self.listed_origins), - ) - - # yield from origins_to_send.values() - self.origins_to_send.update(origins_to_send) - - if self.last_page: - # yield listed origins when all fedora releases and editions processed - yield from [ - self.listed_origins[origin_url] for origin_url in self.origins_to_send - ] - - def finalize(self): - # set mapping between listed package names and versions as lister state - self.state.package_versions = self.package_versions - self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/fedora/tasks.py b/swh/lister/fedora/tasks.py deleted file mode 100644 index 18c8a605..00000000 --- a/swh/lister/fedora/tasks.py +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (C) 2022 the Software Heritage developers -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from typing import Dict - -from celery import shared_task - -from .lister import FedoraLister - - -@shared_task(name=__name__ + ".FullFedoraRelister") -def list_fedora_full(**lister_args) -> Dict[str, int]: - """Full update of a Fedora instance""" - lister = FedoraLister.from_configfile(**lister_args) - return lister.run().dict() - - -@shared_task(name=__name__ + ".ping") -def _ping() -> str: - return "OK" diff --git a/swh/lister/fedora/tests/test_lister.py b/swh/lister/fedora/tests/test_lister.py deleted file mode 100644 index dc093597..00000000 --- a/swh/lister/fedora/tests/test_lister.py +++ /dev/null @@ -1,221 +0,0 @@ -# Copyright (C) 2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from io import StringIO -from pathlib import Path -from typing import List -from unittest.mock import MagicMock -from urllib.error import HTTPError - -import pytest - -from swh.lister.fedora.lister import FedoraLister, Release, get_editions -from 
swh.scheduler.interface import SchedulerInterface - - -def mock_repomd(datadir, mocker, use_altered_fedora36=False): - """Mocks the .xml files fetched by repomd for the next lister run""" - paths = ["repomd26.xml", "primary26.xml.gz", "repomd36.xml", "primary36.xml.gz"] - if use_altered_fedora36: - paths[3] = "primary36-altered.xml.gz" - - cm = MagicMock() - cm.read.side_effect = [ - Path(datadir, "archives.fedoraproject.org", path).read_bytes() for path in paths - ] - cm.__enter__.return_value = cm - mocker.patch("repomd.urllib.request.urlopen").return_value = cm - - -def rpm_url(release, path): - return ( - "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/" - f"{release}/Everything/source/tree/Packages/{path}" - ) - - -@pytest.fixture -def pkg_versions(): - return { - "https://src.fedoraproject.org/rpms/0install": { - "fedora26/everything/2.11-4": { - "name": "0install", - "version": "2.11-4", - "buildTime": "2017-02-10T04:59:31+00:00", - "url": rpm_url(26, "0/0install-2.11-4.fc26.src.rpm"), - "checksums": { - # note: we intentionally altered the original - # primary26.xml file to test sha1 usage - "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b", - }, - } - }, - "https://src.fedoraproject.org/rpms/0xFFFF": { - "fedora26/everything/0.3.9-15": { - "name": "0xFFFF", - "version": "0.3.9-15", - "buildTime": "2017-02-10T05:01:53+00:00", - "url": rpm_url(26, "0/0xFFFF-0.3.9-15.fc26.src.rpm"), - "checksums": { - "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f" - }, - }, - "fedora36/everything/0.9-4": { - "name": "0xFFFF", - "version": "0.9-4", - "buildTime": "2022-01-19T19:13:53+00:00", - "url": rpm_url(36, "0/0xFFFF-0.9-4.fc36.src.rpm"), - "checksums": { - "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" - }, - }, - }, - "https://src.fedoraproject.org/rpms/2ping": { - "fedora36/everything/4.5.1-2": { - "name": "2ping", - "version": "4.5.1-2", - "buildTime": "2022-01-19T19:12:21+00:00", - 
"url": rpm_url(36, "2/2ping-4.5.1-2.fc36.src.rpm"), - "checksums": { - "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28" - }, - } - }, - } - - -def run_lister( - swh_scheduler: SchedulerInterface, - releases: List[Release], - pkg_versions: dict, - origin_count: int, - updated: bool = True, -): - """Runs the lister and tests that the listed origins are correct.""" - lister = FedoraLister(scheduler=swh_scheduler, releases=releases) - - stats = lister.run() - scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results - lister_state = lister.get_state_from_scheduler() - state_pkg_versions = {k.split("/")[-1]: set(v) for k, v in pkg_versions.items()} - - # One edition from each release (we mocked get_editions) - assert stats.pages == (len(releases) if updated else 0) - assert stats.origins == origin_count - - assert { - o.url: o.extra_loader_arguments["packages"] for o in scheduler_origins - } == pkg_versions - - assert lister_state.package_versions == state_pkg_versions - assert lister.updated == updated - - -def test_get_editions(): - assert get_editions(18) == ["Everything", "Fedora"] - assert get_editions(26) == ["Everything", "Server", "Workstation"] - assert get_editions(34) == ["Everything", "Server", "Workstation", "Modular"] - - -@pytest.mark.parametrize("status_code", [400, 404, 500]) -def test_fedora_lister_http_error( - swh_scheduler: SchedulerInterface, mocker: MagicMock, status_code: int -): - """ - Simulates handling of HTTP Errors while fetching of packages for fedora releases. 
- """ - releases = [18] - - is_404 = status_code == 404 - - def side_effect(url): - if is_404: - raise HTTPError( - url, status_code, "Not Found", {"content-type": "text/html"}, StringIO() - ) - else: - raise HTTPError( - url, - status_code, - "Internal server error", - {"content-type": "text/html"}, - StringIO(), - ) - - urlopen_patch = mocker.patch("repomd.urllib.request.urlopen") - urlopen_patch.side_effect = side_effect - - expected_pkgs: dict = {} - - if is_404: - run_lister( - swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False - ) - else: - with pytest.raises(HTTPError): - run_lister( - swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False - ) - - -def test_full_lister_fedora( - swh_scheduler: SchedulerInterface, - mocker: MagicMock, - datadir: Path, - pkg_versions: dict, -): - """ - Simulates a full listing of packages for fedora releases. - """ - releases = [26, 36] - - get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions") - get_editions_patch.return_value = ["Everything"] - - mock_repomd(datadir, mocker) - run_lister(swh_scheduler, releases, pkg_versions, origin_count=3) - - -def test_incremental_lister( - swh_scheduler: SchedulerInterface, - mocker: MagicMock, - datadir: Path, - pkg_versions: dict, -): - """ - Simulates an incremental listing of packages for fedora releases. 
- """ - releases = [26, 36] - - get_editions_patch = mocker.patch("swh.lister.fedora.lister.get_editions") - get_editions_patch.return_value = ["Everything"] - - # First run - mock_repomd(datadir, mocker) - run_lister(swh_scheduler, releases, pkg_versions, origin_count=3) - # Second run (no updates) - mock_repomd(datadir, mocker) - run_lister(swh_scheduler, releases, pkg_versions, origin_count=0) - - # Use an altered version of primary36.xml in which we updated the version - # of package 0xFFFF to 0.10: - mock_repomd(datadir, mocker, use_altered_fedora36=True) - # Add new version to the set of expected pkg versions: - pkg_versions["https://src.fedoraproject.org/rpms/0xFFFF"].update( - { - "fedora36/everything/0.10-4": { - "name": "0xFFFF", - "version": "0.10-4", - "buildTime": "2022-01-19T19:13:53+00:00", - "url": rpm_url(36, "0/0xFFFF-0.10-4.fc36.src.rpm"), - "checksums": { - "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" - }, - } - } - ) - - # Third run (0xFFFF in fedora36 editions got updated and it needs to be listed) - run_lister(swh_scheduler, releases, pkg_versions, origin_count=1) diff --git a/swh/lister/fedora/tests/test_tasks.py b/swh/lister/fedora/tests/test_tasks.py deleted file mode 100644 index 7fd4236e..00000000 --- a/swh/lister/fedora/tests/test_tasks.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (C) 2022 The Software Heritage developers -# See the AUTHORS file at the top-level directory of this distribution -# License: GNU General Public License version 3, or any later version -# See top-level LICENSE file for more information - -from unittest.mock import patch - -from swh.lister.pattern import ListerStats - - -def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): - res = swh_scheduler_celery_app.send_task("swh.lister.fedora.tasks.ping") - assert res - res.wait() - assert res.successful() - assert res.result == "OK" - - -@patch("swh.lister.fedora.tasks.FedoraLister") -def test_full_listing(lister, 
swh_scheduler_celery_app, swh_scheduler_celery_worker): - lister.from_configfile.return_value = lister - lister.run.return_value = ListerStats(pages=10, origins=500) - - kwargs = dict( - url="https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/" - ) - res = swh_scheduler_celery_app.send_task( - "swh.lister.fedora.tasks.FullFedoraRelister", - kwargs=kwargs, - ) - assert res - res.wait() - assert res.successful() - - lister.from_configfile.assert_called_once_with(**kwargs) - lister.run.assert_called_once_with() - - -@patch("swh.lister.fedora.tasks.FedoraLister") -def test_full_listing_params( - lister, swh_scheduler_celery_app, swh_scheduler_celery_worker -): - lister.from_configfile.return_value = lister - lister.run.return_value = ListerStats(pages=10, origins=500) - - kwargs = dict( - url="https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", - instance="archives.fedoraproject.org", - releases=["36"], - ) - res = swh_scheduler_celery_app.send_task( - "swh.lister.fedora.tasks.FullFedoraRelister", - kwargs=kwargs, - ) - assert res - res.wait() - assert res.successful() - - lister.from_configfile.assert_called_once_with(**kwargs) - lister.run.assert_called_once_with() diff --git a/swh/lister/fedora/__init__.py b/swh/lister/rpm/__init__.py similarity index 69% rename from swh/lister/fedora/__init__.py rename to swh/lister/rpm/__init__.py index 6fb3a148..54d6da19 100644 --- a/swh/lister/fedora/__init__.py +++ b/swh/lister/rpm/__init__.py @@ -1,13 +1,13 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information def register(): - from .lister import FedoraLister + from .lister import RPMLister return { - "lister": FedoraLister, + "lister": RPMLister, "task_modules": [f"{__name__}.tasks"], } 
diff --git a/swh/lister/rpm/config/centos.yml b/swh/lister/rpm/config/centos.yml new file mode 100644 index 00000000..25c6b529 --- /dev/null +++ b/swh/lister/rpm/config/centos.yml @@ -0,0 +1,100 @@ +# RPM lister parameters to process CentOS source packages + +url: https://www.centos.org +instance: CentOS +rpm_src_data: + - base_url: https://vault.centos.org/ + releases: + - "3.7" + - "3.8" + - "3.9" + - "4.0" + - "4.1" + - "4.2" + - "4.3" + - "4.4" + - "4.5" + - "4.6" + - "4.7" + - "4.8" + - "4.9" + - "5.0" + - "5.1" + - "5.2" + - "5.3" + - "5.4" + - "5.5" + - "5.6" + - "5.7" + - "5.8" + - "5.9" + - "5.10" + - "5.11" + - "6.0" + - "6.1" + - "6.2" + - "6.3" + - "6.4" + - "6.5" + - "6.6" + - "6.7" + - "6.8" + - "6.9" + - "6.10" + - "7.0.1406" + - "7.1.1503" + - "7.2.1511" + - "7.3.1611" + - "7.4.1708" + - "7.5.1804" + - "7.6.1810" + - "7.7.1908" + - "7.8.2003" + - "7.9.2009" + - "8-stream" + - "8.0.1905" + - "8.1.1911" + - "8.2.2004" + - "8.3.2011" + - "8.4.2105" + - "8.5.2111" + components: + - AppStream + - BaseOS + - HighAvailability + - PowerTools + - SCL + - addons + - centosplus + - contrib + - cr + - csgfs + - dotnet + - extras + - fasttrack + - opstools + - os + - rt + - testing + - updates + - xen4 + + index_url_templates: + - $base_url/$release/$component/Source/ + - $base_url/$release/$component/SRPMS/ + - $base_url/$release/$component/x86_64/ + + - base_url: https://mirror.stream.centos.org + releases: + - 9-stream + components: + - AppStream + - BaseOS + - CRB + - HighAvailability + - NFV + - RT + - ResilientStorage + + index_url_templates: + - $base_url/$release/$component/source/tree/ diff --git a/swh/lister/rpm/config/fedora.yml b/swh/lister/rpm/config/fedora.yml new file mode 100644 index 00000000..382863ea --- /dev/null +++ b/swh/lister/rpm/config/fedora.yml @@ -0,0 +1,77 @@ +# RPM lister parameters to process Fedora source packages + +url: https://fedoraproject.org +instance: "Fedora" +rpm_src_data: + - base_url: 
https://archives.fedoraproject.org/pub/archive/fedora/linux/ + releases: + - "2" + - "3" + - "4" + - "5" + - "6" + components: + - core + - extras + index_url_templates: + - $base_url/$component/$release/SRPMS + - $base_url/$component/$release/source/SRPMS + - $base_url/$component/$release/x86_64/os/ + + - base_url: https://archives.fedoraproject.org/pub/archive/fedora/linux/ + releases: + - "7" + - "8" + - "9" + - "10" + - "11" + - "12" + - "13" + - "14" + - "15" + - "16" + - "17" + - "18" + - "19" + - "20" + - "21" + - "22" + - "23" + - "24" + - "25" + - "26" + - "27" + - "28" + - "29" + - "30" + - "31" + - "32" + - "33" + - "34" + - "35" + components: + - Everything + - Server + - Workstation + - Modular + - Fedora + index_url_templates: + - $base_url/releases/$release/$component/source/tree/ + - $base_url/updates/$release/$component/source/tree/ + - $base_url/releases/$release/$component/source/SRPMS/ + - $base_url/updates/$release/SRPMS/ + + - base_url: https://dl.fedoraproject.org/pub/fedora/linux/ + releases: + - "36" + - "37" + - "38" + components: + - Everything + - Server + - Workstation + - Modular + - Fedora + index_url_templates: + - $base_url/releases/$release/$component/source/tree/ + - $base_url/updates/$release/$component/source/tree/ diff --git a/swh/lister/rpm/config/opensuse.yml b/swh/lister/rpm/config/opensuse.yml new file mode 100644 index 00000000..461d3173 --- /dev/null +++ b/swh/lister/rpm/config/opensuse.yml @@ -0,0 +1,26 @@ +# RPM lister parameters to process openSUSE source packages + +url: http://opensuse.org +instance: openSUSE +rpm_src_data: + - base_url: http://download.opensuse.org/source/ + releases: + - tumbleweed + - jump/15.2 + - leap/15.0-Current + - leap/15.0 + - leap/15.1 + - leap/15.2 + - leap/15.3 + - leap/15.4 + - leap/15.5 + - leap/42.2 + - leap/42.3-Current + - leap/42.3 + components: + - oss + - non-oss + index_url_templates: + - $base_url/distribution/$release/repo/$component/ + - 
$base_url/distribution/$release/repo/$component/suse/ + - $base_url/$release/repo/$component/ diff --git a/swh/lister/rpm/config/oracle.yml b/swh/lister/rpm/config/oracle.yml new file mode 100644 index 00000000..c5c64e6c --- /dev/null +++ b/swh/lister/rpm/config/oracle.yml @@ -0,0 +1,156 @@ +# RPM lister parameters to process Oracle Linux source packages + +url: https://www.oracle.com/linux +instance: OracleLinux +rpm_src_data: + - base_url: https://yum.oracle.com/repo/EnterpriseLinux/ + releases: + - EL5 + components: + - addons + - oracle_addons + - unsupported + - 0/base + - 1/base + - 2/base + - 3/base + - 4/base + - 5/base + + index_url_templates: + - $base_url/$release/$component/x86_64 + + - base_url: https://yum.oracle.com/repo/OracleLinux/ + releases: + - OL5 + - OL6 + - OL7 + - OL8 + - OL9 + components: + - 0/base + - 0/baseos/base + - 1/base + - 1/baseos/base + - 10/base + - 11/base + - 2/base + - 2/baseos/base + - 3/base + - 3/baseos/base + - 4/base + - 4/baseos/base + - 4/security/validation + - 5/base + - 5/baseos/base + - 6/base + - 6/baseos/base + - 7/base + - 7/baseos/base + - 8/base + - 8/baseos/base + - 8/security/validation + - 9/base + - MODRHCK + - MySQL + - MySQL56 + - MySQL57_community + - MySQL80/community + - MySQL80/connectors/community + - MySQL80/tools/community + - MySQL80_community + - RDMA + - SoftwareCollections + - UEK/latest + - UEKR3 + - UEKR3/latest + - UEKR3_OFED20 + - UEKR4 + - UEKR4/OFED + - UEKR4/archive + - UEKR5 + - UEKR5/RDMA + - UEKR5/archive + - UEKR6 + - UEKR6/RDMA + - UEKR7 + - UEKR7/RDMA + - addons + - appstream + - appstream/developer + - automation2 + - baseos/developer + - baseos/latest + - beta + - ceph + - ceph30 + - codeready/builder + - codeready/builder/developer + - developer + - developer/EPEL + - developer/EPEL/modular + - developer/UEKR5 + - developer/UEKR6 + - developer/UEKR7 + - developer/golang117 + - developer/golang118 + - developer/golang119 + - developer/kvm/utils + - developer/nodejs12 + - 
developer/olcne + - developer/php74 + - developer_EPEL + - developer_gluster310 + - developer_gluster312 + - distro/builder + - gluster/appstream + - gluster312 + - gluster41 + - gluster5 + - gluster6 + - gluster8 + - kvm/appstream + - kvm/utils + - latest + - latest/archive + - leapp + - ofed_UEK + - olcne + - olcne11 + - olcne12 + - olcne13 + - olcne14 + - olcne15 + - olcne16 + - openstack10 + - openstack21 + - openstack30 + - openstack40 + - openstack40_extras + - openstack50 + - openstack50_extras + - optional + - optional/archive + - optional/beta + - oracle/instantclient + - oracle/instantclient21 + - oraclelinuxmanager210/client + - oraclelinuxmanager210/server + - ovirt42 + - ovirt42/extras + - ovirt43 + - ovirt43/extras + - ovirt44 + - ovirt44/extras + - security/validation + - spacewalk210/client + - spacewalk210/server + - spacewalk24/client + - spacewalk24/server + - spacewalk26/client + - spacewalk26/server + - spacewalk27/client + - spacewalk27/server + + index_url_templates: + - $base_url/$release/$component/x86_64 diff --git a/swh/lister/rpm/config/rockylinux.yml b/swh/lister/rpm/config/rockylinux.yml new file mode 100644 index 00000000..3f7bf2bd --- /dev/null +++ b/swh/lister/rpm/config/rockylinux.yml @@ -0,0 +1,38 @@ +# RPM lister parameters to process Rocky Linux source packages + +url: https://rockylinux.org +instance: RockyLinux +rpm_src_data: + - base_url: https://download.rockylinux.org/ + releases: + - "8.3" + - "8.4" + - "8.4-RC1" + - "8.5" + - "8.6" + - "8.7" + - "8.8" + - "9.0" + - "9.1" + - "9.2" + components: + - AppStream + - BaseOS + - Devel + - HighAvailability + - Minimal + - PowerTools + - ResilientStorage + - CRB + - NFV + - RT + - SAP + - SAPHANA + - devel + - extras + - plus + - nfv + - rockyrpi + index_url_templates: + - $base_url/vault/rocky/$release/$component/source/tree/ + - $base_url/pub/rocky/$release/$component/source/tree/ diff --git a/swh/lister/rpm/lister.py b/swh/lister/rpm/lister.py new file mode 100644 index 
00000000..a6eeb1fa --- /dev/null +++ b/swh/lister/rpm/lister.py @@ -0,0 +1,314 @@ +# Copyright (C) 2022-2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from dataclasses import dataclass, field +from datetime import datetime, timezone +from itertools import product +import logging +from string import Template +from typing import Any, Dict, Iterator, List, Optional, Set, Tuple +from urllib.parse import urljoin + +import repomd +from typing_extensions import TypedDict + +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from ..pattern import Lister + +logger = logging.getLogger(__name__) + + +Release = str +Component = str +PkgName = str +PkgVersion = str +RPMOrigin = str + +RPMPageType = Optional[Tuple[Release, Component, repomd.Repo]] +"""Each page is a list of packages for a given (release, component) pair +from a Red Hat based distribution.""" + + +class RPMSourceData(TypedDict): + """Dictionary holding relevant data for listing RPM source packages. + + See content of the lister config directory to get examples of RPM + source data for famous Red Hat based distributions. + """ + + base_url: str + """Base URL of an RPM repository""" + releases: List[Release] + """List of release identifiers for a Red Hat based distribution""" + components: List[Component] + """List of components for a Red Hat based distribution""" + index_url_templates: List[str] + """List of URL templates to discover source packages metadata, the + following variables can be substituted in them: ``base_url``, ``release`` + and ``component``, see :class:`string.Template` for more details about the + format.
The generated URLs must target directories containing a sub-directory + named ``repodata``, which contains packages metadata, in order to be + successfully processed by the lister.""" + + +def _get_last_modified(pkg: repomd.Package) -> datetime: + """Get timezone aware last modified time in UTC from RPM package metadata.""" + ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build") + return datetime.utcfromtimestamp(int(ts)).replace(tzinfo=timezone.utc) + + +def _get_checksums(pkg: repomd.Package) -> Dict[str, str]: + """Get checksums associated to rpm archive.""" + cs = pkg._element.find("common:checksum", namespaces=repomd._ns) + cs_type = cs.get("type") + if cs_type == "sha": + cs_type = "sha1" + return {cs_type: cs.text} + + +@dataclass +class RPMListerState: + """State of RPM lister""" + + package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict) + """Dictionary mapping a package name to all the versions found during + last listing""" + + +class RPMLister(Lister[RPMListerState, RPMPageType]): + """ + List source packages for a Red Hat based linux distribution. + + The lister creates a snapshot for each package from all its available versions. + + In incremental mode, only packages with different snapshot since the last listing + operation will be sent to the scheduler that will create loading tasks to archive + newly found source code. + + Args: + scheduler: instance of SchedulerInterface + url: Red Hat based distribution info URL + instance: name of Red Hat based distribution + rpm_src_data: list of dictionaries holding data required to list RPM source packages, + see examples in the config directory. 
+ incremental: if :const:`True`, only packages with new versions are sent to the + scheduler when relisting + """ + + LISTER_NAME = "rpm" + + def __init__( + self, + scheduler: SchedulerInterface, + url: str, + instance: str, + rpm_src_data: List[RPMSourceData], + incremental: bool = False, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, + ): + super().__init__( + scheduler=scheduler, + url=url, + instance=instance, + credentials={}, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, + ) + + self.rpm_src_data = rpm_src_data + self.incremental = incremental + + self.listed_origins: Dict[RPMOrigin, ListedOrigin] = {} + self.origins_to_send: Set[RPMOrigin] = set() + self.package_versions: Dict[PkgName, Set[PkgVersion]] = {} + + def state_from_dict(self, d: Dict[str, Any]) -> RPMListerState: + return RPMListerState(package_versions={k: set(v) for k, v in d.items()}) + + def state_to_dict(self, state: RPMListerState) -> Dict[str, Any]: + return {k: list(v) for k, v in state.package_versions.items()} + + def repo_request( + self, + index_url_template: Template, + base_url: str, + release: Release, + component: Component, + ) -> Optional[RPMPageType]: + """Return parsed packages for a given distribution release and component.""" + + index_url = index_url_template.substitute( + base_url=base_url.rstrip("/"), release=release, component=component + ) + + try: + repo = repomd.load(index_url) # raises an error if repomd.xml is not found + except Exception: + logger.debug("Repository metadata not found at URL %s", index_url) + return None + else: + logger.debug( + "Fetched metadata from url: %s, found %d packages", index_url, len(repo) + ) + return repo + + def get_pages(self) -> Iterator[RPMPageType]: + """Return an iterator on parsed rpm packages, one page per (release, component) pair.""" + for rpm_src_data in self.rpm_src_data: + index_url_templates = [ +
Template(index_url_template) + for index_url_template in rpm_src_data["index_url_templates"] + ] + # try all possible package repository URLs for each (release, component) pair + for release, component, index_url_template in product( + rpm_src_data["releases"], + rpm_src_data["components"], + index_url_templates, + ): + repo = self.repo_request( + index_url_template, + rpm_src_data["base_url"], + release, + component, + ) + if repo is not None: + # valid package repository found, yield page + yield (release, component, repo) + + yield None + + def origin_url_for_package(self, package_name: PkgName) -> RPMOrigin: + """Return the origin url for the given package.""" + # TODO: Use a better origin URL before deploying the lister to production + # https://gitlab.softwareheritage.org/swh/devel/swh-model/-/issues/4632 + return f"rpm://{self.instance}/packages/{package_name}" + + def get_origins_from_page(self, page: RPMPageType) -> Iterator[ListedOrigin]: + """Convert a page of rpm package sources into an iterator of ListedOrigin.""" + assert self.lister_obj.id is not None + + if page is None: + # all pages processed, yield listed origins + for origin_url in self.origins_to_send: + yield self.listed_origins[origin_url] + return + + release, component, repo = page + + logger.debug( + "Listing %s release %s component %s from repository metadata located at %s", + self.instance, + release, + component, + repo.baseurl, + ) + + origins_to_send = set() + new_origins_count = 0 + + # iterate on each package's metadata + for pkg_metadata in repo: + + if pkg_metadata.arch != "src": + # not a source package, skip it + continue + + # extract package metadata + package_name = pkg_metadata.name + + # we extract the intrinsic version of the package for the rpm loader + # to avoid creating different releases targeting the same directory + # 2.12-10.el8 => 2.12-10 + package_version_split = pkg_metadata.vr.rsplit("-", maxsplit=1) + package_version = "-".join( + [ + package_version_split[0], 
+ package_version_split[1].split(".", maxsplit=1)[0], + ] + ) + + # create package version key as expected by the rpm loader + package_version_key = f"{release}/{component}/{package_version}" + + package_build_time = _get_last_modified(pkg_metadata) + package_download_url = urljoin( + repo.baseurl.rstrip("/") + "/", pkg_metadata.location + ) + checksums = _get_checksums(pkg_metadata) + + # build origin url + origin_url = self.origin_url_for_package(package_name) + + # this is the first time a package is listed + if origin_url not in self.listed_origins: + # create a ListedOrigin object for it that can be later + # updated with new package versions info + self.listed_origins[origin_url] = ListedOrigin( + lister_id=self.lister_obj.id, + url=origin_url, + visit_type="rpm", + extra_loader_arguments={"packages": {}}, + last_update=package_build_time, + ) + + # init set that will contain all listed package versions + self.package_versions[package_name] = set() + new_origins_count += 1 + + # origins will be yielded when all pages processed + origins_to_send.add(origin_url) + + # update package metadata in parameter that will be provided + # to the rpm loader + self.listed_origins[origin_url].extra_loader_arguments["packages"][ + package_version_key + ] = { + "name": package_name, + "version": package_version, + "url": package_download_url, + "build_time": package_build_time.isoformat(), + "checksums": checksums, + } + + last_update = self.listed_origins[origin_url].last_update + if last_update is not None and package_build_time > last_update: + self.listed_origins[origin_url].last_update = package_build_time + + # add package version key to the set of found versions + self.package_versions[package_name].add(package_version_key) + + # package has already been listed during a previous listing process + if self.incremental and package_name in self.state.package_versions: + new_versions = ( + self.package_versions[package_name] + - self.state.package_versions[package_name] + 
) + # no new versions so far, no need to send the origin to the scheduler + if not new_versions: + origins_to_send.remove(origin_url) + + logger.debug( + "Found %s packages to update (%s new ones and %s packages with new versions).", + len(origins_to_send), + new_origins_count, + len(origins_to_send) - new_origins_count, + ) + logger.debug( + "Current total number of listed source packages is equal to %s.", + len(self.listed_origins), + ) + + self.origins_to_send.update(origins_to_send) + + def finalize(self): + if self.incremental: + # set mapping between listed package names and versions as lister state + self.state.package_versions = self.package_versions + self.updated = len(self.listed_origins) > 0 diff --git a/swh/lister/rpm/tasks.py b/swh/lister/rpm/tasks.py new file mode 100644 index 00000000..ef160f77 --- /dev/null +++ b/swh/lister/rpm/tasks.py @@ -0,0 +1,28 @@ +# Copyright (C) 2022-2023 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from typing import Dict + +from celery import shared_task + +from .lister import RPMLister + + +@shared_task(name=__name__ + ".FullRPMLister") +def list_rpm_full(**lister_args) -> Dict[str, int]: + """Full listing of Red Hat based distribution source packages""" + lister = RPMLister.from_configfile(**lister_args) + return lister.run().dict() + + +@shared_task(name=__name__ + ".IncrementalRPMLister") +def list_rpm_incremental(**lister_args) -> Dict[str, int]: + """Incremental listing of Red Hat based distribution source packages""" + lister = RPMLister.from_configfile(**lister_args, incremental=True) + return lister.run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping() -> str: + return "OK" diff --git a/swh/lister/fedora/tests/__init__.py b/swh/lister/rpm/tests/__init__.py similarity index 100% rename from swh/lister/fedora/tests/__init__.py rename to swh/lister/rpm/tests/__init__.py diff --git 
a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml.gz b/swh/lister/rpm/tests/data/archives.fedoraproject.org/primary26.xml.gz similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/primary26.xml.gz rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/primary26.xml.gz diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz b/swh/lister/rpm/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/primary36-altered.xml.gz diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml.gz b/swh/lister/rpm/tests/data/archives.fedoraproject.org/primary36.xml.gz similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/primary36.xml.gz rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/primary36.xml.gz diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml b/swh/lister/rpm/tests/data/archives.fedoraproject.org/repomd26.xml similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd26.xml rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/repomd26.xml diff --git a/swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml b/swh/lister/rpm/tests/data/archives.fedoraproject.org/repomd36.xml similarity index 100% rename from swh/lister/fedora/tests/data/archives.fedoraproject.org/repomd36.xml rename to swh/lister/rpm/tests/data/archives.fedoraproject.org/repomd36.xml diff --git a/swh/lister/rpm/tests/test_lister.py b/swh/lister/rpm/tests/test_lister.py new file mode 100644 index 00000000..43c6e85e --- /dev/null +++ b/swh/lister/rpm/tests/test_lister.py @@ -0,0 +1,283 @@ +# Copyright (C) 2022-2023 The Software Heritage developers +# See the AUTHORS 
file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from pathlib import Path +from string import Template +from typing import List + +import pytest +from urllib3.exceptions import HTTPError + +from swh.lister.rpm.lister import Component, Release, RPMLister +from swh.scheduler.interface import SchedulerInterface + +FEDORA_URL = "https://fedoraproject.org/" +FEDORA_ARCHIVE_URL = "https://archives.fedoraproject.org/pub/archive/fedora/linux" + +FEDORA_INDEX_URL_TEMPLATES = [ + "$base_url/releases/$release/$component/source/tree/", + "$base_url/updates/$release/$component/source/tree/", + "$base_url/releases/$release/$component/source/SRPMS/", + "$base_url/updates/$release/SRPMS/", +] + + +def mock_repomd(mocker, side_effect): + """Mocks the .xml files fetched by repomd for the next lister run""" + cm = mocker.MagicMock() + cm.read.side_effect = side_effect + cm.__enter__.return_value = cm + mocker.patch("repomd.urllib.request.urlopen").return_value = cm + + +def mock_fedora_repomd(datadir, mocker, use_altered_fedora36=False): + repodata = [ + ["repomd26.xml", "primary26.xml.gz"], + ["repomd36.xml", "primary36.xml.gz"], + ] + if use_altered_fedora36: + repodata[1][1] = "primary36-altered.xml.gz" + + side_effect = [] + + for paths in repodata: + side_effect += [ + Path(datadir, "archives.fedoraproject.org", path).read_bytes() + for path in paths + ] + side_effect += [HTTPError() for _ in range(len(FEDORA_INDEX_URL_TEMPLATES) - 1)] + + mock_repomd(mocker, side_effect) + + +def rpm_repodata_url(release, component): + return Template(FEDORA_INDEX_URL_TEMPLATES[0]).substitute( + base_url=FEDORA_ARCHIVE_URL, release=release, component=component + ) + + +def rpm_src_package_url(release, component, path): + return f"{rpm_repodata_url(release, component)}Packages/{path}" + + +def rpm_package_origin_url(package_name, instance="Fedora"): + return 
f"rpm://{instance}/packages/{package_name}" + + +@pytest.fixture +def pkg_versions(): + return { + f"{rpm_package_origin_url('0install')}": { + "26/Everything/2.11-4": { + "name": "0install", + "version": "2.11-4", + "build_time": "2017-02-10T04:59:31+00:00", + "url": rpm_src_package_url( + release="26", + component="Everything", + path="0/0install-2.11-4.fc26.src.rpm", + ), + "checksums": { + # note: we intentionally altered the original + # primary26.xml file to test sha1 usage + "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b", + }, + } + }, + f"{rpm_package_origin_url('0xFFFF')}": { + "26/Everything/0.3.9-15": { + "name": "0xFFFF", + "version": "0.3.9-15", + "build_time": "2017-02-10T05:01:53+00:00", + "url": rpm_src_package_url( + release="26", + component="Everything", + path="0/0xFFFF-0.3.9-15.fc26.src.rpm", + ), + "checksums": { + "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f" + }, + }, + "36/Everything/0.9-4": { + "name": "0xFFFF", + "version": "0.9-4", + "build_time": "2022-01-19T19:13:53+00:00", + "url": rpm_src_package_url( + release="36", + component="Everything", + path="0/0xFFFF-0.9-4.fc36.src.rpm", + ), + "checksums": { + "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" + }, + }, + }, + f"{rpm_package_origin_url('2ping')}": { + "36/Everything/4.5.1-2": { + "name": "2ping", + "version": "4.5.1-2", + "build_time": "2022-01-19T19:12:21+00:00", + "url": rpm_src_package_url( + release="36", + component="Everything", + path="2/2ping-4.5.1-2.fc36.src.rpm", + ), + "checksums": { + "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28" + }, + } + }, + } + + +def run_lister( + swh_scheduler: SchedulerInterface, + releases: List[Release], + components: List[Component], + pkg_versions: dict, + origin_count: int, + incremental: bool = False, + updated: bool = True, +): + """Runs the lister and tests that the listed origins are correct.""" + lister = RPMLister( + 
scheduler=swh_scheduler, + url=FEDORA_URL, + instance="Fedora", + rpm_src_data=[ + { + "base_url": FEDORA_ARCHIVE_URL, + "releases": releases, + "components": components, + "index_url_templates": FEDORA_INDEX_URL_TEMPLATES, + } + ], + incremental=incremental, + ) + + stats = lister.run() + scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results + lister_state = lister.get_state_from_scheduler() + state_pkg_versions = {k.split("/")[-1]: set(v) for k, v in pkg_versions.items()} + + # One component from each release plus extra null page to flush origins + assert stats.pages == (len(releases) + 1 if updated else 1) + assert stats.origins == origin_count + + assert { + o.url: o.extra_loader_arguments["packages"] for o in scheduler_origins + } == pkg_versions + + if incremental: + assert lister_state.package_versions == state_pkg_versions + assert lister.updated == updated + + +@pytest.mark.parametrize("status_code", [400, 404, 500]) +def test_fedora_lister_http_error(swh_scheduler, mocker, status_code): + """ + Simulates handling of HTTP Errors while fetching packages for fedora releases. + """ + + release = "18" + component = "Everything" + + mock_repomd( + mocker, + side_effect=[HTTPError() for _ in range(len(FEDORA_INDEX_URL_TEMPLATES))], + ) + + run_lister( + swh_scheduler, + releases=[release], + components=[component], + pkg_versions={}, + origin_count=0, + updated=False, + ) + + +def test_full_rpm_lister( + swh_scheduler, + mocker, + datadir, + pkg_versions, +): + """ + Simulates a full listing of packages for fedora releases. + """ + + mock_fedora_repomd(datadir, mocker) + run_lister( + swh_scheduler, + releases=["26", "36"], + components=["Everything"], + pkg_versions=pkg_versions, + origin_count=3, + ) + + +def test_incremental_rpm_lister( + swh_scheduler, + mocker, + datadir, + pkg_versions, +): + """ + Simulates an incremental listing of packages for fedora releases. 
+ """ + + # First run + mock_fedora_repomd(datadir, mocker) + run_lister( + swh_scheduler, + releases=["26", "36"], + components=["Everything"], + pkg_versions=pkg_versions, + origin_count=3, + incremental=True, + ) + # Second run (no updates) + mock_fedora_repomd(datadir, mocker) + run_lister( + swh_scheduler, + releases=["26", "36"], + components=["Everything"], + pkg_versions=pkg_versions, + origin_count=0, + incremental=True, + ) + + # Use an altered version of primary36.xml in which we updated the version + # of package 0xFFFF to 0.10: + mock_fedora_repomd(datadir, mocker, use_altered_fedora36=True) + # Add new version to the set of expected pkg versions: + pkg_versions[rpm_package_origin_url("0xFFFF")].update( + { + "36/Everything/0.10-4": { + "name": "0xFFFF", + "version": "0.10-4", + "build_time": "2022-01-19T19:13:53+00:00", + "url": rpm_src_package_url( + release="36", + component="Everything", + path="0/0xFFFF-0.10-4.fc36.src.rpm", + ), + "checksums": { + "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd" + }, + } + } + ) + + # Third run (0xFFFF in fedora36 component got updated and it needs to be listed) + run_lister( + swh_scheduler, + releases=["26", "36"], + components=["Everything"], + pkg_versions=pkg_versions, + origin_count=1, + incremental=True, + ) diff --git a/swh/lister/rpm/tests/test_tasks.py b/swh/lister/rpm/tests/test_tasks.py new file mode 100644 index 00000000..5f08f15b --- /dev/null +++ b/swh/lister/rpm/tests/test_tasks.py @@ -0,0 +1,67 @@ +# Copyright (C) 2022-2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +from swh.lister.pattern import ListerStats + +from .test_lister import FEDORA_ARCHIVE_URL, FEDORA_INDEX_URL_TEMPLATES, FEDORA_URL + + +def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = 
swh_scheduler_celery_app.send_task("swh.lister.rpm.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +LISTER_KWARGS = dict( + url=FEDORA_URL, + instance="fedora", + rpm_src_data=[ + { + "base_url": FEDORA_ARCHIVE_URL, + "releases": ["36"], + "components": ["Everything"], + "index_url_templates": FEDORA_INDEX_URL_TEMPLATES, + } + ], +) + + +def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + lister = mocker.patch("swh.lister.rpm.tasks.RPMLister") + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + res = swh_scheduler_celery_app.send_task( + "swh.lister.rpm.tasks.FullRPMLister", + kwargs=LISTER_KWARGS, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**LISTER_KWARGS) + lister.run.assert_called_once_with() + + +def test_incremental_listing( + swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker +): + lister = mocker.patch("swh.lister.rpm.tasks.RPMLister") + lister.from_configfile.return_value = lister + lister.run.return_value = ListerStats(pages=10, origins=500) + + res = swh_scheduler_celery_app.send_task( + "swh.lister.rpm.tasks.IncrementalRPMLister", + kwargs=LISTER_KWARGS, + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**LISTER_KWARGS, incremental=True) + lister.run.assert_called_once_with() diff --git a/swh/lister/tests/test_cli.py b/swh/lister/tests/test_cli.py index 00b11f28..a5645a61 100644 --- a/swh/lister/tests/test_cli.py +++ b/swh/lister/tests/test_cli.py @@ -38,9 +38,7 @@ lister_args = { "url": "https://guix.gnu.org/sources.json", "origin_upstream": "https://git.savannah.gnu.org/cgit/guix.git/", }, - "fedora": { - "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", - }, + "rpm": {"url": "http://opensuse.org", "instance": "openSUSE", "rpm_src_data": []}, 
"pagure": {"instance": "pagure.io"}, "gitweb": { "url": "https://git.distorted.org.uk/~mdw/", @@ -64,8 +62,7 @@ def test_get_lister_wrong_input(): def test_get_lister(swh_scheduler_config): """Instantiating a supported lister should be ok""" - # Drop launchpad lister from the lister to check, its test setup is more involved - # than the other listers and it's not currently done here + for lister_name in SUPPORTED_LISTERS: lst = get_lister( lister_name, -- GitLab