diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py index c281f222eb687e31192f7c35adcdafc27266e5be..cdab7285a7d185fab8925c16e29aea1ed2012c54 100644 --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -80,6 +80,7 @@ class ArchLister(StatelessLister[ArchListerPage]): VISIT_TYPE = "arch" INSTANCE = "arch" + BASE_URL = "https://archlinux.org" ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}" ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}" ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = ( @@ -93,6 +94,8 @@ class ArchLister(StatelessLister[ArchListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -119,8 +122,8 @@ class ArchLister(StatelessLister[ArchListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - url=flavours["official"]["base_info_url"], - instance=self.INSTANCE, + url=url, + instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py index dc43d7d48af5cd1462ee241eee51dcf9e3b9a761..82a5c4060cf5b0354486dde4ad69276ac00d422f 100644 --- a/swh/lister/aur/lister.py +++ b/swh/lister/aur/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -46,6 +46,8 @@ class AurLister(StatelessLister[AurListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -54,8 +56,8 @@ class AurLister(StatelessLister[AurListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index d65d0c26c3b1e531968f729aac349d683c62629a..00d8abfffc728e8e3fe8f398ab3c74ff4516a2ea 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -51,6 +51,8 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_URL, + instance: str = INSTANCE, page_size: int = 1000, incremental: bool = True, credentials: CredentialsType = None, @@ -61,8 +63,8 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.API_URL, - instance=self.INSTANCE, + url=url, + instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py index cc440dc5d7cb95eb8c3664e44e2009278c9952a0..71473db1da8a9baf495917af28e8fdb13ea05c68 100644 --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -29,6 +29,8 @@ class BowerLister(StatelessLister[BowerListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -37,8 +39,8 @@ class BowerLister(StatelessLister[BowerListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.API_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py index 4f5cb40235f1723c2811b70dd76396de8d40fd5c..d18ac870c834fc1046c805712644a13cad66fdd7 100644 --- a/swh/lister/conda/lister.py +++ b/swh/lister/conda/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -39,6 +39,7 @@ class CondaLister(StatelessLister[CondaListerPage]): scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, url: str = BASE_REPO_URL, + instance: str = INSTANCE, channel: str = "", archs: List = [], max_origins_per_page: Optional[int] = None, @@ -48,7 +49,7 @@ class CondaLister(StatelessLister[CondaListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, + instance=instance, url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py index 80669ebdb914c21c68c2d6bbb546d72c7f3eba06..0aee8a8530a2618d8317b8a80e7cbc8b093cfba5 100644 --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -80,6 +80,8 @@ class CpanLister(StatelessLister[CpanListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -88,8 +90,8 @@ class CpanLister(StatelessLister[CpanListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.API_BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index e0dbd32fb36b4eef47e041e3d68e8c6391bf3c7f..26db72e6934c0f738f03c52ec6b83c1a8d4a5e57 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -34,10 +34,13 @@ class CRANLister(StatelessLister[PageType]): """ LISTER_NAME = "cran" + INSTANCE = "cran" def __init__( self, scheduler: SchedulerInterface, + url: str = CRAN_MIRROR_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -45,8 +48,8 @@ class CRANLister(StatelessLister[PageType]): ): super().__init__( scheduler, - url=CRAN_MIRROR_URL, - instance="cran", + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index b0b088329c5f7f43af7bd90f1a122bdc8fa9f1bf..41890eae0c7ac2b3b682ba269cc3f497373a7957 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -65,6 +65,8 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -73,8 +75,8 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.BASE_URL, - instance=self.INSTANCE, + url=url, + instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index 4a6271e35c0ca5f33ca4397055e8651a97fcb9d0..0b300f3a7ff868d8f8b10ff57843057a214a3942 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -68,12 +68,14 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): """ LISTER_NAME = "debian" + MIRROR_URL = "http://deb.debian.org/debian/" + INSTANCE = "Debian" def __init__( self, scheduler: SchedulerInterface, - distribution: str = "Debian", - mirror_url: str = "http://deb.debian.org/debian/", + url: str = MIRROR_URL, + instance: str = INSTANCE, suites: List[Suite] = ["stretch", "buster", "bullseye"], components: List[Component] = ["main", "contrib", "non-free"], credentials: Optional[CredentialsType] = None, @@ -83,8 +85,8 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): ): super().__init__( scheduler=scheduler, - url=mirror_url, - instance=distribution, + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, @@ -95,7 +97,7 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): if not self.url.endswith("/"): self.url += "/" - self.distribution = distribution + self.distribution = instance self.suites = suites self.components = components diff --git a/swh/lister/debian/tasks.py b/swh/lister/debian/tasks.py index fe62a784911f61d1da15f1ca0e534cb62b6d462f..89b21fb4b35e1e5f2000c2ee9a9116a1520ce5b1 100644 --- a/swh/lister/debian/tasks.py +++ b/swh/lister/debian/tasks.py @@ -10,6 +10,11 @@ from .lister import DebianLister @shared_task(name=__name__ + ".DebianListerTask") def list_debian_distribution(**lister_args): """List a Debian distribution""" + # for backward compatibility with previous parameter names + if "mirror_url" in lister_args: + lister_args["url"] = lister_args.pop("mirror_url") + if "distribution" in lister_args: + lister_args["instance"] = lister_args.pop("distribution") return DebianLister.from_configfile(**lister_args).run().dict() diff --git a/swh/lister/debian/tests/test_lister.py b/swh/lister/debian/tests/test_lister.py index 6f2711d4375f98963f7173f8c02f0b7f43dc36b5..fcaed468108af7c4c0b772eb59bd0b98f614efc1 100644 --- a/swh/lister/debian/tests/test_lister.py +++ b/swh/lister/debian/tests/test_lister.py @@ -65,7 +65,7 @@ def _init_test( lister = DebianLister( scheduler=swh_scheduler, - mirror_url=_mirror_url, + url=_mirror_url, suites=list(debian_sources.keys()), components=_components, ) diff --git a/swh/lister/debian/tests/test_tasks.py b/swh/lister/debian/tests/test_tasks.py index 0a1d30d40473df1b08343fa91f17778129537fe4..78688c905c671682a4c13ce11f5f4c1c43a93751 100644 --- a/swh/lister/debian/tests/test_tasks.py +++ b/swh/lister/debian/tests/test_tasks.py @@ -23,6 +23,35 @@ def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): stats = ListerStats(pages=12, origins=35618) lister.run.return_value = stats + kwargs = dict( + url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/", + instance="Ubuntu", + suites=["xenial", "bionic", "focal"], + components=["main", "multiverse", "restricted", "universe"], + ) + + res = swh_scheduler_celery_app.send_task( + "swh.lister.debian.tasks.DebianListerTask", kwargs=kwargs + ) + assert res + res.wait() + assert res.successful() + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() + + assert res.result == stats.dict() + + +@patch("swh.lister.debian.tasks.DebianLister") +def test_lister_old_parameter_names( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + # setup the mocked DebianLister + lister.from_configfile.return_value = lister + stats = ListerStats(pages=12, origins=35618) + lister.run.return_value = stats + kwargs = dict( mirror_url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/", distribution="Ubuntu", @@ -37,6 +66,9 @@ def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): res.wait() assert res.successful() + kwargs["url"] = kwargs.pop("mirror_url") + kwargs["instance"] = kwargs.pop("distribution") + lister.from_configfile.assert_called_once_with(**kwargs) lister.run.assert_called_once_with() diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 738c516f92e34678f11f3a34d576fe462df665c7..7e63d16addfbed6200cf032274627988ff38740e 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 The Software Heritage developers +# Copyright (C) 2020-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -62,6 +62,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): """ # noqa: B950 LISTER_NAME = "github" + INSTANCE = "github" API_URL = "https://api.github.com/repositories" PAGE_SIZE = 1000 @@ -69,6 +70,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_URL, + instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -79,8 +82,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.API_URL, - instance="github", + url=url, + instance=instance, with_github_session=True, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 721bdc23698fa8fca335abf10da1bd8e348e95ff..2af6642836be9845c5864c2e0fb064b1f85983fc 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -25,11 +25,14 @@ class GNULister(StatelessLister[GNUPageType]): """ LISTER_NAME = "GNU" + INSTANCE = "GNU" GNU_FTP_URL = "https://ftp.gnu.org" def __init__( self, scheduler: SchedulerInterface, + url: str = GNU_FTP_URL, + instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -37,8 +40,8 @@ class GNULister(StatelessLister[GNUPageType]): ): super().__init__( scheduler=scheduler, - url=self.GNU_FTP_URL, - instance="GNU", + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py index 36a247b140c74b44a80f3bd062485b193476ddc4..368c1d0849637ba24eb7d5fcbd535faa533ca166 100644 --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -45,6 +45,8 @@ class GolangLister(Lister[GolangStateType, GolangPageType]): def __init__( self, scheduler: SchedulerInterface, + url: str = GOLANG_MODULES_INDEX_URL, + instance: str = LISTER_NAME, incremental: bool = False, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, @@ -53,8 +55,8 @@ class GolangLister(Lister[GolangStateType, GolangPageType]): ): super().__init__( scheduler=scheduler, - url=self.GOLANG_MODULES_INDEX_URL, - instance=self.LISTER_NAME, + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py index a86ff67c8a969497e9eea6177a66b1c20ee99bb2..1872bc68946f6131e3493dc57eac4d524271b32b 100644 --- a/swh/lister/hackage/lister.py +++ b/swh/lister/hackage/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -43,17 +43,18 @@ class HackageLister(Lister[HackageListerState, HackageListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, enable_origins: bool = True, - url: Optional[str] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=url if url else self.BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/hex/lister.py b/swh/lister/hex/lister.py index b264b602a3aa5f3bc92f1cfd24f82c50d172c885..1ff3a8b1d28aef98bd63a7f62dbd486b5d553027 100644 --- a/swh/lister/hex/lister.py +++ b/swh/lister/hex/lister.py @@ -1,11 +1,11 @@ -# Copyright (C) 2021-2022 The Software Heritage developers +# Copyright (C) 2021-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging -from typing import Any, Dict, Iterator, List +from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin import iso8601 @@ -46,15 +46,22 @@ class HexLister(Lister[HexListerState, HexListerPage]): def __init__( self, scheduler: SchedulerInterface, - instance: str = "hex", + url: str = HEX_API_URL, + instance: str = LISTER_NAME, page_size: int = 100, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.HEX_API_URL, + url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # TODO: Add authentication support self.page_size = page_size diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index 987154c04c525604311f7f4e12e13c5c301bc13d..1545693ee254566d417d270c699d1bc0f7447177 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -59,11 +59,14 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): will be returned """ + LAUNCHPAD_URL = "https://launchpad.net/" LISTER_NAME = "launchpad" def __init__( self, scheduler: SchedulerInterface, + url: str = LAUNCHPAD_URL, + instance: str = LISTER_NAME, incremental: bool = False, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, @@ -72,8 +75,8 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): ): super().__init__( scheduler=scheduler, - url="https://launchpad.net/", - instance="launchpad", + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py index f10c02d1e21b8590649081e00399c1451d5e5815..b1276c6deb302a5da4e4102b64581711b4ebd7e4 100644 --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2022 the Software Heritage developers +# Copyright (C) 2018-2023 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -50,6 +50,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_FULL_LISTING_URL, + instance: str = INSTANCE, page_size: int = 1000, incremental: bool = False, credentials: CredentialsType = None, @@ -60,10 +62,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.API_INCREMENTAL_LISTING_URL - if incremental - else self.API_FULL_LISTING_URL, - instance=self.INSTANCE, + url=url, + instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, @@ -75,6 +75,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): # provided as the startkey query parameter value, so we increment the page # size by one to avoid double package processing self.page_size += 1 + else: + self.url = self.API_INCREMENTAL_LISTING_URL self.incremental = incremental self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py index 98f9fc9b5e7353b9a4aa17e652db126435b21ab0..1d04f7d08a0fa5c4a001ca9484675521dd9ada1e 100644 --- a/swh/lister/nuget/lister.py +++ b/swh/lister/nuget/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -43,6 +43,8 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_INDEX_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -51,8 +53,8 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.API_INDEX_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index 99dd986353cb440d9f5987bb372147351461c92d..ba7ac12fa4151680e4b7136bb646148389ddf443 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -55,6 +55,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): """ LISTER_NAME = "Packagist" + INSTANCE = "packagist" PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json" PACKAGIST_PACKAGE_URL_FORMATS = [ # preferred, static, efficient on their side as it can be cached @@ -72,6 +73,8 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): def __init__( self, scheduler: SchedulerInterface, + url: str = PACKAGIST_PACKAGES_LIST_URL, + instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -80,8 +83,8 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): ): super().__init__( scheduler=scheduler, - url=self.PACKAGIST_PACKAGES_LIST_URL, - instance="packagist", + url=url, + instance=instance, credentials=credentials, with_github_session=True, max_origins_per_page=max_origins_per_page, diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index 50e4f15dfc0ad8a8f1dcb44fa7d41241d813874a..601bdefd1fc0551b1069e5af17ad2d2f117a4a9d 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -35,6 +35,8 @@ class PubDevLister(StatelessLister[PubDevListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -43,8 +45,8 @@ class PubDevLister(StatelessLister[PubDevListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/puppet/lister.py b/swh/lister/puppet/lister.py index 6e84b27cb2d3e19ba76e869e705aed4a3435313f..26c7a4c33914b1f6f350ec84903d52ab9e0771d2 100644 --- a/swh/lister/puppet/lister.py +++ b/swh/lister/puppet/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -42,6 +42,8 @@ class PuppetLister(Lister[PuppetListerState, PuppetListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -50,8 +52,8 @@ class PuppetLister(Lister[PuppetListerState, PuppetListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index f5141c1dd333814266b2e434dd68c718938d7baf..5ba08eb7beb7350565afd92aa8515874aeb7184c 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -69,6 +69,8 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = PACKAGE_LIST_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -76,8 +78,8 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]): ): super().__init__( scheduler=scheduler, - url=self.PACKAGE_LIST_URL, - instance=self.INSTANCE, + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py index bb317eab7276fbf8d39f82c3dadbbd78994e9e0f..4e59b901cf8457600445407eaaa2e9cbc926dbca 100644 --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -62,6 +62,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = RUBY_GEMS_POSTGRES_DUMP_BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -70,8 +72,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index 234e198ff49b606aacc9ded7759f380df58f95cd..518a7ece9b5d9f2696d29d4aaf11093db76be718 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021-2022 The Software Heritage developers +# Copyright (C) 2021-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -105,12 +105,16 @@ ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT] class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): """List origins from the "SourceForge" forge.""" + SOURCEFORGE_URL = "https://sourceforge.net" # Part of the lister API, that identifies this lister LISTER_NAME = "sourceforge" + INSTANCE = "main" def __init__( self, scheduler: SchedulerInterface, + url: str = SOURCEFORGE_URL, + instance: str = INSTANCE, incremental: bool = False, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, @@ -119,8 +123,8 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): ): super().__init__( scheduler=scheduler, - url="https://sourceforge.net", - instance="main", + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/tests/test_lister_packages.py b/swh/lister/tests/test_lister_packages.py new file mode 100644 index 0000000000000000000000000000000000000000..f0c6bef49d483ea8a20ec3c2b417fa58fd7e5cd0 --- /dev/null +++ b/swh/lister/tests/test_lister_packages.py @@ -0,0 +1,64 @@ +# Copyright (C) 2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import importlib +import inspect +import pkgutil + +import pytest + + +def lister_packages(): + import swh.lister + + return [ + mod.name + for mod in pkgutil.iter_modules(swh.lister.__path__) + if mod.ispkg and mod.name != "tests" + ] + + +@pytest.mark.parametrize("lister_package", lister_packages()) +def test_lister_has_mandatory_parameters(lister_package): + from swh.lister.pattern import Lister, StatelessLister + + lister_mandatory_params = { + "scheduler", + "url", + "instance", + "credentials", + "max_origins_per_page", + "max_pages", + "enable_origins", + } + + lister_module = importlib.import_module(f"swh.lister.{lister_package}.lister") + lister_module_members = inspect.getmembers(lister_module) + for name, obj in lister_module_members: + if ( + inspect.isclass(obj) + and obj not in (Lister, StatelessLister) + and issubclass(obj, Lister) + ): + lister_params = set(inspect.getfullargspec(getattr(obj, "__init__")).args) + + missing_params = lister_mandatory_params - lister_params + + assert not missing_params, ( + f"swh.lister.{lister_package}.{name} class is missing the following " + f"parameters in its constructor: {', '.join(missing_params)}.\n" + "Please add them and transmit them to the base lister class constructor " + f"to avoid bad surprises when deploying\nthe {lister_package} lister in " + "staging or production environment." + ) + + +@pytest.mark.parametrize("lister_package", lister_packages()) +def test_lister_package_has_register_function(lister_package): + lister_module = importlib.import_module(f"swh.lister.{lister_package}") + assert hasattr(lister_module, "register"), ( + f"swh.lister.{lister_package} module is missing the register function required " + "to register its celery tasks in scheduler database." + )