From 6e7bc49ec74e89519c3f76ce7b87a71fc74d1fef Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Wed, 6 Sep 2023 11:55:28 +0200 Subject: [PATCH] Harmonize listers parameters and add test to check mandatory ones Ensure that all lister classes have the same set of mandatory parameters in their constructors, notably: scheduler, url, instance and credentials. Add a new test checking lister classes have mandatory parameters declared in their constructors. The purpose is to avoid deployment issues on staging or production environment as celery tasks can fail to be executed if mandatory parameters are not handled by listers. Related to swh/infra/sysadm-environment#5030. --- swh/lister/arch/lister.py | 9 ++-- swh/lister/aur/lister.py | 8 +-- swh/lister/bitbucket/lister.py | 6 ++- swh/lister/bower/lister.py | 8 +-- swh/lister/conda/lister.py | 5 +- swh/lister/cpan/lister.py | 8 +-- swh/lister/cran/lister.py | 7 ++- swh/lister/crates/lister.py | 8 +-- swh/lister/debian/lister.py | 14 +++--- swh/lister/debian/tasks.py | 5 ++ swh/lister/debian/tests/test_lister.py | 2 +- swh/lister/debian/tests/test_tasks.py | 36 ++++++++++++- swh/lister/github/lister.py | 9 ++-- swh/lister/gnu/lister.py | 9 ++-- swh/lister/golang/lister.py | 8 +-- swh/lister/hackage/lister.py | 9 ++-- swh/lister/hex/lister.py | 15 ++++-- swh/lister/launchpad/lister.py | 7 ++- swh/lister/npm/lister.py | 12 +++-- swh/lister/nuget/lister.py | 8 +-- swh/lister/packagist/lister.py | 7 ++- swh/lister/pubdev/lister.py | 8 +-- swh/lister/puppet/lister.py | 8 +-- swh/lister/pypi/lister.py | 6 ++- swh/lister/rubygems/lister.py | 8 +-- swh/lister/sourceforge/lister.py | 10 ++-- swh/lister/tests/test_lister_packages.py | 64 ++++++++++++++++++++++++ 27 files changed, 231 insertions(+), 73 deletions(-) create mode 100644 swh/lister/tests/test_lister_packages.py diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py index c281f22..cdab728 100644 --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py 
@@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -80,6 +80,7 @@ class ArchLister(StatelessLister[ArchListerPage]): VISIT_TYPE = "arch" INSTANCE = "arch" + BASE_URL = "https://archlinux.org" ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}" ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}" ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = ( @@ -93,6 +94,8 @@ class ArchLister(StatelessLister[ArchListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -119,8 +122,8 @@ class ArchLister(StatelessLister[ArchListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - url=flavours["official"]["base_info_url"], - instance=self.INSTANCE, + url=url, + instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py index dc43d7d..82a5c40 100644 --- a/swh/lister/aur/lister.py +++ b/swh/lister/aur/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -46,6 +46,8 @@ class AurLister(StatelessLister[AurListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, 
max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -54,8 +56,8 @@ class AurLister(StatelessLister[AurListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index d65d0c2..00d8abf 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -51,6 +51,8 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_URL, + instance: str = INSTANCE, page_size: int = 1000, incremental: bool = True, credentials: CredentialsType = None, @@ -61,8 +63,8 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.API_URL, - instance=self.INSTANCE, + url=url, + instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py index cc440dc..71473db 100644 --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -29,6 +29,8 @@ class BowerLister(StatelessLister[BowerListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -37,8 +39,8 @@ class 
BowerLister(StatelessLister[BowerListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.API_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py index 4f5cb40..d18ac87 100644 --- a/swh/lister/conda/lister.py +++ b/swh/lister/conda/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -39,6 +39,7 @@ class CondaLister(StatelessLister[CondaListerPage]): scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, url: str = BASE_REPO_URL, + instance: str = INSTANCE, channel: str = "", archs: List = [], max_origins_per_page: Optional[int] = None, @@ -48,7 +49,7 @@ class CondaLister(StatelessLister[CondaListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, + instance=instance, url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py index 80669eb..0aee8a8 100644 --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -80,6 +80,8 @@ class CpanLister(StatelessLister[CpanListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_BASE_URL, + instance: str = INSTANCE, credentials: 
Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -88,8 +90,8 @@ class CpanLister(StatelessLister[CpanListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.API_BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index e0dbd32..26db72e 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -34,10 +34,13 @@ class CRANLister(StatelessLister[PageType]): """ LISTER_NAME = "cran" + INSTANCE = "cran" def __init__( self, scheduler: SchedulerInterface, + url: str = CRAN_MIRROR_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -45,8 +48,8 @@ class CRANLister(StatelessLister[PageType]): ): super().__init__( scheduler, - url=CRAN_MIRROR_URL, - instance="cran", + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index b0b0883..41890ea 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -65,6 +65,8 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -73,8 +75,8 @@ class 
CratesLister(Lister[CratesListerState, CratesListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.BASE_URL, - instance=self.INSTANCE, + url=url, + instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index 4a6271e..0b300f3 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2017-2022 The Software Heritage developers +# Copyright (C) 2017-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -68,12 +68,14 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): """ LISTER_NAME = "debian" + MIRROR_URL = "http://deb.debian.org/debian/" + INSTANCE = "Debian" def __init__( self, scheduler: SchedulerInterface, - distribution: str = "Debian", - mirror_url: str = "http://deb.debian.org/debian/", + url: str = MIRROR_URL, + instance: str = INSTANCE, suites: List[Suite] = ["stretch", "buster", "bullseye"], components: List[Component] = ["main", "contrib", "non-free"], credentials: Optional[CredentialsType] = None, @@ -83,8 +85,8 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): ): super().__init__( scheduler=scheduler, - url=mirror_url, - instance=distribution, + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, @@ -95,7 +97,7 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): if not self.url.endswith("/"): self.url += "/" - self.distribution = distribution + self.distribution = instance self.suites = suites self.components = components diff --git a/swh/lister/debian/tasks.py b/swh/lister/debian/tasks.py index fe62a78..89b21fb 100644 --- a/swh/lister/debian/tasks.py +++ 
b/swh/lister/debian/tasks.py @@ -10,6 +10,11 @@ from .lister import DebianLister @shared_task(name=__name__ + ".DebianListerTask") def list_debian_distribution(**lister_args): """List a Debian distribution""" + # for backward compatibility with previous parameter names + if "mirror_url" in lister_args: + lister_args["url"] = lister_args.pop("mirror_url") + if "distribution" in lister_args: + lister_args["instance"] = lister_args.pop("distribution") return DebianLister.from_configfile(**lister_args).run().dict() diff --git a/swh/lister/debian/tests/test_lister.py b/swh/lister/debian/tests/test_lister.py index 6f2711d..fcaed46 100644 --- a/swh/lister/debian/tests/test_lister.py +++ b/swh/lister/debian/tests/test_lister.py @@ -65,7 +65,7 @@ def _init_test( lister = DebianLister( scheduler=swh_scheduler, - mirror_url=_mirror_url, + url=_mirror_url, suites=list(debian_sources.keys()), components=_components, ) diff --git a/swh/lister/debian/tests/test_tasks.py b/swh/lister/debian/tests/test_tasks.py index 0a1d30d..78688c9 100644 --- a/swh/lister/debian/tests/test_tasks.py +++ b/swh/lister/debian/tests/test_tasks.py @@ -24,8 +24,8 @@ def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): lister.run.return_value = stats kwargs = dict( - mirror_url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/", - distribution="Ubuntu", + url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/", + instance="Ubuntu", suites=["xenial", "bionic", "focal"], components=["main", "multiverse", "restricted", "universe"], ) @@ -41,3 +41,35 @@ def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker): lister.run.assert_called_once_with() assert res.result == stats.dict() + + +@patch("swh.lister.debian.tasks.DebianLister") +def test_lister_old_parameter_names( + lister, swh_scheduler_celery_app, swh_scheduler_celery_worker +): + # setup the mocked DebianLister + lister.from_configfile.return_value = lister + stats = 
ListerStats(pages=12, origins=35618) + lister.run.return_value = stats + + kwargs = dict( + mirror_url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/", + distribution="Ubuntu", + suites=["xenial", "bionic", "focal"], + components=["main", "multiverse", "restricted", "universe"], + ) + + res = swh_scheduler_celery_app.send_task( + "swh.lister.debian.tasks.DebianListerTask", kwargs=kwargs + ) + assert res + res.wait() + assert res.successful() + + kwargs["url"] = kwargs.pop("mirror_url") + kwargs["instance"] = kwargs.pop("distribution") + + lister.from_configfile.assert_called_once_with(**kwargs) + lister.run.assert_called_once_with() + + assert res.result == stats.dict() diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 738c516..7e63d16 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020-2022 The Software Heritage developers +# Copyright (C) 2020-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -62,6 +62,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): """ # noqa: B950 LISTER_NAME = "github" + INSTANCE = "github" API_URL = "https://api.github.com/repositories" PAGE_SIZE = 1000 @@ -69,6 +70,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_URL, + instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -79,8 +82,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.API_URL, - instance="github", + url=url, + instance=instance, with_github_session=True, 
max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 721bdc2..2af6642 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2021 The Software Heritage developers +# Copyright (C) 2019-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -25,11 +25,14 @@ class GNULister(StatelessLister[GNUPageType]): """ LISTER_NAME = "GNU" + INSTANCE = "GNU" GNU_FTP_URL = "https://ftp.gnu.org" def __init__( self, scheduler: SchedulerInterface, + url: str = GNU_FTP_URL, + instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -37,8 +40,8 @@ class GNULister(StatelessLister[GNUPageType]): ): super().__init__( scheduler=scheduler, - url=self.GNU_FTP_URL, - instance="GNU", + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py index 36a247b..368c1d0 100644 --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -45,6 +45,8 @@ class GolangLister(Lister[GolangStateType, GolangPageType]): def __init__( self, scheduler: SchedulerInterface, + url: str = GOLANG_MODULES_INDEX_URL, + instance: str = LISTER_NAME, incremental: bool = False, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, @@ -53,8 +55,8 @@ 
class GolangLister(Lister[GolangStateType, GolangPageType]): ): super().__init__( scheduler=scheduler, - url=self.GOLANG_MODULES_INDEX_URL, - instance=self.LISTER_NAME, + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py index a86ff67..1872bc6 100644 --- a/swh/lister/hackage/lister.py +++ b/swh/lister/hackage/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -43,17 +43,18 @@ class HackageLister(Lister[HackageListerState, HackageListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, enable_origins: bool = True, - url: Optional[str] = None, ): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=url if url else self.BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/hex/lister.py b/swh/lister/hex/lister.py index b264b60..1ff3a8b 100644 --- a/swh/lister/hex/lister.py +++ b/swh/lister/hex/lister.py @@ -1,11 +1,11 @@ -# Copyright (C) 2021-2022 The Software Heritage developers +# Copyright (C) 2021-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from dataclasses import asdict, dataclass import logging -from typing import Any, Dict, Iterator, 
List +from typing import Any, Dict, Iterator, List, Optional from urllib.parse import urljoin import iso8601 @@ -46,15 +46,22 @@ class HexLister(Lister[HexListerState, HexListerPage]): def __init__( self, scheduler: SchedulerInterface, - instance: str = "hex", + url: str = HEX_API_URL, + instance: str = LISTER_NAME, page_size: int = 100, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.HEX_API_URL, + url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # TODO: Add authentication support self.page_size = page_size diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index 987154c..1545693 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -59,11 +59,14 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): will be returned """ + LAUNCHPAD_URL = "https://launchpad.net/" LISTER_NAME = "launchpad" def __init__( self, scheduler: SchedulerInterface, + url: str = LAUNCHPAD_URL, + instance: str = LISTER_NAME, incremental: bool = False, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, @@ -72,8 +75,8 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): ): super().__init__( scheduler=scheduler, - url="https://launchpad.net/", - instance="launchpad", + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py index f10c02d..b1276c6 100644 --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2018-2022 the Software Heritage developers +# Copyright (C) 2018-2023 the Software Heritage developers # License: GNU General Public License 
version 3, or any later version # See top-level LICENSE file for more information @@ -50,6 +50,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_FULL_LISTING_URL, + instance: str = INSTANCE, page_size: int = 1000, incremental: bool = False, credentials: CredentialsType = None, @@ -60,10 +62,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): super().__init__( scheduler=scheduler, credentials=credentials, - url=self.API_INCREMENTAL_LISTING_URL - if incremental - else self.API_FULL_LISTING_URL, - instance=self.INSTANCE, + url=url, + instance=instance, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, @@ -75,6 +75,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): # provided as the startkey query parameter value, so we increment the page # size by one to avoid double package processing self.page_size += 1 + else: + self.url = self.API_INCREMENTAL_LISTING_URL self.incremental = incremental self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py index 98f9fc9..1d04f7d 100644 --- a/swh/lister/nuget/lister.py +++ b/swh/lister/nuget/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -43,6 +43,8 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = API_INDEX_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -51,8 +53,8 @@ class NugetLister(Lister[NugetListerState, 
NugetListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.API_INDEX_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index 99dd986..ba7ac12 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -55,6 +55,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): """ LISTER_NAME = "Packagist" + INSTANCE = "packagist" PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json" PACKAGIST_PACKAGE_URL_FORMATS = [ # preferred, static, efficient on their side as it can be cached @@ -72,6 +73,8 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): def __init__( self, scheduler: SchedulerInterface, + url: str = PACKAGIST_PACKAGES_LIST_URL, + instance: str = INSTANCE, credentials: CredentialsType = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -80,8 +83,8 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): ): super().__init__( scheduler=scheduler, - url=self.PACKAGIST_PACKAGES_LIST_URL, - instance="packagist", + url=url, + instance=instance, credentials=credentials, with_github_session=True, max_origins_per_page=max_origins_per_page, diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index 50e4f15..601bdef 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -35,6 +35,8 @@ class PubDevLister(StatelessLister[PubDevListerPage]): def __init__( self, 
scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -43,8 +45,8 @@ class PubDevLister(StatelessLister[PubDevListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/puppet/lister.py b/swh/lister/puppet/lister.py index 6e84b27..26c7a4c 100644 --- a/swh/lister/puppet/lister.py +++ b/swh/lister/puppet/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -42,6 +42,8 @@ class PuppetLister(Lister[PuppetListerState, PuppetListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -50,8 +52,8 @@ class PuppetLister(Lister[PuppetListerState, PuppetListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index f5141c1..5ba08eb 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -69,6 +69,8 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = PACKAGE_LIST_URL, + instance: str = INSTANCE, credentials: 
Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -76,8 +78,8 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]): ): super().__init__( scheduler=scheduler, - url=self.PACKAGE_LIST_URL, - instance=self.INSTANCE, + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py index bb317ea..4e59b90 100644 --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022 The Software Heritage developers +# Copyright (C) 2022-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -62,6 +62,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]): def __init__( self, scheduler: SchedulerInterface, + url: str = RUBY_GEMS_POSTGRES_DUMP_BASE_URL, + instance: str = INSTANCE, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, max_pages: Optional[int] = None, @@ -70,8 +72,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]): super().__init__( scheduler=scheduler, credentials=credentials, - instance=self.INSTANCE, - url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, + instance=instance, + url=url, max_origins_per_page=max_origins_per_page, max_pages=max_pages, enable_origins=enable_origins, diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index 234e198..518a7ec 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2021-2022 The Software Heritage developers +# Copyright (C) 2021-2023 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public 
License version 3, or any later version # See top-level LICENSE file for more information @@ -105,12 +105,16 @@ ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT] class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): """List origins from the "SourceForge" forge.""" + SOURCEFORGE_URL = "https://sourceforge.net" # Part of the lister API, that identifies this lister LISTER_NAME = "sourceforge" + INSTANCE = "main" def __init__( self, scheduler: SchedulerInterface, + url: str = SOURCEFORGE_URL, + instance: str = INSTANCE, incremental: bool = False, credentials: Optional[CredentialsType] = None, max_origins_per_page: Optional[int] = None, @@ -119,8 +123,8 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): ): super().__init__( scheduler=scheduler, - url="https://sourceforge.net", - instance="main", + url=url, + instance=instance, credentials=credentials, max_origins_per_page=max_origins_per_page, max_pages=max_pages, diff --git a/swh/lister/tests/test_lister_packages.py b/swh/lister/tests/test_lister_packages.py new file mode 100644 index 0000000..f0c6bef --- /dev/null +++ b/swh/lister/tests/test_lister_packages.py @@ -0,0 +1,64 @@ +# Copyright (C) 2023 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +import importlib +import inspect +import pkgutil + +import pytest + + +def lister_packages(): + import swh.lister + + return [ + mod.name + for mod in pkgutil.iter_modules(swh.lister.__path__) + if mod.ispkg and mod.name != "tests" + ] + + +@pytest.mark.parametrize("lister_package", lister_packages()) +def test_lister_has_mandatory_parameters(lister_package): + from swh.lister.pattern import Lister, StatelessLister + + lister_mandatory_params = { + "scheduler", + "url", + "instance", + "credentials", + "max_origins_per_page", 
+ "max_pages", + "enable_origins", + } + + lister_module = importlib.import_module(f"swh.lister.{lister_package}.lister") + lister_module_members = inspect.getmembers(lister_module) + for name, obj in lister_module_members: + if ( + inspect.isclass(obj) + and obj not in (Lister, StatelessLister) + and issubclass(obj, Lister) + ): + lister_params = set(inspect.getfullargspec(getattr(obj, "__init__")).args) + + missing_params = lister_mandatory_params - lister_params + + assert not missing_params, ( + f"swh.lister.{lister_package}.{name} class is missing the following " + f"parameters in its constructor: {', '.join(missing_params)}.\n" + "Please add them and transmit them to the base lister class constructor " + f"to avoid bad surprises when deploying\nthe {lister_package} lister in " + "staging or production environment." + ) + + +@pytest.mark.parametrize("lister_package", lister_packages()) +def test_lister_package_has_register_function(lister_package): + lister_module = importlib.import_module(f"swh.lister.{lister_package}") + assert hasattr(lister_module, "register"), ( + f"swh.lister.{lister_package} module is missing the register function required " + "to register its celery tasks in scheduler database." + )