Harmonize lister parameters and add a test to check mandatory ones

Ensure that all lister classes have the same set of mandatory parameters
in their constructors, notably: scheduler, url, instance and credentials.

Add a new test checking that lister classes have the mandatory parameters
declared in their constructors. The purpose is to avoid deployment issues in
staging or production environments, as celery tasks can fail to be executed if
mandatory parameters are not handled by listers.

Related to swh/infra/sysadm-environment#5030.
This commit is contained in:
Antoine Lambert 2023-09-06 11:55:28 +02:00
parent 5f717e311d
commit 6e7bc49ec7
27 changed files with 231 additions and 73 deletions

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -80,6 +80,7 @@ class ArchLister(StatelessLister[ArchListerPage]):
VISIT_TYPE = "arch"
INSTANCE = "arch"
BASE_URL = "https://archlinux.org"
ARCH_PACKAGE_URL_PATTERN = "{base_url}/packages/{repo}/{arch}/{pkgname}"
ARCH_PACKAGE_VERSIONS_URL_PATTERN = "{base_url}/packages/{pkgname[0]}/{pkgname}"
ARCH_PACKAGE_DOWNLOAD_URL_PATTERN = (
@ -93,6 +94,8 @@ class ArchLister(StatelessLister[ArchListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = BASE_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -119,8 +122,8 @@ class ArchLister(StatelessLister[ArchListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
url=flavours["official"]["base_info_url"],
instance=self.INSTANCE,
url=url,
instance=instance,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -46,6 +46,8 @@ class AurLister(StatelessLister[AurListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = BASE_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -54,8 +56,8 @@ class AurLister(StatelessLister[AurListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.BASE_URL,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -51,6 +51,8 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = API_URL,
instance: str = INSTANCE,
page_size: int = 1000,
incremental: bool = True,
credentials: CredentialsType = None,
@ -61,8 +63,8 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
url=self.API_URL,
instance=self.INSTANCE,
url=url,
instance=instance,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -29,6 +29,8 @@ class BowerLister(StatelessLister[BowerListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = API_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -37,8 +39,8 @@ class BowerLister(StatelessLister[BowerListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.API_URL,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -39,6 +39,7 @@ class CondaLister(StatelessLister[CondaListerPage]):
scheduler: SchedulerInterface,
credentials: Optional[CredentialsType] = None,
url: str = BASE_REPO_URL,
instance: str = INSTANCE,
channel: str = "",
archs: List = [],
max_origins_per_page: Optional[int] = None,
@ -48,7 +49,7 @@ class CondaLister(StatelessLister[CondaListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -80,6 +80,8 @@ class CpanLister(StatelessLister[CpanListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = API_BASE_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -88,8 +90,8 @@ class CpanLister(StatelessLister[CpanListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.API_BASE_URL,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -34,10 +34,13 @@ class CRANLister(StatelessLister[PageType]):
"""
LISTER_NAME = "cran"
INSTANCE = "cran"
def __init__(
self,
scheduler: SchedulerInterface,
url: str = CRAN_MIRROR_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -45,8 +48,8 @@ class CRANLister(StatelessLister[PageType]):
):
super().__init__(
scheduler,
url=CRAN_MIRROR_URL,
instance="cran",
url=url,
instance=instance,
credentials=credentials,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -65,6 +65,8 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = BASE_URL,
instance: str = INSTANCE,
credentials: CredentialsType = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -73,8 +75,8 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
url=self.BASE_URL,
instance=self.INSTANCE,
url=url,
instance=instance,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2017-2022 The Software Heritage developers
# Copyright (C) 2017-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -68,12 +68,14 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
"""
LISTER_NAME = "debian"
MIRROR_URL = "http://deb.debian.org/debian/"
INSTANCE = "Debian"
def __init__(
self,
scheduler: SchedulerInterface,
distribution: str = "Debian",
mirror_url: str = "http://deb.debian.org/debian/",
url: str = MIRROR_URL,
instance: str = INSTANCE,
suites: List[Suite] = ["stretch", "buster", "bullseye"],
components: List[Component] = ["main", "contrib", "non-free"],
credentials: Optional[CredentialsType] = None,
@ -83,8 +85,8 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
):
super().__init__(
scheduler=scheduler,
url=mirror_url,
instance=distribution,
url=url,
instance=instance,
credentials=credentials,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
@ -95,7 +97,7 @@ class DebianLister(Lister[DebianListerState, DebianPageType]):
if not self.url.endswith("/"):
self.url += "/"
self.distribution = distribution
self.distribution = instance
self.suites = suites
self.components = components

View file

@ -10,6 +10,11 @@ from .lister import DebianLister
@shared_task(name=__name__ + ".DebianListerTask")
def list_debian_distribution(**lister_args):
    """List a Debian distribution.

    The keyword arguments are forwarded to ``DebianLister.from_configfile``;
    the legacy names ``mirror_url`` and ``distribution`` are first renamed to
    ``url`` and ``instance`` respectively. Returns the dict form of the
    value produced by the lister's ``run()``.
    """
    # for backward compatibility with previous parameter names
    renamed_parameters = (
        ("mirror_url", "url"),
        ("distribution", "instance"),
    )
    for old_name, new_name in renamed_parameters:
        if old_name in lister_args:
            lister_args[new_name] = lister_args.pop(old_name)

    lister = DebianLister.from_configfile(**lister_args)
    return lister.run().dict()

View file

@ -65,7 +65,7 @@ def _init_test(
lister = DebianLister(
scheduler=swh_scheduler,
mirror_url=_mirror_url,
url=_mirror_url,
suites=list(debian_sources.keys()),
components=_components,
)

View file

@ -24,8 +24,8 @@ def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
lister.run.return_value = stats
kwargs = dict(
mirror_url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/",
distribution="Ubuntu",
url="http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/",
instance="Ubuntu",
suites=["xenial", "bionic", "focal"],
components=["main", "multiverse", "restricted", "universe"],
)
@ -41,3 +41,35 @@ def test_lister(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
lister.run.assert_called_once_with()
assert res.result == stats.dict()
@patch("swh.lister.debian.tasks.DebianLister")
def test_lister_old_parameter_names(
    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
    """Check the Debian lister task still accepts the previous parameter names.

    The task is sent with ``mirror_url`` and ``distribution``; the mocked
    lister must then be built with ``url`` and ``instance`` instead.
    """
    # setup the mocked DebianLister
    expected_stats = ListerStats(pages=12, origins=35618)
    lister.from_configfile.return_value = lister
    lister.run.return_value = expected_stats

    task_kwargs = {
        "mirror_url": "http://www-ftp.lip6.fr/pub/linux/distributions/Ubuntu/archive/",
        "distribution": "Ubuntu",
        "suites": ["xenial", "bionic", "focal"],
        "components": ["main", "multiverse", "restricted", "universe"],
    }

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.debian.tasks.DebianListerTask", kwargs=task_kwargs
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    # the task is expected to translate the old names into the new ones
    new_names = {"mirror_url": "url", "distribution": "instance"}
    expected_kwargs = {
        new_names.get(key, key): value for key, value in task_kwargs.items()
    }

    lister.from_configfile.assert_called_once_with(**expected_kwargs)
    lister.run.assert_called_once_with()
    assert task_result.result == expected_stats.dict()

View file

@ -1,4 +1,4 @@
# Copyright (C) 2020-2022 The Software Heritage developers
# Copyright (C) 2020-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -62,6 +62,7 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
""" # noqa: B950
LISTER_NAME = "github"
INSTANCE = "github"
API_URL = "https://api.github.com/repositories"
PAGE_SIZE = 1000
@ -69,6 +70,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = API_URL,
instance: str = INSTANCE,
credentials: CredentialsType = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -79,8 +82,8 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
url=self.API_URL,
instance="github",
url=url,
instance=instance,
with_github_session=True,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2019-2021 The Software Heritage developers
# Copyright (C) 2019-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -25,11 +25,14 @@ class GNULister(StatelessLister[GNUPageType]):
"""
LISTER_NAME = "GNU"
INSTANCE = "GNU"
GNU_FTP_URL = "https://ftp.gnu.org"
def __init__(
self,
scheduler: SchedulerInterface,
url: str = GNU_FTP_URL,
instance: str = INSTANCE,
credentials: CredentialsType = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -37,8 +40,8 @@ class GNULister(StatelessLister[GNUPageType]):
):
super().__init__(
scheduler=scheduler,
url=self.GNU_FTP_URL,
instance="GNU",
url=url,
instance=instance,
credentials=credentials,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -45,6 +45,8 @@ class GolangLister(Lister[GolangStateType, GolangPageType]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = GOLANG_MODULES_INDEX_URL,
instance: str = LISTER_NAME,
incremental: bool = False,
credentials: CredentialsType = None,
max_origins_per_page: Optional[int] = None,
@ -53,8 +55,8 @@ class GolangLister(Lister[GolangStateType, GolangPageType]):
):
super().__init__(
scheduler=scheduler,
url=self.GOLANG_MODULES_INDEX_URL,
instance=self.LISTER_NAME,
url=url,
instance=instance,
credentials=credentials,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -43,17 +43,18 @@ class HackageLister(Lister[HackageListerState, HackageListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = BASE_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
enable_origins: bool = True,
url: Optional[str] = None,
):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=url if url else self.BASE_URL,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -1,11 +1,11 @@
# Copyright (C) 2021-2022 The Software Heritage developers
# Copyright (C) 2021-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import asdict, dataclass
import logging
from typing import Any, Dict, Iterator, List
from typing import Any, Dict, Iterator, List, Optional
from urllib.parse import urljoin
import iso8601
@ -46,15 +46,22 @@ class HexLister(Lister[HexListerState, HexListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
instance: str = "hex",
url: str = HEX_API_URL,
instance: str = LISTER_NAME,
page_size: int = 100,
credentials: CredentialsType = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
enable_origins: bool = True,
):
super().__init__(
scheduler=scheduler,
credentials=credentials,
url=self.HEX_API_URL,
url=url,
instance=instance,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,
)
# TODO: Add authentication support
self.page_size = page_size

View file

@ -59,11 +59,14 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]):
will be returned
"""
LAUNCHPAD_URL = "https://launchpad.net/"
LISTER_NAME = "launchpad"
def __init__(
self,
scheduler: SchedulerInterface,
url: str = LAUNCHPAD_URL,
instance: str = LISTER_NAME,
incremental: bool = False,
credentials: CredentialsType = None,
max_origins_per_page: Optional[int] = None,
@ -72,8 +75,8 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]):
):
super().__init__(
scheduler=scheduler,
url="https://launchpad.net/",
instance="launchpad",
url=url,
instance=instance,
credentials=credentials,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2018-2022 the Software Heritage developers
# Copyright (C) 2018-2023 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -50,6 +50,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = API_FULL_LISTING_URL,
instance: str = INSTANCE,
page_size: int = 1000,
incremental: bool = False,
credentials: CredentialsType = None,
@ -60,10 +62,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
url=self.API_INCREMENTAL_LISTING_URL
if incremental
else self.API_FULL_LISTING_URL,
instance=self.INSTANCE,
url=url,
instance=instance,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,
@ -75,6 +75,8 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]):
# provided as the startkey query parameter value, so we increment the page
# size by one to avoid double package processing
self.page_size += 1
else:
self.url = self.API_INCREMENTAL_LISTING_URL
self.incremental = incremental
self.session.headers.update({"Accept": "application/json"})

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -43,6 +43,8 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = API_INDEX_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -51,8 +53,8 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.API_INDEX_URL,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -55,6 +55,7 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
"""
LISTER_NAME = "Packagist"
INSTANCE = "packagist"
PACKAGIST_PACKAGES_LIST_URL = "https://packagist.org/packages/list.json"
PACKAGIST_PACKAGE_URL_FORMATS = [
# preferred, static, efficient on their side as it can be cached
@ -72,6 +73,8 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = PACKAGIST_PACKAGES_LIST_URL,
instance: str = INSTANCE,
credentials: CredentialsType = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -80,8 +83,8 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]):
):
super().__init__(
scheduler=scheduler,
url=self.PACKAGIST_PACKAGES_LIST_URL,
instance="packagist",
url=url,
instance=instance,
credentials=credentials,
with_github_session=True,
max_origins_per_page=max_origins_per_page,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -35,6 +35,8 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = BASE_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -43,8 +45,8 @@ class PubDevLister(StatelessLister[PubDevListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.BASE_URL,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -42,6 +42,8 @@ class PuppetLister(Lister[PuppetListerState, PuppetListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = BASE_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -50,8 +52,8 @@ class PuppetLister(Lister[PuppetListerState, PuppetListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.BASE_URL,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -69,6 +69,8 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = PACKAGE_LIST_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -76,8 +78,8 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]):
):
super().__init__(
scheduler=scheduler,
url=self.PACKAGE_LIST_URL,
instance=self.INSTANCE,
url=url,
instance=instance,
credentials=credentials,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -62,6 +62,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
def __init__(
self,
scheduler: SchedulerInterface,
url: str = RUBY_GEMS_POSTGRES_DUMP_BASE_URL,
instance: str = INSTANCE,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
max_pages: Optional[int] = None,
@ -70,8 +72,8 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]):
super().__init__(
scheduler=scheduler,
credentials=credentials,
instance=self.INSTANCE,
url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL,
instance=instance,
url=url,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,
enable_origins=enable_origins,

View file

@ -1,4 +1,4 @@
# Copyright (C) 2021-2022 The Software Heritage developers
# Copyright (C) 2021-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -105,12 +105,16 @@ ProjectsLastModifiedCache = Dict[Tuple[str, str], LastModifiedT]
class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
"""List origins from the "SourceForge" forge."""
SOURCEFORGE_URL = "https://sourceforge.net"
# Part of the lister API, that identifies this lister
LISTER_NAME = "sourceforge"
INSTANCE = "main"
def __init__(
self,
scheduler: SchedulerInterface,
url: str = SOURCEFORGE_URL,
instance: str = INSTANCE,
incremental: bool = False,
credentials: Optional[CredentialsType] = None,
max_origins_per_page: Optional[int] = None,
@ -119,8 +123,8 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]):
):
super().__init__(
scheduler=scheduler,
url="https://sourceforge.net",
instance="main",
url=url,
instance=instance,
credentials=credentials,
max_origins_per_page=max_origins_per_page,
max_pages=max_pages,

View file

@ -0,0 +1,64 @@
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import importlib
import inspect
import pkgutil
import pytest
def lister_packages():
    """Return the names of the sub-packages found in ``swh.lister``.

    Plain modules are ignored, as is the ``tests`` package.
    """
    import swh.lister

    package_names = []
    for module_info in pkgutil.iter_modules(swh.lister.__path__):
        if not module_info.ispkg:
            continue
        if module_info.name == "tests":
            continue
        package_names.append(module_info.name)
    return package_names
@pytest.mark.parametrize("lister_package", lister_packages())
def test_lister_has_mandatory_parameters(lister_package):
    """Check lister classes declare the mandatory constructor parameters.

    Every class found in ``swh.lister.<lister_package>.lister`` that derives
    from ``Lister`` (the base ``Lister`` and ``StatelessLister`` classes
    excepted) must name each mandatory parameter in its ``__init__``
    signature, either as a regular or as a keyword-only parameter.
    """
    from swh.lister.pattern import Lister, StatelessLister

    lister_mandatory_params = {
        "scheduler",
        "url",
        "instance",
        "credentials",
        "max_origins_per_page",
        "max_pages",
        "enable_origins",
    }

    lister_module = importlib.import_module(f"swh.lister.{lister_package}.lister")
    lister_module_members = inspect.getmembers(lister_module)
    for name, obj in lister_module_members:
        if (
            inspect.isclass(obj)
            and obj not in (Lister, StatelessLister)
            and issubclass(obj, Lister)
        ):
            argspec = inspect.getfullargspec(getattr(obj, "__init__"))
            # Parameters declared after ``*`` are reported in ``kwonlyargs``
            # rather than ``args``; they can still be passed by name, so they
            # must count as declared.
            lister_params = set(argspec.args) | set(argspec.kwonlyargs)
            # Sorted so the failure message is identical from one run to the next.
            missing_params = sorted(lister_mandatory_params - lister_params)
            assert not missing_params, (
                f"swh.lister.{lister_package}.{name} class is missing the following "
                f"parameters in its constructor: {', '.join(missing_params)}.\n"
                "Please add them and transmit them to the base lister class constructor "
                f"to avoid bad surprises when deploying\nthe {lister_package} lister in "
                "staging or production environment."
            )
@pytest.mark.parametrize("lister_package", lister_packages())
def test_lister_package_has_register_function(lister_package):
    """Check the ``swh.lister.<lister_package>`` package exposes ``register``."""
    package = importlib.import_module(f"swh.lister.{lister_package}")
    failure_message = (
        f"swh.lister.{lister_package} module is missing the register function required "
        "to register its celery tasks in scheduler database."
    )
    assert hasattr(package, "register"), failure_message