From e785e67315c63cdc4acca64e9c32e8e7e270074b Mon Sep 17 00:00:00 2001 From: Nicolas Dandrimont Date: Mon, 5 Dec 2022 16:33:45 +0100 Subject: [PATCH] Hook up recently introduced options to all listers Hopefully one day we'll be able to replace all of this mess with PEP692 TypedDict kwargs, but that's only on track for Python 3.12. --- swh/lister/arch/lister.py | 6 ++++++ swh/lister/aur/lister.py | 6 ++++++ swh/lister/bitbucket/lister.py | 6 ++++++ swh/lister/bower/lister.py | 6 ++++++ swh/lister/cgit/lister.py | 6 ++++++ swh/lister/conda/lister.py | 6 ++++++ swh/lister/cpan/lister.py | 6 ++++++ swh/lister/cran/lister.py | 11 ++++++++++- swh/lister/crates/lister.py | 6 ++++++ swh/lister/debian/lister.py | 6 ++++++ swh/lister/fedora/lister.py | 8 +++++++- swh/lister/github/lister.py | 6 ++++++ swh/lister/gitlab/lister.py | 6 ++++++ swh/lister/gnu/lister.py | 6 ++++++ swh/lister/gogs/lister.py | 6 ++++++ swh/lister/golang/lister.py | 6 ++++++ swh/lister/hackage/lister.py | 6 ++++++ swh/lister/launchpad/lister.py | 6 ++++++ swh/lister/maven/lister.py | 6 ++++++ swh/lister/nixguix/lister.py | 6 ++++++ swh/lister/npm/lister.py | 6 ++++++ swh/lister/nuget/lister.py | 6 ++++++ swh/lister/opam/lister.py | 6 ++++++ swh/lister/packagist/lister.py | 6 ++++++ swh/lister/phabricator/lister.py | 11 ++++++++++- swh/lister/pubdev/lister.py | 6 ++++++ swh/lister/puppet/lister.py | 6 ++++++ swh/lister/pypi/lister.py | 6 ++++++ swh/lister/rubygems/lister.py | 6 ++++++ swh/lister/sourceforge/lister.py | 6 ++++++ swh/lister/tuleap/lister.py | 6 ++++++ 31 files changed, 195 insertions(+), 3 deletions(-) diff --git a/swh/lister/arch/lister.py b/swh/lister/arch/lister.py index 563fa18..c281f22 100644 --- a/swh/lister/arch/lister.py +++ b/swh/lister/arch/lister.py @@ -94,6 +94,9 @@ class ArchLister(StatelessLister[ArchListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, flavours: Dict[str, Any] = { "official": { "archs": ["x86_64"], @@ -118,6 +121,9 @@ class ArchLister(StatelessLister[ArchListerPage]): credentials=credentials, url=flavours["official"]["base_info_url"], instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.flavours = flavours diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py index 9bbdf37..dc43d7d 100644 --- a/swh/lister/aur/lister.py +++ b/swh/lister/aur/lister.py @@ -47,12 +47,18 @@ class AurLister(StatelessLister[AurListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def download_packages_index(self) -> List[Dict[str, Any]]: diff --git a/swh/lister/bitbucket/lister.py b/swh/lister/bitbucket/lister.py index 7bcec03..05720c9 100644 --- a/swh/lister/bitbucket/lister.py +++ b/swh/lister/bitbucket/lister.py @@ -53,12 +53,18 @@ class BitbucketLister(Lister[BitbucketListerState, List[Dict[str, Any]]]): page_size: int = 1000, incremental: bool = True, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.API_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental diff --git a/swh/lister/bower/lister.py b/swh/lister/bower/lister.py index 5b488e4..cc440dc 100644 --- a/swh/lister/bower/lister.py +++ b/swh/lister/bower/lister.py @@ -30,12 +30,18 @@ class BowerLister(StatelessLister[BowerListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 49458d0..4a9aeab 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -50,6 +50,9 @@ class CGitLister(StatelessLister[Repositories]): instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, base_git_url: Optional[str] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): """Lister class for CGit repositories. @@ -67,6 +70,9 @@ class CGitLister(StatelessLister[Repositories]): url=url, instance=instance, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/html"}) diff --git a/swh/lister/conda/lister.py b/swh/lister/conda/lister.py index ab0190f..4f5cb40 100644 --- a/swh/lister/conda/lister.py +++ b/swh/lister/conda/lister.py @@ -41,12 +41,18 @@ class CondaLister(StatelessLister[CondaListerPage]): url: str = BASE_REPO_URL, channel: str = "", archs: List = [], + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=url, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.channel: str = channel self.archs: List[str] = archs diff --git a/swh/lister/cpan/lister.py b/swh/lister/cpan/lister.py index 32f7479..80669eb 100644 --- a/swh/lister/cpan/lister.py +++ b/swh/lister/cpan/lister.py @@ -81,12 +81,18 @@ class CpanLister(StatelessLister[CpanListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.artifacts: Dict[str, List[Dict[str, Any]]] = defaultdict(list) diff --git a/swh/lister/cran/lister.py b/swh/lister/cran/lister.py index 35e3d2b..728c6d3 100644 --- a/swh/lister/cran/lister.py +++ b/swh/lister/cran/lister.py @@ -32,9 +32,18 @@ class CRANLister(StatelessLister[PageType]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( - scheduler, url=CRAN_MIRROR, instance="cran", credentials=credentials + scheduler, + url=CRAN_MIRROR, + instance="cran", + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def get_pages(self) -> Iterator[PageType]: diff --git a/swh/lister/crates/lister.py b/swh/lister/crates/lister.py index eca9f10..6b8c94a 100644 --- a/swh/lister/crates/lister.py +++ b/swh/lister/crates/lister.py @@ -66,12 +66,18 @@ class CratesLister(Lister[CratesListerState, CratesListerPage]): self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=self.BASE_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.index_metadata: Dict[str, str] = {} diff --git a/swh/lister/debian/lister.py b/swh/lister/debian/lister.py index 940e453..23d520a 100644 --- a/swh/lister/debian/lister.py +++ b/swh/lister/debian/lister.py @@ -77,12 +77,18 @@ class DebianLister(Lister[DebianListerState, DebianPageType]): suites: List[Suite] = ["stretch", "buster", "bullseye"], components: List[Component] = ["main", "contrib", "non-free"], credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=mirror_url, instance=distribution, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # to ensure urljoin will produce valid Sources URL diff --git a/swh/lister/fedora/lister.py b/swh/lister/fedora/lister.py index 8f3dced..34712b3 100644 --- a/swh/lister/fedora/lister.py +++ b/swh/lister/fedora/lister.py @@ -6,7 +6,7 @@ from dataclasses import dataclass, field from datetime import datetime, timezone import logging -from typing import Any, Dict, Iterator, List, Set, Type +from typing import Any, Dict, Iterator, List, Optional, Set, Type from urllib.error import HTTPError from urllib.parse import urljoin @@ -91,12 +91,18 @@ class FedoraLister(Lister[FedoraListerState, FedoraPageType]): instance: str = "fedora", url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/", releases: List[Release] = [34, 35, 36], + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=url, instance=instance, credentials={}, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.releases = releases diff --git a/swh/lister/github/lister.py b/swh/lister/github/lister.py index 5728727..738c516 100644 --- a/swh/lister/github/lister.py +++ b/swh/lister/github/lister.py @@ -70,6 +70,9 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, first_id: Optional[int] = None, last_id: Optional[int] = None, ): @@ -79,6 +82,9 @@ class GitHubLister(Lister[GitHubListerState, List[Dict[str, Any]]]): url=self.API_URL, instance="github", with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.first_id = first_id diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index 7823ee2..3ad2bfd 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -103,6 +103,9 @@ class GitLabLister(Lister[GitLabListerState, PageResult]): name: Optional[str] = "gitlab", instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, incremental: bool = False, ignored_project_prefixes: Optional[List[str]] = None, ): @@ -113,6 +116,9 @@ class GitLabLister(Lister[GitLabListerState, PageResult]): url=url.rstrip("/"), instance=instance, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.last_page: Optional[str] = None diff --git a/swh/lister/gnu/lister.py b/swh/lister/gnu/lister.py index 65eca1f..721bdc2 100644 --- a/swh/lister/gnu/lister.py +++ b/swh/lister/gnu/lister.py @@ -31,12 +31,18 @@ class GNULister(StatelessLister[GNUPageType]): self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.GNU_FTP_URL, instance="GNU", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # no side-effect calls in constructor, if extra state is needed, as preconized # by the pattern docstring, this must happen in the get_pages method. diff --git a/swh/lister/gogs/lister.py b/swh/lister/gogs/lister.py index ce8a398..cdc5576 100644 --- a/swh/lister/gogs/lister.py +++ b/swh/lister/gogs/lister.py @@ -75,12 +75,18 @@ class GogsLister(Lister[GogsListerState, GogsListerPage]): api_token: Optional[str] = None, page_size: int = 50, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.query_params = { diff --git a/swh/lister/golang/lister.py b/swh/lister/golang/lister.py index 10e5935..36a247b 100644 --- a/swh/lister/golang/lister.py +++ b/swh/lister/golang/lister.py @@ -47,12 +47,18 @@ class GolangLister(Lister[GolangStateType, GolangPageType]): scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.GOLANG_MODULES_INDEX_URL, instance=self.LISTER_NAME, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/hackage/lister.py b/swh/lister/hackage/lister.py index 04fb6f2..a86ff67 100644 --- a/swh/lister/hackage/lister.py +++ b/swh/lister/hackage/lister.py @@ -44,6 +44,9 @@ class HackageLister(Lister[HackageListerState, HackageListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, url: Optional[str] = None, ): super().__init__( @@ -51,6 +54,9 @@ class HackageLister(Lister[HackageListerState, HackageListerPage]): credentials=credentials, instance=self.INSTANCE, url=url if url else self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Ensure to set this with same value as the http api search endpoint use # (50 as of august 2022) diff --git a/swh/lister/launchpad/lister.py b/swh/lister/launchpad/lister.py index e9c36fa..b9daa18 100644 --- a/swh/lister/launchpad/lister.py +++ b/swh/lister/launchpad/lister.py @@ -66,12 +66,18 @@ class LaunchpadLister(Lister[LaunchpadListerState, LaunchpadPageType]): scheduler: SchedulerInterface, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url="https://launchpad.net/", instance="launchpad", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.incremental = incremental self.date_last_modified: Dict[str, Optional[datetime]] = { diff --git a/swh/lister/maven/lister.py b/swh/lister/maven/lister.py index 195a8a3..8dc702c 100644 --- a/swh/lister/maven/lister.py +++ b/swh/lister/maven/lister.py @@ -61,6 +61,9 @@ class MavenLister(Lister[MavenListerState, RepoPage]): index_url: str = None, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, incremental: bool = True, ): """Lister class for Maven repositories. @@ -88,6 +91,9 @@ class MavenLister(Lister[MavenListerState, RepoPage]): url=url, instance=instance, with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/nixguix/lister.py b/swh/lister/nixguix/lister.py index 3e410aa..3440a8e 100644 --- a/swh/lister/nixguix/lister.py +++ b/swh/lister/nixguix/lister.py @@ -320,6 +320,9 @@ class NixGuixLister(StatelessLister[PageResult]): origin_upstream: str, instance: Optional[str] = None, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, # canonicalize urls, can be turned off during docker runs canonicalize: bool = True, extensions_to_ignore: List[str] = [], @@ -331,6 +334,9 @@ class NixGuixLister(StatelessLister[PageResult]): instance=instance, credentials=credentials, with_github_session=canonicalize, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # either full fqdn NixOS/nixpkgs or guix repository urls # maybe add an assert on those specific urls? diff --git a/swh/lister/npm/lister.py b/swh/lister/npm/lister.py index b940699..f10c02d 100644 --- a/swh/lister/npm/lister.py +++ b/swh/lister/npm/lister.py @@ -53,6 +53,9 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): page_size: int = 1000, incremental: bool = False, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, @@ -61,6 +64,9 @@ class NpmLister(Lister[NpmListerState, List[Dict[str, Any]]]): if incremental else self.API_FULL_LISTING_URL, instance=self.INSTANCE, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.page_size = page_size diff --git a/swh/lister/nuget/lister.py b/swh/lister/nuget/lister.py index 54a6c22..98f9fc9 100644 --- a/swh/lister/nuget/lister.py +++ b/swh/lister/nuget/lister.py @@ -44,12 +44,18 @@ class NugetLister(Lister[NugetListerState, NugetListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.API_INDEX_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.listing_date: Optional[datetime] = None diff --git a/swh/lister/opam/lister.py b/swh/lister/opam/lister.py index 724d198..6b54e66 100644 --- a/swh/lister/opam/lister.py +++ b/swh/lister/opam/lister.py @@ -45,6 +45,9 @@ class OpamLister(StatelessLister[PageType]): url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, opam_root: str = "/tmp/opam/", ): super().__init__( @@ -52,6 +55,9 @@ class OpamLister(StatelessLister[PageType]): credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.env = os.environ.copy() # Opam root folder is initialized in the :meth:`get_pages` method as no diff --git a/swh/lister/packagist/lister.py b/swh/lister/packagist/lister.py index e9fa296..af57b55 100644 --- a/swh/lister/packagist/lister.py +++ b/swh/lister/packagist/lister.py @@ -53,6 +53,9 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): self, scheduler: SchedulerInterface, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, @@ -60,6 +63,9 @@ class PackagistLister(Lister[PackagistListerState, PackagistPageType]): instance="packagist", credentials=credentials, with_github_session=True, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py index 4556178..651dc8e 100644 --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -40,9 +40,18 @@ class PhabricatorLister(StatelessLister[PageType]): instance: Optional[str] = None, api_token: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( - scheduler, urljoin(url, self.API_REPOSITORY_PATH), instance, credentials + scheduler=scheduler, + url=urljoin(url, self.API_REPOSITORY_PATH), + instance=instance, + credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/pubdev/lister.py b/swh/lister/pubdev/lister.py index fd1dc45..50e4f15 100644 --- a/swh/lister/pubdev/lister.py +++ b/swh/lister/pubdev/lister.py @@ -36,12 +36,18 @@ class PubDevLister(StatelessLister[PubDevListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"}) diff --git a/swh/lister/puppet/lister.py b/swh/lister/puppet/lister.py index 39deecf..6e84b27 100644 --- a/swh/lister/puppet/lister.py +++ b/swh/lister/puppet/lister.py @@ -43,12 +43,18 @@ class PuppetLister(Lister[PuppetListerState, PuppetListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Store the datetime the lister runs for incremental purpose self.listing_date = datetime.now() diff --git a/swh/lister/pypi/lister.py b/swh/lister/pypi/lister.py index 443c21d..64f14fa 100644 --- a/swh/lister/pypi/lister.py +++ b/swh/lister/pypi/lister.py @@ -70,12 +70,18 @@ class PyPILister(Lister[PyPIListerState, PackageListPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url=self.PACKAGE_LIST_URL, instance=self.INSTANCE, credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # used as termination condition and if useful, becomes the new state when the diff --git a/swh/lister/rubygems/lister.py b/swh/lister/rubygems/lister.py index 917a2d6..bb317ea 100644 --- a/swh/lister/rubygems/lister.py +++ b/swh/lister/rubygems/lister.py @@ -63,12 +63,18 @@ class RubyGemsLister(StatelessLister[RubyGemsListerPage]): self, scheduler: SchedulerInterface, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, instance=self.INSTANCE, url=self.RUBY_GEMS_POSTGRES_DUMP_BASE_URL, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) def get_latest_dump_file(self) -> str: diff --git a/swh/lister/sourceforge/lister.py b/swh/lister/sourceforge/lister.py index ba8c412..234e198 100644 --- a/swh/lister/sourceforge/lister.py +++ b/swh/lister/sourceforge/lister.py @@ -113,12 +113,18 @@ class SourceForgeLister(Lister[SourceForgeListerState, SourceForgeListerPage]): scheduler: SchedulerInterface, incremental: bool = False, credentials: Optional[CredentialsType] = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, url="https://sourceforge.net", instance="main", credentials=credentials, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) # Will hold the currently saved "last modified" dates to compare against our diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py index 4a55499..ce5cadf 100644 --- a/swh/lister/tuleap/lister.py +++ b/swh/lister/tuleap/lister.py @@ -45,12 +45,18 @@ class TuleapLister(StatelessLister[RepoPage]): url: str, instance: Optional[str] = None, credentials: CredentialsType = None, + max_origins_per_page: Optional[int] = None, + max_pages: Optional[int] = None, + enable_origins: bool = True, ): super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, + max_origins_per_page=max_origins_per_page, + max_pages=max_pages, + enable_origins=enable_origins, ) self.session.headers.update({"Accept": "application/json"})