pattern: Use URL network location as instance name when not provided
Make the instance parameter of the base pattern lister optional and set lister name to URL network location when not provided. It simplifies lister creation when associated forge type have a lot of instances in the wild (e.g. gitlab or cgit) while giving more details about the listed forge instance. Also process listers for forge with multiple instances (cgit, gitea, gitlab, phabricator and tuleap) to ensure URL network location will be used when instance parameter is not provided. Related to T3403
This commit is contained in:
parent
df46b22098
commit
6c12350863
7 changed files with 33 additions and 28 deletions
|
@ -58,16 +58,12 @@ class CGitLister(StatelessLister[Repositories]):
|
|||
Args:
|
||||
url: main URL of the CGit instance, i.e. url of the index
|
||||
of published git repositories on this instance.
|
||||
instance: Name of cgit instance. Defaults to url's hostname
|
||||
instance: Name of cgit instance. Defaults to url's network location
|
||||
if unset.
|
||||
base_git_url: Optional base git url which allows the origin url
|
||||
computations.
|
||||
|
||||
"""
|
||||
if not instance:
|
||||
instance = urlparse(url).hostname
|
||||
assert instance is not None # Make mypy happy
|
||||
|
||||
super().__init__(
|
||||
scheduler=scheduler, url=url, instance=instance, credentials=credentials,
|
||||
)
|
||||
|
|
|
@ -11,7 +11,6 @@ from urllib.parse import urljoin
|
|||
import iso8601
|
||||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
from urllib3.util import parse_url
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
|
@ -47,9 +46,6 @@ class GiteaLister(StatelessLister[RepoListPage]):
|
|||
page_size: int = 50,
|
||||
credentials: CredentialsType = None,
|
||||
):
|
||||
if instance is None:
|
||||
instance = parse_url(url).host
|
||||
|
||||
super().__init__(
|
||||
scheduler=scheduler, credentials=credentials, url=url, instance=instance,
|
||||
)
|
||||
|
|
|
@ -14,7 +14,6 @@ import requests
|
|||
from requests.exceptions import HTTPError
|
||||
from requests.status_codes import codes
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
from urllib3.util import parse_url
|
||||
|
||||
from swh.lister import USER_AGENT
|
||||
from swh.lister.pattern import CredentialsType, Lister
|
||||
|
@ -88,7 +87,8 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
|
|||
scheduler: a scheduler instance
|
||||
url: the api v4 url of the gitlab instance to visit (e.g.
|
||||
https://gitlab.com/api/v4/)
|
||||
instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...)
|
||||
instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...),
|
||||
url network location will be used if not provided
|
||||
incremental: defines if incremental listing is activated or not
|
||||
|
||||
"""
|
||||
|
@ -103,8 +103,6 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
|
|||
credentials: Optional[CredentialsType] = None,
|
||||
incremental: bool = False,
|
||||
):
|
||||
if instance is None:
|
||||
instance = parse_url(url).host
|
||||
super().__init__(
|
||||
scheduler=scheduler,
|
||||
url=url.rstrip("/"),
|
||||
|
|
|
@ -1,10 +1,13 @@
|
|||
# Copyright (C) 2020 The Software Heritage developers
|
||||
# Copyright (C) 2020-2021 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from swh.core.config import load_from_envvar
|
||||
from swh.core.utils import grouper
|
||||
|
@ -17,10 +20,10 @@ class ListerStats:
|
|||
pages: int = 0
|
||||
origins: int = 0
|
||||
|
||||
def __add__(self, other: "ListerStats") -> "ListerStats":
|
||||
def __add__(self, other: ListerStats) -> ListerStats:
|
||||
return self.__class__(self.pages + other.pages, self.origins + other.origins)
|
||||
|
||||
def __iadd__(self, other: "ListerStats"):
|
||||
def __iadd__(self, other: ListerStats):
|
||||
self.pages += other.pages
|
||||
self.origins += other.origins
|
||||
|
||||
|
@ -65,8 +68,8 @@ class Lister(Generic[StateType, PageType]):
|
|||
scheduler: the instance of the Scheduler being used to register the
|
||||
origins listed by this lister
|
||||
url: a URL representing this lister, e.g. the API's base URL
|
||||
instance: the instance name used, in conjunction with :attr:`LISTER_NAME`, to
|
||||
uniquely identify this lister instance.
|
||||
instance: the instance name, to uniquely identify this lister instance,
|
||||
if not provided the URL network location will be used
|
||||
credentials: dictionary of credentials for all listers. The first level
|
||||
identifies the :attr:`LISTER_NAME`, the second level the lister
|
||||
:attr:`instance`. The final level is a list of dicts containing the
|
||||
|
@ -86,14 +89,17 @@ class Lister(Generic[StateType, PageType]):
|
|||
self,
|
||||
scheduler: SchedulerInterface,
|
||||
url: str,
|
||||
instance: str,
|
||||
instance: Optional[str] = None,
|
||||
credentials: CredentialsType = None,
|
||||
):
|
||||
if not self.LISTER_NAME:
|
||||
raise ValueError("Must set the LISTER_NAME attribute on Lister classes")
|
||||
|
||||
self.url = url
|
||||
self.instance = instance
|
||||
if instance is not None:
|
||||
self.instance = instance
|
||||
else:
|
||||
self.instance = urlparse(url).netloc
|
||||
|
||||
self.scheduler = scheduler
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2019-2020 the Software Heritage developers
|
||||
# Copyright (C) 2019-2021 the Software Heritage developers
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
from collections import defaultdict
|
||||
|
@ -27,7 +27,8 @@ class PhabricatorLister(StatelessLister[PageType]):
|
|||
Args:
|
||||
url: base URL of a phabricator forge
|
||||
(for instance https://forge.softwareheritage.org)
|
||||
instance: string identifier for the listed forge
|
||||
instance: string identifier for the listed forge,
|
||||
URL network location will be used if not provided
|
||||
api_token: authentication token for Conduit API
|
||||
"""
|
||||
|
||||
|
@ -38,7 +39,7 @@ class PhabricatorLister(StatelessLister[PageType]):
|
|||
self,
|
||||
scheduler: SchedulerInterface,
|
||||
url: str,
|
||||
instance: str,
|
||||
instance: Optional[str] = None,
|
||||
api_token: Optional[str] = None,
|
||||
credentials: CredentialsType = None,
|
||||
):
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (C) 2020 The Software Heritage developers
|
||||
# Copyright (C) 2020-2021 The Software Heritage developers
|
||||
# See the AUTHORS file at the top-level directory of this distribution
|
||||
# License: GNU General Public License version 3, or any later version
|
||||
# See top-level LICENSE file for more information
|
||||
|
@ -39,6 +39,18 @@ def test_instantiation(swh_scheduler):
|
|||
lister.run()
|
||||
|
||||
|
||||
def test_lister_instance_name(swh_scheduler):
|
||||
lister = InstantiableLister(
|
||||
scheduler=swh_scheduler, url="https://example.org", instance="example"
|
||||
)
|
||||
|
||||
assert lister.instance == "example"
|
||||
|
||||
lister = InstantiableLister(scheduler=swh_scheduler, url="https://example.org")
|
||||
|
||||
assert lister.instance == "example.org"
|
||||
|
||||
|
||||
def test_instantiation_from_configfile(swh_scheduler, mocker):
|
||||
mock_load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
|
||||
mock_get_scheduler = mocker.patch("swh.lister.pattern.get_scheduler")
|
||||
|
|
|
@ -10,7 +10,6 @@ from urllib.parse import urljoin
|
|||
import iso8601
|
||||
import requests
|
||||
from tenacity.before_sleep import before_sleep_log
|
||||
from urllib3.util import parse_url
|
||||
|
||||
from swh.lister.utils import throttling_retry
|
||||
from swh.scheduler.interface import SchedulerInterface
|
||||
|
@ -51,9 +50,6 @@ class TuleapLister(StatelessLister[RepoPage]):
|
|||
instance: Optional[str] = None,
|
||||
credentials: CredentialsType = None,
|
||||
):
|
||||
if instance is None:
|
||||
instance = parse_url(url).host
|
||||
|
||||
super().__init__(
|
||||
scheduler=scheduler, credentials=credentials, url=url, instance=instance,
|
||||
)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue