pattern: Use URL network location as instance name when not provided

Make the instance parameter of the base pattern lister optional and set
lister name to URL network location when not provided.

It simplifies lister creation when associated forge type have a lot of
instances in the wild (e.g. gitlab or cgit) while giving more details
about the listed forge instance.

Also process listers for forge with multiple instances (cgit, gitea,
gitlab, phabricator and tuleap) to ensure URL network location will be
used when instance parameter is not provided.

Related to T3403
This commit is contained in:
Antoine Lambert 2021-07-13 12:33:41 +02:00
parent df46b22098
commit 6c12350863
7 changed files with 33 additions and 28 deletions

View file

@ -58,16 +58,12 @@ class CGitLister(StatelessLister[Repositories]):
Args:
url: main URL of the CGit instance, i.e. url of the index
of published git repositories on this instance.
instance: Name of cgit instance. Defaults to url's hostname
instance: Name of cgit instance. Defaults to url's network location
if unset.
base_git_url: Optional base git url which allows the origin url
computations.
"""
if not instance:
instance = urlparse(url).hostname
assert instance is not None # Make mypy happy
super().__init__(
scheduler=scheduler, url=url, instance=instance, credentials=credentials,
)

View file

@ -11,7 +11,6 @@ from urllib.parse import urljoin
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from urllib3.util import parse_url
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
@ -47,9 +46,6 @@ class GiteaLister(StatelessLister[RepoListPage]):
page_size: int = 50,
credentials: CredentialsType = None,
):
if instance is None:
instance = parse_url(url).host
super().__init__(
scheduler=scheduler, credentials=credentials, url=url, instance=instance,
)

View file

@ -14,7 +14,6 @@ import requests
from requests.exceptions import HTTPError
from requests.status_codes import codes
from tenacity.before_sleep import before_sleep_log
from urllib3.util import parse_url
from swh.lister import USER_AGENT
from swh.lister.pattern import CredentialsType, Lister
@ -88,7 +87,8 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
scheduler: a scheduler instance
url: the api v4 url of the gitlab instance to visit (e.g.
https://gitlab.com/api/v4/)
instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...)
instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...),
url network location will be used if not provided
incremental: defines if incremental listing is activated or not
"""
@ -103,8 +103,6 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
credentials: Optional[CredentialsType] = None,
incremental: bool = False,
):
if instance is None:
instance = parse_url(url).host
super().__init__(
scheduler=scheduler,
url=url.rstrip("/"),

View file

@ -1,10 +1,13 @@
# Copyright (C) 2020 The Software Heritage developers
# Copyright (C) 2020-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from __future__ import annotations
from dataclasses import dataclass
from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar
from urllib.parse import urlparse
from swh.core.config import load_from_envvar
from swh.core.utils import grouper
@ -17,10 +20,10 @@ class ListerStats:
pages: int = 0
origins: int = 0
def __add__(self, other: "ListerStats") -> "ListerStats":
def __add__(self, other: ListerStats) -> ListerStats:
return self.__class__(self.pages + other.pages, self.origins + other.origins)
def __iadd__(self, other: "ListerStats"):
def __iadd__(self, other: ListerStats):
self.pages += other.pages
self.origins += other.origins
@ -65,8 +68,8 @@ class Lister(Generic[StateType, PageType]):
scheduler: the instance of the Scheduler being used to register the
origins listed by this lister
url: a URL representing this lister, e.g. the API's base URL
instance: the instance name used, in conjunction with :attr:`LISTER_NAME`, to
uniquely identify this lister instance.
instance: the instance name, to uniquely identify this lister instance,
if not provided the URL network location will be used
credentials: dictionary of credentials for all listers. The first level
identifies the :attr:`LISTER_NAME`, the second level the lister
:attr:`instance`. The final level is a list of dicts containing the
@ -86,14 +89,17 @@ class Lister(Generic[StateType, PageType]):
self,
scheduler: SchedulerInterface,
url: str,
instance: str,
instance: Optional[str] = None,
credentials: CredentialsType = None,
):
if not self.LISTER_NAME:
raise ValueError("Must set the LISTER_NAME attribute on Lister classes")
self.url = url
self.instance = instance
if instance is not None:
self.instance = instance
else:
self.instance = urlparse(url).netloc
self.scheduler = scheduler

View file

@ -1,4 +1,4 @@
# Copyright (C) 2019-2020 the Software Heritage developers
# Copyright (C) 2019-2021 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from collections import defaultdict
@ -27,7 +27,8 @@ class PhabricatorLister(StatelessLister[PageType]):
Args:
url: base URL of a phabricator forge
(for instance https://forge.softwareheritage.org)
instance: string identifier for the listed forge
instance: string identifier for the listed forge,
URL network location will be used if not provided
api_token: authentication token for Conduit API
"""
@ -38,7 +39,7 @@ class PhabricatorLister(StatelessLister[PageType]):
self,
scheduler: SchedulerInterface,
url: str,
instance: str,
instance: Optional[str] = None,
api_token: Optional[str] = None,
credentials: CredentialsType = None,
):

View file

@ -1,4 +1,4 @@
# Copyright (C) 2020 The Software Heritage developers
# Copyright (C) 2020-2021 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
@ -39,6 +39,18 @@ def test_instantiation(swh_scheduler):
lister.run()
def test_lister_instance_name(swh_scheduler):
lister = InstantiableLister(
scheduler=swh_scheduler, url="https://example.org", instance="example"
)
assert lister.instance == "example"
lister = InstantiableLister(scheduler=swh_scheduler, url="https://example.org")
assert lister.instance == "example.org"
def test_instantiation_from_configfile(swh_scheduler, mocker):
mock_load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar")
mock_get_scheduler = mocker.patch("swh.lister.pattern.get_scheduler")

View file

@ -10,7 +10,6 @@ from urllib.parse import urljoin
import iso8601
import requests
from tenacity.before_sleep import before_sleep_log
from urllib3.util import parse_url
from swh.lister.utils import throttling_retry
from swh.scheduler.interface import SchedulerInterface
@ -51,9 +50,6 @@ class TuleapLister(StatelessLister[RepoPage]):
instance: Optional[str] = None,
credentials: CredentialsType = None,
):
if instance is None:
instance = parse_url(url).host
super().__init__(
scheduler=scheduler, credentials=credentials, url=url, instance=instance,
)