From 6c123508634c6467b2fb9be83de69e6a5e77363a Mon Sep 17 00:00:00 2001 From: Antoine Lambert Date: Tue, 13 Jul 2021 12:33:41 +0200 Subject: [PATCH] pattern: Use URL network location as instance name when not provided Make the instance parameter of the base pattern lister optional and set lister name to URL network location when not provided. It simplifies lister creation when associated forge type have a lot of instances in the wild (e.g. gitlab or cgit) while giving more details about the listed forge instance. Also process listers for forge with multiple instances (cgit, gitea, gitlab, phabricator and tuleap) to ensure URL network location will be used when instance parameter is not provided. Related to T3403 --- swh/lister/cgit/lister.py | 6 +----- swh/lister/gitea/lister.py | 4 ---- swh/lister/gitlab/lister.py | 6 ++---- swh/lister/pattern.py | 20 +++++++++++++------- swh/lister/phabricator/lister.py | 7 ++++--- swh/lister/tests/test_pattern.py | 14 +++++++++++++- swh/lister/tuleap/lister.py | 4 ---- 7 files changed, 33 insertions(+), 28 deletions(-) diff --git a/swh/lister/cgit/lister.py b/swh/lister/cgit/lister.py index 9a19cd5..c513884 100644 --- a/swh/lister/cgit/lister.py +++ b/swh/lister/cgit/lister.py @@ -58,16 +58,12 @@ class CGitLister(StatelessLister[Repositories]): Args: url: main URL of the CGit instance, i.e. url of the index of published git repositories on this instance. - instance: Name of cgit instance. Defaults to url's hostname + instance: Name of cgit instance. Defaults to url's network location if unset. base_git_url: Optional base git url which allows the origin url computations. """ - if not instance: - instance = urlparse(url).hostname - assert instance is not None # Make mypy happy - super().__init__( scheduler=scheduler, url=url, instance=instance, credentials=credentials, ) diff --git a/swh/lister/gitea/lister.py b/swh/lister/gitea/lister.py index ebb7b6e..19ca4aa 100644 --- a/swh/lister/gitea/lister.py +++ b/swh/lister/gitea/lister.py @@ -11,7 +11,6 @@ from urllib.parse import urljoin import iso8601 import requests from tenacity.before_sleep import before_sleep_log -from urllib3.util import parse_url from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface @@ -47,9 +46,6 @@ class GiteaLister(StatelessLister[RepoListPage]): page_size: int = 50, credentials: CredentialsType = None, ): - if instance is None: - instance = parse_url(url).host - super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, ) diff --git a/swh/lister/gitlab/lister.py b/swh/lister/gitlab/lister.py index 2528e25..7adf73b 100644 --- a/swh/lister/gitlab/lister.py +++ b/swh/lister/gitlab/lister.py @@ -14,7 +14,6 @@ import requests from requests.exceptions import HTTPError from requests.status_codes import codes from tenacity.before_sleep import before_sleep_log -from urllib3.util import parse_url from swh.lister import USER_AGENT from swh.lister.pattern import CredentialsType, Lister @@ -88,7 +87,8 @@ class GitLabLister(Lister[GitLabListerState, PageResult]): scheduler: a scheduler instance url: the api v4 url of the gitlab instance to visit (e.g. https://gitlab.com/api/v4/) - instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...) + instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...), + url network location will be used if not provided incremental: defines if incremental listing is activated or not """ @@ -103,8 +103,6 @@ class GitLabLister(Lister[GitLabListerState, PageResult]): credentials: Optional[CredentialsType] = None, incremental: bool = False, ): - if instance is None: - instance = parse_url(url).host super().__init__( scheduler=scheduler, url=url.rstrip("/"), diff --git a/swh/lister/pattern.py b/swh/lister/pattern.py index 6f077e4..63a2fff 100644 --- a/swh/lister/pattern.py +++ b/swh/lister/pattern.py @@ -1,10 +1,13 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information +from __future__ import annotations + from dataclasses import dataclass from typing import Any, Dict, Generic, Iterable, Iterator, List, Optional, TypeVar +from urllib.parse import urlparse from swh.core.config import load_from_envvar from swh.core.utils import grouper @@ -17,10 +20,10 @@ class ListerStats: pages: int = 0 origins: int = 0 - def __add__(self, other: "ListerStats") -> "ListerStats": + def __add__(self, other: ListerStats) -> ListerStats: return self.__class__(self.pages + other.pages, self.origins + other.origins) - def __iadd__(self, other: "ListerStats"): + def __iadd__(self, other: ListerStats): self.pages += other.pages self.origins += other.origins @@ -65,8 +68,8 @@ class Lister(Generic[StateType, PageType]): scheduler: the instance of the Scheduler being used to register the origins listed by this lister url: a URL representing this lister, e.g. the API's base URL - instance: the instance name used, in conjunction with :attr:`LISTER_NAME`, to - uniquely identify this lister instance. + instance: the instance name, to uniquely identify this lister instance, + if not provided the URL network location will be used credentials: dictionary of credentials for all listers. The first level identifies the :attr:`LISTER_NAME`, the second level the lister :attr:`instance`. The final level is a list of dicts containing the @@ -86,14 +89,17 @@ class Lister(Generic[StateType, PageType]): self, scheduler: SchedulerInterface, url: str, - instance: str, + instance: Optional[str] = None, credentials: CredentialsType = None, ): if not self.LISTER_NAME: raise ValueError("Must set the LISTER_NAME attribute on Lister classes") self.url = url - self.instance = instance + if instance is not None: + self.instance = instance + else: + self.instance = urlparse(url).netloc self.scheduler = scheduler diff --git a/swh/lister/phabricator/lister.py b/swh/lister/phabricator/lister.py index 193239c..1dbee37 100644 --- a/swh/lister/phabricator/lister.py +++ b/swh/lister/phabricator/lister.py @@ -1,4 +1,4 @@ -# Copyright (C) 2019-2020 the Software Heritage developers +# Copyright (C) 2019-2021 the Software Heritage developers # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information from collections import defaultdict @@ -27,7 +27,8 @@ class PhabricatorLister(StatelessLister[PageType]): Args: url: base URL of a phabricator forge (for instance https://forge.softwareheritage.org) - instance: string identifier for the listed forge + instance: string identifier for the listed forge, + URL network location will be used if not provided api_token: authentication token for Conduit API """ @@ -38,7 +39,7 @@ class PhabricatorLister(StatelessLister[PageType]): self, scheduler: SchedulerInterface, url: str, - instance: str, + instance: Optional[str] = None, api_token: Optional[str] = None, credentials: CredentialsType = None, ): diff --git a/swh/lister/tests/test_pattern.py b/swh/lister/tests/test_pattern.py index 2ff15e9..795b715 100644 --- a/swh/lister/tests/test_pattern.py +++ b/swh/lister/tests/test_pattern.py @@ -1,4 +1,4 @@ -# Copyright (C) 2020 The Software Heritage developers +# Copyright (C) 2020-2021 The Software Heritage developers # See the AUTHORS file at the top-level directory of this distribution # License: GNU General Public License version 3, or any later version # See top-level LICENSE file for more information @@ -39,6 +39,18 @@ def test_instantiation(swh_scheduler): lister.run() +def test_lister_instance_name(swh_scheduler): + lister = InstantiableLister( + scheduler=swh_scheduler, url="https://example.org", instance="example" + ) + + assert lister.instance == "example" + + lister = InstantiableLister(scheduler=swh_scheduler, url="https://example.org") + + assert lister.instance == "example.org" + + def test_instantiation_from_configfile(swh_scheduler, mocker): mock_load_from_envvar = mocker.patch("swh.lister.pattern.load_from_envvar") mock_get_scheduler = mocker.patch("swh.lister.pattern.get_scheduler") diff --git a/swh/lister/tuleap/lister.py b/swh/lister/tuleap/lister.py index 6145359..b630508 100644 --- a/swh/lister/tuleap/lister.py +++ b/swh/lister/tuleap/lister.py @@ -10,7 +10,6 @@ from urllib.parse import urljoin import iso8601 import requests from tenacity.before_sleep import before_sleep_log -from urllib3.util import parse_url from swh.lister.utils import throttling_retry from swh.scheduler.interface import SchedulerInterface @@ -51,9 +50,6 @@ class TuleapLister(StatelessLister[RepoPage]): instance: Optional[str] = None, credentials: CredentialsType = None, ): - if instance is None: - instance = parse_url(url).host - super().__init__( scheduler=scheduler, credentials=credentials, url=url, instance=instance, )