gitlab: allow ignoring projects with certain path prefixes

Some GitLab instances use specific namespaces for transient repositories
that are not worth archiving (for example, gitlab.org has a set of QA
namespaces used for integration testing of their production
deployments; Drupal has an `issues/` namespace with forks of repos that
are only used for collaboration on merge requests, and so are not very
useful to archive either).
This commit is contained in:
Nicolas Dandrimont 2022-12-05 15:36:40 +01:00
parent 64267f8f50
commit 5ea79ee3e0
2 changed files with 45 additions and 1 deletions

View file

@ -6,7 +6,7 @@
from dataclasses import asdict, dataclass
import logging
import random
from typing import Any, Dict, Iterator, Optional, Tuple
from typing import Any, Dict, Iterator, List, Optional, Tuple
from urllib.parse import parse_qs, urlencode, urlparse
import iso8601
@ -92,6 +92,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
instance: a specific instance name (e.g. gitlab, tor, git-kernel, ...),
url network location will be used if not provided
incremental: defines if incremental listing is activated or not
ignored_project_prefixes: List of prefixes of project paths to ignore
"""
@ -103,6 +104,7 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
instance: Optional[str] = None,
credentials: Optional[CredentialsType] = None,
incremental: bool = False,
ignored_project_prefixes: Optional[List[str]] = None,
):
if name is not None:
self.LISTER_NAME = name
@ -115,6 +117,9 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
self.incremental = incremental
self.last_page: Optional[str] = None
self.per_page = 100
self.ignored_project_prefixes: Optional[Tuple[str, ...]] = None
if ignored_project_prefixes:
self.ignored_project_prefixes = tuple(ignored_project_prefixes)
self.session.headers.update({"Accept": "application/json"})
@ -203,6 +208,10 @@ class GitLabLister(Lister[GitLabListerState, PageResult]):
repositories = page_result.repositories if page_result.repositories else []
for repo in repositories:
if self.ignored_project_prefixes and repo["path_with_namespace"].startswith(
self.ignored_project_prefixes
):
continue
visit_type = repo.get("vcs_type", "git")
visit_type = VCS_MAPPING.get(visit_type, visit_type)
yield ListedOrigin(

View file

@ -356,3 +356,38 @@ def test_lister_gitlab_url_computation(url, swh_scheduler):
)
def test__parse_id_after(url, expected_result):
assert _parse_id_after(url) == expected_result
def test_lister_gitlab_ignored_project_prefixes(datadir, swh_scheduler, requests_mock):
    """Projects under an ignored path prefix are left out of the listing"""
    instance = "gitlab.com"
    lister_options = {
        "url": api_url(instance),
        "instance": instance,
        "ignored_project_prefixes": ["jonan/"],
    }
    lister = GitLabLister(swh_scheduler, **lister_options)

    # Serve exactly one page of fixture data for the lister's first request.
    page_payload = gitlab_page_response(datadir, instance, 1)
    mocked_responses = [{"json": page_payload}]
    requests_mock.get(
        lister.page_url(), mocked_responses, additional_matcher=_match_request
    )

    stats = lister.run()

    # 2 origins start with jonan/ in the fixture page, so they must be dropped
    nb_kept = len(page_payload) - 2
    assert stats == ListerStats(pages=1, origins=nb_kept)

    recorded = lister.scheduler.get_listed_origins(lister.lister_obj.id).results
    assert len(recorded) == nb_kept

    # Every recorded origin belongs to the instance, and none of them lives
    # under the ignored namespace.
    instance_url = f"https://{instance}"
    ignored_url = f"https://{instance}/jonan/"
    for origin in recorded:
        assert origin.visit_type == "git"
        assert origin.url.startswith(instance_url)
        assert not origin.url.startswith(ignored_url)
        assert origin.last_update is not None