D lang lister

Add a dlang module that retrieves origins from an HTTP API endpoint.
Each origin is a git based project url on github.com, gitlab.com or
bitbucket.com.
This commit is contained in:
Franck Bret 2023-07-18 10:33:30 +02:00
parent 3ab856288c
commit 398a3d3a9d
8 changed files with 948 additions and 0 deletions

View file

@ -65,6 +65,7 @@ setup(
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
lister.dlang=swh.lister.dlang:register
lister.fedora=swh.lister.fedora:register
lister.gitea=swh.lister.gitea:register
lister.github=swh.lister.github:register

View file

@ -0,0 +1,75 @@
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
Dlang lister
=============
D is a general-purpose programming language with static typing, systems-level access,
and C-like syntax.
The `Dlang`_ lister lists origins from its package manager registry `DUB`_.
The registry provides an `http api endpoint`_ that helps in getting the packages index
with name, url, versions and dates.
As of July 2023 `DUB`_ lists 2364 package names.
Origins retrieving strategy
---------------------------
To build a list of origins we make a GET request to an `http api endpoint`_ that returns
a JSON array of objects.
The origin url for each package is constructed from the information in its
`repository` entry, which describes a Git based project hosted on GitHub, GitLab or
Bitbucket.
Page listing
------------
There is only one page listing all origin urls.
Origins from page
-----------------
The lister is stateless and yields all origin urls from one page. It is a list of package
urls with last update information.
Running tests
-------------
Activate the virtualenv and run from within swh-lister directory::
pytest -s -vv --log-cli-level=DEBUG swh/lister/dlang/tests
Testing with Docker
-------------------
Change directory to swh/docker then launch the docker environment::
docker compose up -d
Then schedule a dlang listing task::
docker compose exec swh-scheduler swh scheduler task add -p oneshot list-dlang
You can follow lister execution by displaying logs of swh-lister service::
docker compose logs -f swh-lister
.. _Dlang: https://dlang.org/
.. _DUB: https://code.dlang.org/
.. _http api endpoint: https://code.dlang.org/api/packages/dump
"""
def register():
    """Return the lister class and the task modules exposed by this package.

    The lister class is imported inside the function, so importing this package
    alone does not import the lister module.
    """
    from .lister import DlangLister

    tasks_module = "%s.tasks" % __name__
    registration = {}
    registration["lister"] = DlangLister
    registration["task_modules"] = [tasks_module]
    return registration

View file

@ -0,0 +1,93 @@
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import logging
from typing import Any, Dict, Iterator, List, Optional
import iso8601
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
# A page is a list of dictionaries keyed by strings.
DlangListerPage = List[Dict[str, Any]]
class DlangLister(StatelessLister[DlangListerPage]):
    """List D lang origins from the DUB packages registry.

    The packages dump is fetched with a single request. Every entry whose
    ``repository`` names a hosting service listed in ``KINDS`` is turned into
    a Git origin; other entries are logged and skipped.
    """

    LISTER_NAME = "dlang"
    VISIT_TYPE = "git"  # D lang origins url are Git repositories
    INSTANCE = "dlang"

    BASE_URL = "https://code.dlang.org"
    PACKAGES_DUMP_URL = BASE_URL + "/api/packages/dump"

    # Maps a repository "kind" from the registry to the hosting service base url.
    # NOTE(review): Bitbucket is usually served from bitbucket.org; confirm that
    # bitbucket.com is the intended origin host.
    KINDS = {
        "github": "https://github.com",
        "gitlab": "https://gitlab.com",
        "bitbucket": "https://bitbucket.com",
    }

    # Template used to build an origin url from a base url, an owner and a project.
    KIND_URL_PATTERN = "{url}/{owner}/{project}"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        """Set up the lister for the ``dlang`` instance.

        All arguments are forwarded unchanged to the parent lister; the
        instance name and the url are fixed to ``INSTANCE`` and
        ``PACKAGES_DUMP_URL``.
        """
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.PACKAGES_DUMP_URL,
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )
        # Ask the registry for a JSON response.
        self.session.headers.update({"Accept": "application/json"})

    def get_pages(self) -> Iterator[DlangListerPage]:
        """Yield the single page of the listing.

        A request is made to ``self.url`` (``PACKAGES_DUMP_URL`` is the url
        handed to the parent lister in ``__init__``) and the decoded JSON body
        is yielded as the one and only page.
        """
        response = self.http_request(self.url)
        yield response.json()

    def get_origins_from_page(self, page: DlangListerPage) -> Iterator[ListedOrigin]:
        """Iterate on the entries of one page and yield ListedOrigin instances.

        Args:
            page: entries of the packages dump; each one is read for its
                ``repository`` (``kind``, ``owner``, ``project``) and
                ``stats.updatedAt`` values.

        Yields:
            One Git origin per entry whose repository kind is in ``KINDS``.
            Entries with another kind are logged as errors and skipped.
        """
        assert self.lister_obj.id is not None

        for entry in page:
            repo: Dict[str, Any] = entry["repository"]
            kind: str = repo["kind"]

            if kind not in self.KINDS:
                # Use the module logger (not the root logger) and let logging
                # format the message lazily.
                logger.error("Can not build a repository url with %r", repo)
                continue

            repo_url = self.KIND_URL_PATTERN.format(
                url=self.KINDS[kind], owner=repo["owner"], project=repo["project"]
            )

            last_update = iso8601.parse_date(entry["stats"]["updatedAt"])

            yield ListedOrigin(
                lister_id=self.lister_obj.id,
                visit_type=self.VISIT_TYPE,
                url=repo_url,
                last_update=last_update,
            )

19
swh/lister/dlang/tasks.py Normal file
View file

@ -0,0 +1,19 @@
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from swh.lister.dlang.lister import DlangLister
@shared_task(name=__name__ + ".DlangListerTask")
def list_dlang(**lister_args):
    """Celery task listing the D lang packages registry.

    The keyword arguments are handed to ``DlangLister.from_configfile``; the
    stats returned by the run are converted with ``dict()`` and returned.
    """
    lister = DlangLister.from_configfile(**lister_args)
    stats = lister.run()
    return stats.dict()
@shared_task(name=__name__ + ".ping")
def _ping():
    """Celery task that always answers with the string ``OK``."""
    reply = "OK"
    return reply

View file

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1,41 @@
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import iso8601
from swh.lister.dlang.lister import DlangLister
# Origins (url and last update timestamp) that the lister test below expects
# to find recorded in the scheduler after a run.
expected_origins = [
{
"url": "https://github.com/katyukha/TheProcess",
"last_update": "2023-07-12T14:42:46.231Z",
},
{
"url": "https://gitlab.com/AntonMeep/silly",
"last_update": "2023-07-12T01:32:31.235Z",
},
]
def test_dlang_lister(datadir, requests_mock_datadir, swh_scheduler):
    """Run the lister and check its stats and the origins stored in the scheduler."""
    dlang_lister = DlangLister(scheduler=swh_scheduler)
    stats = dlang_lister.run()

    assert stats.pages == 1
    assert stats.origins == 1 + 1

    listed = swh_scheduler.get_listed_origins(dlang_lister.lister_obj.id).results
    assert len(listed) == len(expected_origins)

    # Compare as sets: the order in which origins are returned is irrelevant.
    actual = {(origin.visit_type, origin.url, origin.last_update) for origin in listed}
    wanted = {
        ("git", item["url"], iso8601.parse_date(item["last_update"]))
        for item in expected_origins
    }
    assert actual == wanted

View file

@ -0,0 +1,31 @@
# Copyright (C) 2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_dlang_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Send the ping task and check that it completes with ``OK``."""
    async_result = swh_scheduler_celery_app.send_task("swh.lister.dlang.tasks.ping")
    assert async_result

    async_result.wait()
    assert async_result.successful()
    assert async_result.result == "OK"
def test_dlang_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Send the lister task with a mocked lister and check the returned stats."""
    expected_stats = ListerStats(pages=42, origins=42)

    # Patch the lister class used by the tasks module; the mock acts both as
    # the class and as the instance returned by ``from_configfile``.
    mocked_lister = mocker.patch("swh.lister.dlang.tasks.DlangLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.dlang.tasks.DlangListerTask"
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()