Arch User Repository (AUR) lister

Add 'aur' module to swh-lister with data fixtures and tests.
For now, origin URLs are the packages' VCS (Git) URLs.
This commit is contained in:
Franck Bret 2022-06-24 12:19:15 +02:00
parent 6a53a6ad06
commit 97b353bf0b
9 changed files with 518 additions and 0 deletions

View file

@ -56,6 +56,7 @@ setup(
lister=swh.lister.cli
[swh.workers]
lister.arch=swh.lister.arch:register
lister.aur=swh.lister.aur:register
lister.bitbucket=swh.lister.bitbucket:register
lister.cgit=swh.lister.cgit:register
lister.cran=swh.lister.cran:register

135
swh/lister/aur/__init__.py Normal file
View file

@ -0,0 +1,135 @@
# Copyright (C) 2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
"""
AUR (Arch User Repository) lister
=================================
The AUR lister lists origins from `aur.archlinux.org`_, the Arch User Repository.
For each package there is a git repository; we use the git url as the origin and the
snapshot url as the artifact for the loader to download.
Each git repository consists of a directory (whose name corresponds to the package name),
and at least two files, .SRCINFO and PKGBUILD, which are recipes for building the package.
Each package has a single version, the latest one. There are no archives of previous versions,
so the lister will always list one version per package.
As of August 2022 `aur.archlinux.org`_ lists 84438 packages. Please note that this number
is the total of `regular`_ and `split`_ packages.
We will archive `regular` and `split` packages but only their `pkgbase` because that is
the only one that actually has source code.
The number of packages is 78554 after removing the split ones.
Origins retrieving strategy
---------------------------
An rpc api exists but it is recommended to save bandwidth so it's not used. See
`New AUR Metadata Archives`_ for more on this topic.
To get an index of all AUR existing packages we download a `packages-meta-v1.json.gz`_
which contains a json file listing all existing packages definitions.
Each entry describes the latest released version of a package. The origin url
for a package is built using `pkgbase` and corresponds to a git repository.
Note that we list only standard packages (when pkgbase equals pkgname), not the ones
belonging to split packages.
It takes only a couple of minutes to download the 7 MB index archive and parse its
content.
Page listing
------------
Each page is related to one package. As it's not possible to get all previous
versions, it will always return one line.
Each page corresponds to a package with a `version`, an `url` for a Git
repository, a `project_url` which represents the upstream project url and
a canonical `snapshot_url` from which a tar.gz archive of the package can
be downloaded.
The data schema for each line is:
* **pkgname**: Package name
* **version**: Package version
* **url**: Git repository url for a package
* **snapshot_url**: Package download url
* **project_url**: Upstream project url if any
* **last_modified**: ISO 8601 last update date
Origins from page
-----------------
The lister yields one origin per page.
The origin url corresponds to the git url of a package, for example ``https://aur.archlinux.org/{package}.git``.
Additionally we add some data set to "extra_loader_arguments":
* **artifacts**: Represent data about the Aur package snapshot to download,
following :ref:`original-artifacts-json specification <original-artifacts-json>`
* **aur_metadata**: To store all other interesting attributes that do not belong to artifacts.
Origin data example::
{
"visit_type": "aur",
"url": "https://aur.archlinux.org/hg-evolve.git",
"extra_loader_arguments": {
"artifacts": [
{
"filename": "hg-evolve.tar.gz",
"url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950
"version": "10.5.1-1",
}
],
"aur_metadata": [
{
"version": "10.5.1-1",
"project_url": "https://www.mercurial-scm.org/doc/evolution/",
"last_update": "2022-04-27T20:02:56+00:00",
"pkgname": "hg-evolve",
}
],
},
Running tests
-------------
Activate the virtualenv and run from within swh-lister directory::
pytest -s -vv --log-cli-level=DEBUG swh/lister/aur/tests
Testing with Docker
-------------------
Change directory to swh/docker then launch the docker environment::
docker-compose up -d
Then connect to the lister::
docker exec -it docker_swh-lister_1 bash
And run the lister (The output of this listing results in oneshot tasks in the scheduler)::
swh lister run -l aur
.. _aur.archlinux.org: https://aur.archlinux.org
.. _New AUR Metadata Archives: https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html
.. _packages-meta-v1.json.gz: https://aur.archlinux.org/packages-meta-v1.json.gz
.. _regular: https://wiki.archlinux.org/title/PKGBUILD#Package_name
.. _split: https://man.archlinux.org/man/PKGBUILD.5#PACKAGE_SPLITTING
"""
def register():
    """Describe the AUR lister for registration.

    This is the callable referenced by the ``lister.aur`` entry of the
    ``swh.workers`` entry-point group in setup.py.

    Returns:
        a dict with the lister class under ``"lister"`` and the list of task
        module names under ``"task_modules"``.
    """
    # Imported lazily so that importing this package does not pull in the
    # lister module until registration is actually requested.
    from .lister import AurLister

    task_modules = [__name__ + ".tasks"]
    return {"lister": AurLister, "task_modules": task_modules}

174
swh/lister/aur/lister.py Normal file
View file

@ -0,0 +1,174 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
import datetime
import gzip
import json
import logging
from pathlib import Path
import shutil
from typing import Any, Dict, Iterator, Optional
import requests
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import CredentialsType, StatelessLister
logger = logging.getLogger(__name__)
# Aliasing the page results returned by `get_pages` method from the lister.
AurListerPage = Dict[str, Any]
class AurLister(StatelessLister[AurListerPage]):
    """List Arch User Repository (AUR) origins.

    Starting from the base url 'https://aur.archlinux.org', download a
    'packages-meta-v1.json.gz' archive which contains a json file listing all
    existing packages definitions.

    Each entry describes the latest released version of a package. The origin url
    for a package is built using 'pkgname' and corresponds to a git repository.

    An rpc api exists but it is recommended to save bandwidth so it's not used. See
    https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html
    for more on this.
    """

    LISTER_NAME = "aur"
    VISIT_TYPE = "aur"
    INSTANCE = "aur"

    BASE_URL = "https://aur.archlinux.org"
    DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz"
    PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git"
    PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz"

    # Directory the index archive is downloaded to; removed by `finalize`.
    DESTINATION_PATH = Path("/tmp/aur_archive")

    def __init__(
        self,
        scheduler: SchedulerInterface,
        credentials: Optional[CredentialsType] = None,
    ):
        super().__init__(
            scheduler=scheduler,
            credentials=credentials,
            instance=self.INSTANCE,
            url=self.BASE_URL,
        )

    def download_index_archive(self) -> Path:
        """Download the packages index archive into self.DESTINATION_PATH.

        The url is built from the self.DEFAULT_PACKAGES_INDEX_URL format string
        and self.url.

        Returns:
            the Path of the downloaded archive file.

        Raises:
            requests.HTTPError: if the server answers with an error status.
        """
        url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url)
        filename = url.split("/")[-1]

        # exist_ok: a directory that already exists (e.g. left over by an
        # interrupted run) must not abort the download.
        self.DESTINATION_PATH.mkdir(parents=True, exist_ok=True)
        destination = self.DESTINATION_PATH / filename

        with requests.get(url, stream=True) as response:
            # Fail here rather than saving an error page as the archive and
            # failing later, less clearly, when trying to gunzip it.
            response.raise_for_status()
            # Copy the body to disk in chunks instead of loading it whole.
            with destination.open("wb") as archive:
                shutil.copyfileobj(response.raw, archive)

        return destination

    def get_pages(self) -> Iterator[AurListerPage]:
        """Yield one 'page' per listed package.

        Each page corresponds to a package with a 'version', an 'url' for a Git
        repository, a 'project_url' which represents the upstream project url and
        a canonical 'snapshot_url' from which a tar.gz archive of the package can
        be downloaded.

        Raises:
            ValueError: if the downloaded index contains no package.
        """
        index = self.download_index_archive()

        with gzip.open(index, "rb") as f:
            packages = json.load(f)

        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if not packages:
            raise ValueError("AUR packages index %s is empty" % index)

        counter: int = 0
        for package in packages:
            # Exclude lines where Name differs from PackageBase as they represent
            # split packages and they don't have resolvable snapshot urls
            if package["Name"] != package["PackageBase"]:
                continue

            pkgname = package["PackageBase"]
            # LastModified is an epoch timestamp; render it as a UTC ISO 8601 string
            last_modified = datetime.datetime.fromtimestamp(
                float(package["LastModified"]), tz=datetime.timezone.utc
            ).isoformat()
            counter += 1
            yield {
                "pkgname": pkgname,
                "version": package["Version"],
                "url": self.PACKAGE_VCS_URL_PATTERN.format(
                    base_url=self.BASE_URL, pkgname=pkgname
                ),
                "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format(
                    base_url=self.BASE_URL, pkgname=pkgname
                ),
                "project_url": package["URL"],
                "last_modified": last_modified,
            }
        logger.debug("Found %s AUR packages in aur_index", counter)

    def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]:
        """Yield the ListedOrigin built from one page (i.e. one package).

        It uses the vcs (Git) url as an origin and adds `artifacts` and `aur_metadata`
        entries to 'extra_loader_arguments'.

        `artifacts` describes the file to download and `aur_metadata` stores some
        metadata that can be useful for the loader.
        """
        assert self.lister_obj.id is not None

        url = origin["url"]
        # 'last_modified' is the ISO 8601 string produced by `get_pages`
        last_update = datetime.datetime.fromisoformat(origin["last_modified"])
        filename = origin["snapshot_url"].split("/")[-1]

        artifacts = [
            {
                "filename": filename,
                "url": origin["snapshot_url"],
                "version": origin["version"],
            }
        ]
        aur_metadata = [
            {
                "version": origin["version"],
                "project_url": origin["project_url"],
                "last_update": origin["last_modified"],
                "pkgname": origin["pkgname"],
            }
        ]

        yield ListedOrigin(
            lister_id=self.lister_obj.id,
            visit_type=self.VISIT_TYPE,
            url=url,
            last_update=last_update,
            extra_loader_arguments={
                "artifacts": artifacts,
                "aur_metadata": aur_metadata,
            },
        )

    def finalize(self) -> None:
        """Remove the download directory, if it exists."""
        if self.DESTINATION_PATH.exists():
            shutil.rmtree(self.DESTINATION_PATH)
            logger.debug(
                "Successfully removed %s directory", str(self.DESTINATION_PATH)
            )

19
swh/lister/aur/tasks.py Normal file
View file

@ -0,0 +1,19 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from celery import shared_task
from swh.lister.aur.lister import AurLister
@shared_task(name=f"{__name__}.AurListerTask")
def list_aur(**lister_args):
    """Lister task for Arch User Repository (AUR)

    Builds an AurLister from the configuration file (forwarding the keyword
    arguments), runs it and returns the run result converted with `.dict()`.
    """
    aur_lister = AurLister.from_configfile(**lister_args)
    run_result = aur_lister.run()
    return run_result.dict()
@shared_task(name=f"{__name__}.ping")
def _ping():
    """Trivial task that always answers with the string "OK"."""
    answer = "OK"
    return answer

View file

View file

@ -0,0 +1,27 @@
#!/usr/bin/env bash

# Script to generate packages-meta-v1.json.gz
# files and fake http responses for https_aur.archlinux.org
# For tests purposes only

set -euo pipefail

# files and directories
# -p so that re-running the script over an existing fixture directory does
# not abort under `set -e`
mkdir -p https_aur.archlinux.org
mkdir -p tmp_dir/archives/

# Clean up the scratch directory on exit, even when a later step fails.
# The script never changes directory, so the relative path stays valid.
trap 'rm -rf tmp_dir/' EXIT

# Quoted heredoc: the JSON is written verbatim, without shell expansion or
# backslash interpretation
cat > tmp_dir/archives/packages-meta-v1.json <<'EOF'
[
{"ID":787300,"Name":"tealdeer-git","PackageBaseID":110159,"PackageBase":"tealdeer-git","Version":"r255.30b7c5f-1","Description":"A fast tldr client in Rust.","URL":"https://github.com/dbrgn/tealdeer","NumVotes":11,"Popularity":0.009683,"OutOfDate":null,"Maintainer":"dbrgn","FirstSubmitted":1460795753,"LastModified":1599251812,"URLPath":"/cgit/aur.git/snapshot/tealdeer-git.tar.gz"},
{"ID":860370,"Name":"ibus-git","PackageBaseID":163059,"PackageBase":"ibus-git","Version":"1.5.23+12+gef4c5c7e-1","Description":"Next Generation Input Bus for Linux","URL":"https://github.com/ibus/ibus/wiki","NumVotes":1,"Popularity":0.989573,"OutOfDate":null,"Maintainer":"tallero","FirstSubmitted":1612764731,"LastModified":1612764731,"URLPath":"/cgit/aur.git/snapshot/ibus-git.tar.gz"},
{"ID":1043337,"Name":"libervia-web-hg","PackageBaseID":170485,"PackageBase":"libervia-web-hg","Version":"0.9.0.r1492.3a34d78f2717-1","Description":"Salut à Toi, multi-frontends multi-purposes XMPP client (Web interface)","URL":"http://salut-a-toi.org/","NumVotes":0,"Popularity":0.0,"OutOfDate":null,"Maintainer":"jnanar","FirstSubmitted":1630224837,"LastModified":1645889458,"URLPath":"/cgit/aur.git/snapshot/libervia-web-hg.tar.gz"},
{"ID":1072642,"Name":"hg-evolve","PackageBaseID":135047,"PackageBase":"hg-evolve","Version":"10.5.1-1","Description":"Flexible evolution of Mercurial history","URL":"https://www.mercurial-scm.org/doc/evolution/","NumVotes":6,"Popularity":0.003887,"OutOfDate":null,"Maintainer":"damien-43","FirstSubmitted":1534190432,"LastModified":1651089776,"URLPath":"/cgit/aur.git/snapshot/hg-evolve.tar.gz"}
]
EOF

# Gzip archive
# -n keeps the original file name and timestamp out of the gzip header, so
# regenerating the fixture yields the same bytes
gzip -n -c tmp_dir/archives/packages-meta-v1.json > https_aur.archlinux.org/packages-meta-v1.json.gz

View file

@ -0,0 +1,131 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.aur.lister import AurLister
# Origins expected from the four packages of the packages-meta-v1.json.gz
# test index. Each "last_update" is the UTC ISO 8601 rendering of the
# package's "LastModified" epoch timestamp in that index.
expected_origins = [
    {
        "visit_type": "aur",
        "url": "https://aur.archlinux.org/hg-evolve.git",
        "extra_loader_arguments": {
            "artifacts": [
                {
                    "filename": "hg-evolve.tar.gz",
                    "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz",  # noqa: B950
                    "version": "10.5.1-1",
                }
            ],
            "aur_metadata": [
                {
                    "version": "10.5.1-1",
                    "project_url": "https://www.mercurial-scm.org/doc/evolution/",
                    "last_update": "2022-04-27T20:02:56+00:00",
                    "pkgname": "hg-evolve",
                }
            ],
        },
    },
    {
        "visit_type": "aur",
        "url": "https://aur.archlinux.org/ibus-git.git",
        "extra_loader_arguments": {
            "artifacts": [
                {
                    "filename": "ibus-git.tar.gz",
                    "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/ibus-git.tar.gz",  # noqa: B950
                    "version": "1.5.23+12+gef4c5c7e-1",
                }
            ],
            "aur_metadata": [
                {
                    "version": "1.5.23+12+gef4c5c7e-1",
                    "project_url": "https://github.com/ibus/ibus/wiki",
                    "last_update": "2021-02-08T06:12:11+00:00",
                    "pkgname": "ibus-git",
                }
            ],
        },
    },
    {
        "visit_type": "aur",
        "url": "https://aur.archlinux.org/libervia-web-hg.git",
        "extra_loader_arguments": {
            "artifacts": [
                {
                    "filename": "libervia-web-hg.tar.gz",
                    "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/libervia-web-hg.tar.gz",  # noqa: B950
                    "version": "0.9.0.r1492.3a34d78f2717-1",
                }
            ],
            "aur_metadata": [
                {
                    "version": "0.9.0.r1492.3a34d78f2717-1",
                    "project_url": "http://salut-a-toi.org/",
                    "last_update": "2022-02-26T15:30:58+00:00",
                    "pkgname": "libervia-web-hg",
                }
            ],
        },
    },
    {
        "visit_type": "aur",
        "url": "https://aur.archlinux.org/tealdeer-git.git",
        "extra_loader_arguments": {
            "artifacts": [
                {
                    "filename": "tealdeer-git.tar.gz",
                    "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/tealdeer-git.tar.gz",  # noqa: B950
                    "version": "r255.30b7c5f-1",
                }
            ],
            "aur_metadata": [
                {
                    "version": "r255.30b7c5f-1",
                    "project_url": "https://github.com/dbrgn/tealdeer",
                    "last_update": "2020-09-04T20:36:52+00:00",
                    "pkgname": "tealdeer-git",
                }
            ],
        },
    },
]
def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler):
    """Run the lister against the mocked index and check the recorded origins.

    Every origin stored in the scheduler must match `expected_origins` on
    visit type, url and the full `extra_loader_arguments` (both `artifacts`
    and `aur_metadata`).
    """
    lister = AurLister(scheduler=swh_scheduler)
    res = lister.run()

    assert res.pages == 4
    assert res.origins == 4

    # Sort both sides by url so the comparison does not depend on listing order
    scheduler_origins_sorted = sorted(
        swh_scheduler.get_listed_origins(lister.lister_obj.id).results,
        key=lambda x: x.url,
    )
    expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url"))

    assert len(scheduler_origins_sorted) == len(expected_origins_sorted)

    # Compare the whole extra_loader_arguments, so that the expected
    # `aur_metadata` entries are checked as well as the `artifacts` ones.
    assert [
        (
            scheduled.visit_type,
            scheduled.url,
            scheduled.extra_loader_arguments,
        )
        for scheduled in scheduler_origins_sorted
    ] == [
        (
            "aur",
            expected.get("url"),
            expected.get("extra_loader_arguments"),
        )
        for expected in expected_origins_sorted
    ]
def test_aur_lister_directory_cleanup(datadir, requests_mock_datadir, swh_scheduler):
    """A lister run must not leave its download directory behind."""
    aur_lister = AurLister(scheduler=swh_scheduler)
    aur_lister.run()

    # The download directory should not exist once the lister has run
    directory_left_behind = aur_lister.DESTINATION_PATH.exists()
    assert not directory_left_behind

View file

@ -0,0 +1,31 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
def test_aur_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """The ping task must complete successfully and answer "OK"."""
    task_name = "swh.lister.aur.tasks.ping"
    ping = swh_scheduler_celery_app.send_task(task_name)
    assert ping

    # Block until the worker has processed the task
    ping.wait()

    assert ping.successful()
    assert ping.result == "OK"
def test_aur_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """The lister task must build the lister, run it and return its stats."""
    expected_stats = ListerStats(pages=42, origins=42)

    # Replace AurLister as seen from the tasks module; `from_configfile`
    # hands back the same mock so that `run` can be observed on it.
    mocked_lister = mocker.patch("swh.lister.aur.tasks.AurLister")
    mocked_lister.from_configfile.return_value = mocked_lister
    mocked_lister.run.return_value = expected_stats

    task = swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.AurListerTask")
    assert task
    task.wait()
    assert task.successful()
    assert task.result == expected_stats.dict()

    mocked_lister.from_configfile.assert_called_once_with()
    mocked_lister.run.assert_called_once_with()