From 97b353bf0b9a726ebb3d414f809aeac26a229f21 Mon Sep 17 00:00:00 2001 From: Franck Bret Date: Fri, 24 Jun 2022 12:19:15 +0200 Subject: [PATCH] Arch User Repository (AUR) lister Add 'aur' module to swh-lister with data fixtures and tests. For now, origin url are package vcs (Git) url. --- setup.py | 1 + swh/lister/aur/__init__.py | 135 ++++++++++++++ swh/lister/aur/lister.py | 174 ++++++++++++++++++ swh/lister/aur/tasks.py | 19 ++ swh/lister/aur/tests/__init__.py | 0 .../aur/tests/data/fake_aur_packages.sh | 27 +++ .../packages-meta-v1.json.gz | Bin 0 -> 701 bytes swh/lister/aur/tests/test_lister.py | 131 +++++++++++++ swh/lister/aur/tests/test_tasks.py | 31 ++++ 9 files changed, 518 insertions(+) create mode 100644 swh/lister/aur/__init__.py create mode 100644 swh/lister/aur/lister.py create mode 100644 swh/lister/aur/tasks.py create mode 100644 swh/lister/aur/tests/__init__.py create mode 100755 swh/lister/aur/tests/data/fake_aur_packages.sh create mode 100644 swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz create mode 100644 swh/lister/aur/tests/test_lister.py create mode 100644 swh/lister/aur/tests/test_tasks.py diff --git a/setup.py b/setup.py index 5bb77b6..2340cab 100755 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ setup( lister=swh.lister.cli [swh.workers] lister.arch=swh.lister.arch:register + lister.aur=swh.lister.aur:register lister.bitbucket=swh.lister.bitbucket:register lister.cgit=swh.lister.cgit:register lister.cran=swh.lister.cran:register diff --git a/swh/lister/aur/__init__.py b/swh/lister/aur/__init__.py new file mode 100644 index 0000000..d6db8a2 --- /dev/null +++ b/swh/lister/aur/__init__.py @@ -0,0 +1,135 @@ +# Copyright (C) 2022 the Software Heritage developers +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + + +""" +AUR (Arch User Repository) lister +================================= + +The AUR lister list origins from 
`aur.archlinux.org`_, the Arch User Repository. +For each package, there is a git repository; we use the git url as origin and the +snapshot url as the artifact for the loader to download. + +Each git repository consists of a directory (whose name corresponds to the package name), +and at least two files, .SRCINFO and PKGBUILD which are recipes for building the package. + +Each package has a version, the latest one. There are no archives of previous versions, +so the lister will always list one version per package. + +As of August 2022 `aur.archlinux.org`_ lists 84438 packages. Please note that this number +is the total of `regular`_ and `split`_ packages. +We will archive `regular` and `split` packages but only their `pkgbase` because that is +the only one that actually has source code. +The number of packages is 78554 after removing the split ones. + +Origins retrieving strategy +--------------------------- + +An rpc api exists but it is recommended to save bandwidth so it's not used. See +`New AUR Metadata Archives`_ for more on this topic. + +To get an index of all existing AUR packages we download a `packages-meta-v1.json.gz`_ +which contains a json file listing all existing package definitions. + +Each entry describes the latest released version of a package. The origin url +for a package is built using `pkgbase` and corresponds to a git repository. + +Note that we list only standard packages (when pkgbase equals pkgname), not the ones +belonging to split packages. + +It takes only a couple of minutes to download the 7 MB index archive and parse its +content. + +Page listing +------------ + +Each page is related to one package. As it's not possible to get all previous +versions, it will always return one line. + +Each page corresponds to a package with a `version`, a `url` for a Git +repository, a `project_url` which represents the upstream project url and +a canonical `snapshot_url` from which a tar.gz archive of the package can +be downloaded. 
+ +The data schema for each line is: + +* **pkgname**: Package name +* **version**: Package version +* **url**: Git repository url for a package +* **snapshot_url**: Package download url +* **project_url**: Upstream project url if any +* **last_modified**: ISO 8601 last update date + +Origins from page +----------------- + +The lister yields one origin per page. +The origin url corresponds to the git url of a package, for example ``https://aur.archlinux.org/{package}.git``. + +Additionally we add some data to "extra_loader_arguments": + +* **artifacts**: Represents data about the AUR package snapshot to download, + following :ref:`original-artifacts-json specification <original-artifacts-json>` +* **aur_metadata**: To store all other interesting attributes that do not belong to artifacts. + +Origin data example:: + + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/hg-evolve.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.1-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.1-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-04-27T20:02:56+00:00", + "pkgname": "hg-evolve", + } + ], + }, + +Running tests +------------- + +Activate the virtualenv and run from within swh-lister directory:: + + pytest -s -vv --log-cli-level=DEBUG swh/lister/aur/tests + +Testing with Docker +------------------- + +Change directory to swh/docker then launch the docker environment:: + + docker-compose up -d + +Then connect to the lister:: + + docker exec -it docker_swh-lister_1 bash + +And run the lister (The output of this listing results in “oneshot” tasks in the scheduler):: + + swh lister run -l aur + +.. _aur.archlinux.org: https://aur.archlinux.org +.. _New AUR Metadata Archives: https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html +.. 
_packages-meta-v1.json.gz: https://aur.archlinux.org/packages-meta-v1.json.gz +.. _regular: https://wiki.archlinux.org/title/PKGBUILD#Package_name +.. _split: https://man.archlinux.org/man/PKGBUILD.5#PACKAGE_SPLITTING +""" + + +def register(): + from .lister import AurLister + + return { + "lister": AurLister, + "task_modules": ["%s.tasks" % __name__], + } diff --git a/swh/lister/aur/lister.py b/swh/lister/aur/lister.py new file mode 100644 index 0000000..47586ce --- /dev/null +++ b/swh/lister/aur/lister.py @@ -0,0 +1,174 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +import datetime +import gzip +import json +import logging +from pathlib import Path +import shutil +from typing import Any, Dict, Iterator, Optional + +import requests + +from swh.scheduler.interface import SchedulerInterface +from swh.scheduler.model import ListedOrigin + +from ..pattern import CredentialsType, StatelessLister + +logger = logging.getLogger(__name__) + +# Aliasing the page results returned by `get_pages` method from the lister. +AurListerPage = Dict[str, Any] + + +class AurLister(StatelessLister[AurListerPage]): + """List Arch User Repository (AUR) origins. + + Given an url (used as a base url, default is 'https://aur.archlinux.org'), + download a 'packages-meta-v1.json.gz' which contains a json file listing all + existing packages definitions. + + Each entry describes the latest released version of a package. The origin url + for a package is built using 'pkgname' and corresponds to a git repository. + + An rpc api exists but it is recommended to save bandwidth so it's not used. See + https://lists.archlinux.org/pipermail/aur-general/2021-November/036659.html + for more on this. 
+ """ + + LISTER_NAME = "aur" + VISIT_TYPE = "aur" + INSTANCE = "aur" + + BASE_URL = "https://aur.archlinux.org" + DEFAULT_PACKAGES_INDEX_URL = "{base_url}/packages-meta-v1.json.gz" + PACKAGE_VCS_URL_PATTERN = "{base_url}/{pkgname}.git" + PACKAGE_SNAPSHOT_URL_PATTERN = "{base_url}/cgit/aur.git/snapshot/{pkgname}.tar.gz" + + DESTINATION_PATH = Path("/tmp/aur_archive") + + def __init__( + self, + scheduler: SchedulerInterface, + credentials: Optional[CredentialsType] = None, + ): + super().__init__( + scheduler=scheduler, + credentials=credentials, + instance=self.INSTANCE, + url=self.BASE_URL, + ) + + def download_index_archive(self) -> Path: + """Build an url based on self.DEFAULT_PACKAGES_INDEX_URL format string, + and download the archive to self.DESTINATION_PATH + + Returns: + a directory Path where the archive has been downloaded to. + """ + url = self.DEFAULT_PACKAGES_INDEX_URL.format(base_url=self.url) + filename = url.split("/")[-1] + destination = Path(self.DESTINATION_PATH, filename) + + if not Path(self.DESTINATION_PATH).exists(): + Path(self.DESTINATION_PATH).mkdir() + + response = requests.get(url, stream=True) + destination.write_bytes(response.raw.read()) + assert destination.exists() + + return destination + + def get_pages(self) -> Iterator[AurListerPage]: + """Yield an iterator which returns 'page' + + Each page corresponds to a package with a 'version', an 'url' for a Git + repository, a 'project_url' which represents the upstream project url and + a canonical 'snapshot_url' from which a tar.gz archive of the package can + be downloaded. 
+ """ + index = self.download_index_archive() + + with gzip.open(index, "rb") as f: + assert f.readable() + file_content = f.read() + packages = json.loads(file_content) + + assert packages + + counter: int = 0 + for package in packages: + # Exclude lines where Name differs from PackageBase as they represents + # split package and they don't have resolvable snapshots url + if package["Name"] == package["PackageBase"]: + pkgname = package["PackageBase"] + version = package["Version"] + project_url = package["URL"] + last_modified = datetime.datetime.fromtimestamp( + float(package["LastModified"]), tz=datetime.timezone.utc + ).isoformat() + counter += 1 + yield { + "pkgname": pkgname, + "version": version, + "url": self.PACKAGE_VCS_URL_PATTERN.format( + base_url=self.BASE_URL, pkgname=pkgname + ), + "snapshot_url": self.PACKAGE_SNAPSHOT_URL_PATTERN.format( + base_url=self.BASE_URL, pkgname=pkgname + ), + "project_url": project_url, + "last_modified": last_modified, + } + logger.debug("Found %s AUR packages in aur_index", counter) + + def get_origins_from_page(self, origin: AurListerPage) -> Iterator[ListedOrigin]: + """Iterate on all pages and yield ListedOrigin instances. + It uses the vcs (Git) url as an origin and adds `artifacts` and `aur_metadata` + entries to 'extra_loader_arguments'. + + `artifacts` describe the file to download and `aur_metadata` store some + metadata that can be useful for the loader. 
+ """ + assert self.lister_obj.id is not None + + url = origin["url"] + last_update = datetime.datetime.fromisoformat(origin["last_modified"]) + filename = origin["snapshot_url"].split("/")[-1] + + artifacts = [ + { + "filename": filename, + "url": origin["snapshot_url"], + "version": origin["version"], + } + ] + aur_metadata = [ + { + "version": origin["version"], + "project_url": origin["project_url"], + "last_update": origin["last_modified"], + "pkgname": origin["pkgname"], + } + ] + + yield ListedOrigin( + lister_id=self.lister_obj.id, + visit_type=self.VISIT_TYPE, + url=url, + last_update=last_update, + extra_loader_arguments={ + "artifacts": artifacts, + "aur_metadata": aur_metadata, + }, + ) + + def finalize(self) -> None: + # Cleanup by removing the repository directory + if self.DESTINATION_PATH.exists(): + shutil.rmtree(self.DESTINATION_PATH) + logger.debug( + "Successfully removed %s directory", str(self.DESTINATION_PATH) + ) diff --git a/swh/lister/aur/tasks.py b/swh/lister/aur/tasks.py new file mode 100644 index 0000000..52de9db --- /dev/null +++ b/swh/lister/aur/tasks.py @@ -0,0 +1,19 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from celery import shared_task + +from swh.lister.aur.lister import AurLister + + +@shared_task(name=__name__ + ".AurListerTask") +def list_aur(**lister_args): + """Lister task for Arch User Repository (AUR)""" + return AurLister.from_configfile(**lister_args).run().dict() + + +@shared_task(name=__name__ + ".ping") +def _ping(): + return "OK" diff --git a/swh/lister/aur/tests/__init__.py b/swh/lister/aur/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/swh/lister/aur/tests/data/fake_aur_packages.sh b/swh/lister/aur/tests/data/fake_aur_packages.sh new file mode 100755 index 0000000..26ad1e3 --- 
/dev/null +++ b/swh/lister/aur/tests/data/fake_aur_packages.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# Script to generate packages-meta-v1.json.gz +# files and fake http responses for https_aur.archlinux.org +# For tests purposes only + +set -euo pipefail + +# files and directories +mkdir https_aur.archlinux.org + +mkdir -p tmp_dir/archives/ +cd tmp_dir/archives/ + +echo -e '''[ +{"ID":787300,"Name":"tealdeer-git","PackageBaseID":110159,"PackageBase":"tealdeer-git","Version":"r255.30b7c5f-1","Description":"A fast tldr client in Rust.","URL":"https://github.com/dbrgn/tealdeer","NumVotes":11,"Popularity":0.009683,"OutOfDate":null,"Maintainer":"dbrgn","FirstSubmitted":1460795753,"LastModified":1599251812,"URLPath":"/cgit/aur.git/snapshot/tealdeer-git.tar.gz"}, +{"ID":860370,"Name":"ibus-git","PackageBaseID":163059,"PackageBase":"ibus-git","Version":"1.5.23+12+gef4c5c7e-1","Description":"Next Generation Input Bus for Linux","URL":"https://github.com/ibus/ibus/wiki","NumVotes":1,"Popularity":0.989573,"OutOfDate":null,"Maintainer":"tallero","FirstSubmitted":1612764731,"LastModified":1612764731,"URLPath":"/cgit/aur.git/snapshot/ibus-git.tar.gz"}, +{"ID":1043337,"Name":"libervia-web-hg","PackageBaseID":170485,"PackageBase":"libervia-web-hg","Version":"0.9.0.r1492.3a34d78f2717-1","Description":"Salut à Toi, multi-frontends multi-purposes XMPP client (Web interface)","URL":"http://salut-a-toi.org/","NumVotes":0,"Popularity":0.0,"OutOfDate":null,"Maintainer":"jnanar","FirstSubmitted":1630224837,"LastModified":1645889458,"URLPath":"/cgit/aur.git/snapshot/libervia-web-hg.tar.gz"}, +{"ID":1072642,"Name":"hg-evolve","PackageBaseID":135047,"PackageBase":"hg-evolve","Version":"10.5.1-1","Description":"Flexible evolution of Mercurial history","URL":"https://www.mercurial-scm.org/doc/evolution/","NumVotes":6,"Popularity":0.003887,"OutOfDate":null,"Maintainer":"damien-43","FirstSubmitted":1534190432,"LastModified":1651089776,"URLPath":"/cgit/aur.git/snapshot/hg-evolve.tar.gz"} 
+]''' > packages-meta-v1.json + +# Gzip archive +gzip -c packages-meta-v1.json > ../../https_aur.archlinux.org/packages-meta-v1.json.gz + +# Clean up removing tmp_dir +cd ../../ +rm -rf tmp_dir/ diff --git a/swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz b/swh/lister/aur/tests/data/https_aur.archlinux.org/packages-meta-v1.json.gz new file mode 100644 index 0000000000000000000000000000000000000000..56b8241d94c3867af8acbd299ae27a2ee2d0f3e6 GIT binary patch literal 701 zcmV;u0z&;CiwFogjRh=p|ZPaevq-hrhOyU)A(^~N-%7{ZZv$r|pBJ;PaDa<2TDFy6?Uvf_@-_zBazOGXGHlI?8wYi_yHLbfiT zB0>TTb6SwfM=o^Ab)m$ z&4WTVVVSF08?Fy^nU}V?m4@rCKi8XF*qSL}9=$OPAk2~!kGv~uuBuaJT)(ZY>ybBM zqBZVu3S)02>6Ffd(&onIjWC9nPA^U%%?Kq<;llMgk)^1_Dnha>A}B@Cs?#YmbEgv) zuFsHJ6?AXivW1>Y6K)$680Ml+-mlRMfHZ-az5)=r)&DR*!LV=sR{pE;C?G+Ek5P0y z<5gUcg7N|XFZqK3AGx!Dbvxj^U04H7tOk`-;6k+a;eQ;ulb@HRxD&e^_Az9sbMYVs z!|IwVIRqj>5lv!>Q4hpcUGC=_Q=fEkWUnpP1-bO6i^s~w0xzriOdMxe2^5q_m)Yj|7gm+ysD3#qvVUnkS) zdVJn};klb3!&Su!{(gs^gQq)PpZP|LK&n}|!wmLD=Mb(REo+$?q=jJ=#c5~gKCPIf zY37zU(Aslz02+;wINCs)&wPF_>w7+c3lkXAeO%iGTf+n0^q>JOXElEidCfsrWLG02 zD=^`zuu8BR%!M{mJ@#g1xm*U#I?LBZvogPwMYs{#MUnJxF`TCK;4Q`)_ZRpv9zsHJ jj57C^M|~s&!8D^Zc>|J-V;dMp-;e$Ru%lBIO9lV{cAiU- literal 0 HcmV?d00001 diff --git a/swh/lister/aur/tests/test_lister.py b/swh/lister/aur/tests/test_lister.py new file mode 100644 index 0000000..c403dad --- /dev/null +++ b/swh/lister/aur/tests/test_lister.py @@ -0,0 +1,131 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information +from swh.lister.aur.lister import AurLister + +expected_origins = [ + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/hg-evolve.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "hg-evolve.tar.gz", + "url": 
"https://aur.archlinux.org/cgit/aur.git/snapshot/hg-evolve.tar.gz", # noqa: B950 + "version": "10.5.1-1", + } + ], + "aur_metadata": [ + { + "version": "10.5.1-1", + "project_url": "https://www.mercurial-scm.org/doc/evolution/", + "last_update": "2022-04-27T20:02:56+00:00", + "pkgname": "hg-evolve", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/ibus-git.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "ibus-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/ibus-git.tar.gz", # noqa: B950 + "version": "1.5.23+12+gef4c5c7e-1", + } + ], + "aur_metadata": [ + { + "version": "1.5.23+12+gef4c5c7e-1", + "project_url": "https://github.com/ibus/ibus/wiki", + "last_update": "2021-02-08T06:12:11+00:00", + "pkgname": "ibus-git", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/libervia-web-hg.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "libervia-web-hg.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/libervia-web-hg.tar.gz", # noqa: B950 + "version": "0.9.0.r1492.3a34d78f2717-1", + } + ], + "aur_metadata": [ + { + "version": "0.9.0.r1492.3a34d78f2717-1", + "project_url": "http://salut-a-toi.org/", + "last_update": "2022-02-26T15:30:58+00:00", + "pkgname": "libervia-web-hg", + } + ], + }, + }, + { + "visit_type": "aur", + "url": "https://aur.archlinux.org/tealdeer-git.git", + "extra_loader_arguments": { + "artifacts": [ + { + "filename": "tealdeer-git.tar.gz", + "url": "https://aur.archlinux.org/cgit/aur.git/snapshot/tealdeer-git.tar.gz", # noqa: B950 + "version": "r255.30b7c5f-1", + } + ], + "aur_metadata": [ + { + "version": "r255.30b7c5f-1", + "project_url": "https://github.com/dbrgn/tealdeer", + "last_update": "2020-09-04T20:36:52+00:00", + "pkgname": "tealdeer-git", + } + ], + }, + }, +] + + +def test_aur_lister(datadir, requests_mock_datadir, swh_scheduler): + lister = AurLister(scheduler=swh_scheduler) + res = 
lister.run() + + assert res.pages == 4 + assert res.origins == 4 + + scheduler_origins_sorted = sorted( + swh_scheduler.get_listed_origins(lister.lister_obj.id).results, + key=lambda x: x.url, + ) + expected_origins_sorted = sorted(expected_origins, key=lambda x: x.get("url")) + + assert len(scheduler_origins_sorted) == len(expected_origins_sorted) + + assert [ + ( + scheduled.visit_type, + scheduled.url, + scheduled.extra_loader_arguments.get("artifacts"), + ) + for scheduled in scheduler_origins_sorted + ] == [ + ( + "aur", + expected.get("url"), + expected.get("extra_loader_arguments").get("artifacts"), + ) + for expected in expected_origins_sorted + ] + + +def test_aur_lister_directory_cleanup(datadir, requests_mock_datadir, swh_scheduler): + lister = AurLister(scheduler=swh_scheduler) + lister.run() + # Repository directory should not exists after the lister runs + assert not lister.DESTINATION_PATH.exists() diff --git a/swh/lister/aur/tests/test_tasks.py b/swh/lister/aur/tests/test_tasks.py new file mode 100644 index 0000000..44e72d1 --- /dev/null +++ b/swh/lister/aur/tests/test_tasks.py @@ -0,0 +1,31 @@ +# Copyright (C) 2022 The Software Heritage developers +# See the AUTHORS file at the top-level directory of this distribution +# License: GNU General Public License version 3, or any later version +# See top-level LICENSE file for more information + +from swh.lister.pattern import ListerStats + + +def test_aur_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker): + res = swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.ping") + assert res + res.wait() + assert res.successful() + assert res.result == "OK" + + +def test_aur_lister(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker): + # setup the mocked AurLister + lister = mocker.patch("swh.lister.aur.tasks.AurLister") + lister.from_configfile.return_value = lister + stats = ListerStats(pages=42, origins=42) + lister.run.return_value = stats + + res = 
swh_scheduler_celery_app.send_task("swh.lister.aur.tasks.AurListerTask") + assert res + res.wait() + assert res.successful() + assert res.result == stats.dict() + + lister.from_configfile.assert_called_once_with() + lister.run.assert_called_once_with()