rpm: Turn fedora lister into a generic Red Hat based distribution one

As Red Hat based linux distributions share the same type of package repository,
rework the fedora lister into a generic one to list RPM source packages and
their versions from numerous distributions.

For a given distribution, the RPM lister fetches package metadata from a
list of release identifiers and a list of software components. Source packages
are then processed and the relevant info is extracted to be sent to the RPM
loader. Once all releases and components have been processed, the lister has
collected all versions of each package name and sends that info to the
scheduler, which will create RPM loading tasks afterwards.

Nevertheless, as there is no generic way to list all releases and components of
a given distribution, nor to guess the right URL to retrieve package metadata
from, that info needs to be manually provided to the lister as input parameters.
Some examples of those parameters for various distributions can be found in the
config directory of the lister.

Regarding the produced origin URLs, as there is no way to find valid HTTP ones
for all distributions, the same behavior as with the debian lister is used and
they have the following form: rpm://{instance}/packages/{package_name} where
the instance variable corresponds to the name of the listed distribution such
as Fedora, CentOS, or openSUSE.

Related to swh/meta#5011.
This commit is contained in:
Antoine Lambert 2023-08-16 13:25:23 +00:00
parent fcfb7004db
commit 95714f6f37
23 changed files with 1096 additions and 577 deletions

View file

@ -13,7 +13,6 @@ following Python modules:
- `swh.lister.cgit`
- `swh.lister.cran`
- `swh.lister.debian`
- `swh.liser.fedora`
- `swh.lister.gitea`
- `swh.lister.github`
- `swh.lister.gitlab`
@ -27,6 +26,7 @@ following Python modules:
- `swh.lister.packagist`
- `swh.lister.phabricator`
- `swh.lister.pypi`
- `swh.lister.rpm`
- `swh.lister.tuleap`
Dependencies

View file

@ -65,7 +65,6 @@ setup(
lister.cran=swh.lister.cran:register
lister.crates=swh.lister.crates:register
lister.debian=swh.lister.debian:register
lister.fedora=swh.lister.fedora:register
lister.gitea=swh.lister.gitea:register
lister.github=swh.lister.github:register
lister.gitiles=swh.lister.gitiles:register
@ -87,6 +86,7 @@ setup(
lister.pubdev=swh.lister.pubdev:register
lister.puppet=swh.lister.puppet:register
lister.pypi=swh.lister.pypi:register
lister.rpm=swh.lister.rpm:register
lister.rubygems=swh.lister.rubygems:register
lister.sourceforge=swh.lister.sourceforge:register
lister.stagit=swh.lister.stagit:register

View file

@ -1,265 +0,0 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass, field
from datetime import datetime, timezone
import logging
from typing import Any, Dict, Iterator, List, Optional, Set, Type
from urllib.error import HTTPError
from urllib.parse import urljoin
import repomd
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import Lister
logger = logging.getLogger(__name__)
# Type aliases used in the annotations of the Fedora lister
Release = int
Edition = str
PkgName = str
PkgVersion = str
FedoraOrigin = str
# A page is the object returned by ``repomd.load`` (a ``repomd.Repo``
# instance), not the ``repomd.Repo`` class itself, hence no ``Type[...]``
FedoraPageType = repomd.Repo
"""Each page is a list of packages from a given Fedora (release, edition) pair"""
def get_editions(release: Release) -> List[Edition]:
    """Return the names of the editions to list for a given Fedora release.

    Directories that don't contain .rpm files are ignored:
    Docker,CloudImages,Atomic*,Spins,Live,Cloud_Atomic,Silverblue
    """
    if release < 20:
        return ["Everything", "Fedora"]

    editions = ["Everything", "Server", "Workstation"]
    if release >= 28:
        editions.append("Modular")
    return editions
def get_last_modified(pkg: repomd.Package) -> datetime:
    """Get timezone aware last modified time in UTC from RPM package metadata.

    The time is read from the ``build`` attribute of the ``common:time``
    element of the package metadata and converted with ``int`` before being
    turned into a datetime.
    """
    ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build")
    # datetime.utcfromtimestamp is deprecated since Python 3.12: build the
    # aware UTC datetime directly, the resulting value is the same
    return datetime.fromtimestamp(int(ts), tz=timezone.utc)
def get_checksums(pkg: repomd.Package) -> Dict[str, str]:
    """Return a one-entry dict mapping the checksum type of an rpm archive
    to its checksum value, as found in the package metadata."""
    checksum_elt = pkg._element.find("common:checksum", namespaces=repomd._ns)
    algo = checksum_elt.get("type")
    # the "sha" type name found in metadata is reported as "sha1"
    return {"sha1" if algo == "sha" else algo: checksum_elt.text}
@dataclass
class FedoraListerState:
    """Data saved by the Fedora lister between two listings"""

    # maps a package name to all the versions found during the last listing
    package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict)
class FedoraLister(Lister[FedoraListerState, FedoraPageType]):
    """
    List source packages for given Fedora releases.

    The lister will create a snapshot for each package name from all its
    available versions.

    If a package snapshot is different from the last listing operation,
    it will be sent to the scheduler that will create a loading task
    to archive newly found source code.

    Args:
        scheduler: instance of SchedulerInterface
        url: fedora package archives mirror URL
        releases: list of fedora releases to process, ``[34, 35, 36]`` is
            used when the parameter is not provided
    """

    LISTER_NAME = "fedora"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        instance: str = "fedora",
        url: str = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
        releases: Optional[List[Release]] = None,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials={},
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        # build the default list per instance instead of sharing a mutable
        # default argument between all instances of the lister
        self.releases: List[Release] = [34, 35, 36] if releases is None else releases

        # will hold all listed origins info
        self.listed_origins: Dict[FedoraOrigin, ListedOrigin] = {}
        # will hold updated origins since last listing
        self.origins_to_send: Set[FedoraOrigin] = set()
        # will contain the lister state after a call to run
        self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}
        # set by page_request when the page of the last (release, edition)
        # pair has been fetched
        self.last_page = False

    def state_from_dict(self, d: Dict[str, Any]) -> FedoraListerState:
        """Build the lister state from its serialized form
        (package name -> list of versions)."""
        return FedoraListerState(package_versions={k: set(v) for k, v in d.items()})

    def state_to_dict(self, state: FedoraListerState) -> Dict[str, Any]:
        """Serialize the lister state (sets of versions are turned into lists)."""
        return {k: list(v) for k, v in state.package_versions.items()}

    def page_request(self, release: Release, edition: Edition) -> FedoraPageType:
        """Return parsed packages for a given fedora release and edition.

        Errors raised by ``repomd.load`` are not caught here, HTTP ones are
        handled by :meth:`get_pages`.
        """
        index_url = urljoin(
            self.url,
            f"{release}/{edition}/source/SRPMS/"
            if release < 24
            else f"{release}/{edition}/source/tree/",
        )

        repo = repomd.load(index_url)  # raises an error if no repomd.xml is found

        # NOTE(review): this flag is only updated after a successful load, so
        # when the request for the last (release, edition) pair fails with a
        # 404 (skipped by get_pages) it stays False and get_origins_from_page
        # never yields the collected origins
        self.last_page = (
            release == self.releases[-1] and edition == get_editions(release)[-1]
        )

        logger.debug(
            "Fetched metadata from url: %s, found %d packages", index_url, len(repo)
        )
        # TODO: Extract more fields like "provides" and "requires" from *primary.xml
        # as extrinsic metadata using the pkg._element.findtext method
        return repo

    def get_pages(self) -> Iterator[FedoraPageType]:
        """Return an iterator on parsed fedora packages, one page per (release, edition) pair"""
        for release in self.releases:
            for edition in get_editions(release):
                logger.debug("Listing fedora release %s edition %s", release, edition)
                # remembered to build package version keys in get_origins_from_page
                self.current_release = release
                self.current_edition = edition
                # only the request is guarded, the page is yielded outside
                # of the try block
                try:
                    page = self.page_request(release, edition)
                except HTTPError as http_error:
                    if http_error.getcode() != 404:
                        raise
                    logger.debug(
                        "No packages metadata found for fedora release %s edition %s",
                        release,
                        edition,
                    )
                    continue
                yield page

    def origin_url_for_package(self, package_name: PkgName) -> FedoraOrigin:
        """Return the origin url for the given package"""
        return f"https://src.fedoraproject.org/rpms/{package_name}"

    def get_origins_from_page(self, page: FedoraPageType) -> Iterator[ListedOrigin]:
        """Convert a page of fedora package sources into an iterator of ListedOrigin.

        Origins are accumulated page after page and only yielded once the
        last page has been processed.
        """
        assert self.lister_obj.id is not None

        origins_to_send = set()

        # iterate on each package's metadata
        for pkg_metadata in page:
            # extract package metadata
            package_name = pkg_metadata.name
            package_version = pkg_metadata.vr
            package_version_split = package_version.split(".")
            if package_version_split[-1].startswith("fc"):
                # remove trailing ".fcXY" in version for the rpm loader to avoid
                # creating multiple releases targeting same directory
                package_version = ".".join(package_version_split[:-1])

            package_build_time = get_last_modified(pkg_metadata)
            package_download_path = pkg_metadata.location

            # build origin url
            origin_url = self.origin_url_for_package(package_name)
            # create package version key as expected by the fedora (rpm) loader
            package_version_key = (
                f"fedora{self.current_release}/{self.current_edition}/"
                f"{package_version}"
            ).lower()

            # this is the first time a package is listed
            if origin_url not in self.listed_origins:
                # create a ListedOrigin object for it that can be later
                # updated with new package versions info
                self.listed_origins[origin_url] = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=origin_url,
                    visit_type="rpm",
                    extra_loader_arguments={"packages": {}},
                    last_update=package_build_time,
                )

                # init set that will contain all listed package versions
                self.package_versions[package_name] = set()

            # origin will be yielded at the end of that method
            origins_to_send.add(origin_url)

            # update package metadata in parameter that will be provided
            # to the rpm loader
            self.listed_origins[origin_url].extra_loader_arguments["packages"][
                package_version_key
            ] = {
                "name": package_name,
                "version": package_version,
                "url": urljoin(page.baseurl, package_download_path),
                "buildTime": package_build_time.isoformat(),
                "checksums": get_checksums(pkg_metadata),
            }

            last_update = self.listed_origins[origin_url].last_update
            if last_update is not None and package_build_time > last_update:
                self.listed_origins[origin_url].last_update = package_build_time

            # add package version key to the set of found versions
            self.package_versions[package_name].add(package_version_key)

            # package has already been listed during a previous listing process
            if package_name in self.state.package_versions:
                new_versions = (
                    self.package_versions[package_name]
                    - self.state.package_versions[package_name]
                )
                # no new versions so far, no need to send the origin to the scheduler
                if not new_versions:
                    origins_to_send.discard(origin_url)

        logger.debug(
            "Found %s packages to update (new ones or packages with new versions).",
            len(origins_to_send),
        )
        logger.debug(
            "Current total number of listed packages is equal to %s.",
            len(self.listed_origins),
        )

        self.origins_to_send.update(origins_to_send)

        if self.last_page:
            # yield listed origins when all fedora releases and editions processed
            yield from [
                self.listed_origins[origin_url] for origin_url in self.origins_to_send
            ]

    def finalize(self):
        """Save found package versions as lister state and flag the lister as
        updated when at least one origin was listed."""
        # set mapping between listed package names and versions as lister state
        self.state.package_versions = self.package_versions
        self.updated = len(self.listed_origins) > 0

View file

@ -1,21 +0,0 @@
# Copyright (C) 2022 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Dict
from celery import shared_task
from .lister import FedoraLister
@shared_task(name=__name__ + ".FullFedoraRelister")
def list_fedora_full(**lister_args) -> Dict[str, int]:
    """Run a full listing of a Fedora instance and return its stats as a dict"""
    stats = FedoraLister.from_configfile(**lister_args).run()
    return stats.dict()
@shared_task(name=__name__ + ".ping")
def _ping() -> str:
    """Task always returning the "OK" string"""
    return "OK"

View file

@ -1,221 +0,0 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from io import StringIO
from pathlib import Path
from typing import List
from unittest.mock import MagicMock
from urllib.error import HTTPError
import pytest
from swh.lister.fedora.lister import FedoraLister, Release, get_editions
from swh.scheduler.interface import SchedulerInterface
def mock_repomd(datadir, mocker, use_altered_fedora36=False):
    """Make the urlopen function used by repomd serve local .xml fixture
    files for the next lister run"""
    if use_altered_fedora36:
        primary36 = "primary36-altered.xml.gz"
    else:
        primary36 = "primary36.xml.gz"
    fixture_names = ["repomd26.xml", "primary26.xml.gz", "repomd36.xml", primary36]

    # each call to read() returns the content of the next fixture file
    response = MagicMock()
    response.read.side_effect = [
        Path(datadir, "archives.fedoraproject.org", fixture_name).read_bytes()
        for fixture_name in fixture_names
    ]
    # the mock is used as a context manager returning itself
    response.__enter__.return_value = response

    urlopen_patch = mocker.patch("repomd.urllib.request.urlopen")
    urlopen_patch.return_value = response
def rpm_url(release, path):
    """Return the URL of a source package located at ``path`` in the
    Everything edition of a Fedora release"""
    releases_url = "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/"
    return f"{releases_url}{release}/Everything/source/tree/Packages/{path}"
@pytest.fixture
def pkg_versions():
    """Expected ``packages`` loader argument for each origin URL listed from
    the mocked Fedora 26 and 36 metadata"""
    zero_install_fc26 = {
        "name": "0install",
        "version": "2.11-4",
        "buildTime": "2017-02-10T04:59:31+00:00",
        "url": rpm_url(26, "0/0install-2.11-4.fc26.src.rpm"),
        "checksums": {
            # note: we intentionally altered the original
            # primary26.xml file to test sha1 usage
            "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b",
        },
    }
    oxffff_fc26 = {
        "name": "0xFFFF",
        "version": "0.3.9-15",
        "buildTime": "2017-02-10T05:01:53+00:00",
        "url": rpm_url(26, "0/0xFFFF-0.3.9-15.fc26.src.rpm"),
        "checksums": {
            "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f"
        },
    }
    oxffff_fc36 = {
        "name": "0xFFFF",
        "version": "0.9-4",
        "buildTime": "2022-01-19T19:13:53+00:00",
        "url": rpm_url(36, "0/0xFFFF-0.9-4.fc36.src.rpm"),
        "checksums": {
            "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd"
        },
    }
    two_ping_fc36 = {
        "name": "2ping",
        "version": "4.5.1-2",
        "buildTime": "2022-01-19T19:12:21+00:00",
        "url": rpm_url(36, "2/2ping-4.5.1-2.fc36.src.rpm"),
        "checksums": {
            "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28"
        },
    }
    return {
        "https://src.fedoraproject.org/rpms/0install": {
            "fedora26/everything/2.11-4": zero_install_fc26,
        },
        "https://src.fedoraproject.org/rpms/0xFFFF": {
            "fedora26/everything/0.3.9-15": oxffff_fc26,
            "fedora36/everything/0.9-4": oxffff_fc36,
        },
        "https://src.fedoraproject.org/rpms/2ping": {
            "fedora36/everything/4.5.1-2": two_ping_fc36,
        },
    }
def run_lister(
    swh_scheduler: SchedulerInterface,
    releases: List[Release],
    pkg_versions: dict,
    origin_count: int,
    updated: bool = True,
):
    """Run the Fedora lister then check its stats, the origins recorded in
    the scheduler and the saved lister state."""
    lister = FedoraLister(scheduler=swh_scheduler, releases=releases)
    stats = lister.run()

    listed = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    saved_state = lister.get_state_from_scheduler()

    # the state is keyed by package name, the last part of origin URLs
    expected_state = {
        origin_url.split("/")[-1]: set(versions)
        for origin_url, versions in pkg_versions.items()
    }
    # One edition from each release (we mocked get_editions)
    expected_pages = len(releases) if updated else 0

    assert stats.pages == expected_pages
    assert stats.origins == origin_count

    listed_packages = {}
    for origin in listed:
        listed_packages[origin.url] = origin.extra_loader_arguments["packages"]
    assert listed_packages == pkg_versions

    assert saved_state.package_versions == expected_state
    assert lister.updated == updated
def test_get_editions():
    """Check the editions returned for releases 18, 26 and 34."""
    everything = "Everything"
    server_and_workstation = ["Server", "Workstation"]
    assert get_editions(18) == [everything, "Fedora"]
    assert get_editions(26) == [everything, *server_and_workstation]
    assert get_editions(34) == [everything, *server_and_workstation, "Modular"]
@pytest.mark.parametrize("status_code", [400, 404, 500])
def test_fedora_lister_http_error(
    swh_scheduler: SchedulerInterface, mocker: MagicMock, status_code: int
):
    """
    Simulates handling of HTTP Errors while fetching of packages for fedora releases.
    """
    releases = [18]
    is_404 = status_code == 404
    reason = "Not Found" if is_404 else "Internal server error"

    def side_effect(url):
        # every request made by repomd fails with the tested status code
        raise HTTPError(
            url, status_code, reason, {"content-type": "text/html"}, StringIO()
        )

    mocker.patch("repomd.urllib.request.urlopen").side_effect = side_effect

    expected_pkgs: dict = {}

    if not is_404:
        # errors other than 404 are propagated by the lister
        with pytest.raises(HTTPError):
            run_lister(
                swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False
            )
        return

    # 404 errors are skipped: nothing is listed
    run_lister(swh_scheduler, releases, expected_pkgs, origin_count=0, updated=False)
def test_full_lister_fedora(
    swh_scheduler: SchedulerInterface,
    mocker: MagicMock,
    datadir: Path,
    pkg_versions: dict,
):
    """
    Simulates a full listing of packages for fedora releases.
    """
    # restrict listing to a single edition per release
    mocker.patch("swh.lister.fedora.lister.get_editions", return_value=["Everything"])
    mock_repomd(datadir, mocker)

    run_lister(swh_scheduler, [26, 36], pkg_versions, origin_count=3)
def test_incremental_lister(
    swh_scheduler: SchedulerInterface,
    mocker: MagicMock,
    datadir: Path,
    pkg_versions: dict,
):
    """
    Simulates an incremental listing of packages for fedora releases.
    """
    releases = [26, 36]
    # restrict listing to a single edition per release
    mocker.patch("swh.lister.fedora.lister.get_editions", return_value=["Everything"])

    # First run
    mock_repomd(datadir, mocker)
    run_lister(swh_scheduler, releases, pkg_versions, origin_count=3)

    # Second run (no updates)
    mock_repomd(datadir, mocker)
    run_lister(swh_scheduler, releases, pkg_versions, origin_count=0)

    # Use an altered version of primary36.xml in which we updated the version
    # of package 0xFFFF to 0.10:
    mock_repomd(datadir, mocker, use_altered_fedora36=True)

    # Add new version to the set of expected pkg versions:
    updated_origin = "https://src.fedoraproject.org/rpms/0xFFFF"
    pkg_versions[updated_origin]["fedora36/everything/0.10-4"] = {
        "name": "0xFFFF",
        "version": "0.10-4",
        "buildTime": "2022-01-19T19:13:53+00:00",
        "url": rpm_url(36, "0/0xFFFF-0.10-4.fc36.src.rpm"),
        "checksums": {
            "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd"
        },
    }

    # Third run (0xFFFF in fedora36 editions got updated and it needs to be listed)
    run_lister(swh_scheduler, releases, pkg_versions, origin_count=1)

View file

@ -1,60 +0,0 @@
# Copyright (C) 2022 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from unittest.mock import patch
from swh.lister.pattern import ListerStats
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Check the ping task of the fedora lister succeeds and returns "OK"."""
    task_result = swh_scheduler_celery_app.send_task("swh.lister.fedora.tasks.ping")
    assert task_result
    task_result.wait()
    assert task_result.successful()
    assert task_result.result == "OK"
@patch("swh.lister.fedora.tasks.FedoraLister")
def test_full_listing(lister, swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Check the full listing task forwards its arguments to the lister
    and runs it once."""
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = {
        "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
    }
    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.fedora.tasks.FullFedoraRelister",
        kwargs=kwargs,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()
@patch("swh.lister.fedora.tasks.FedoraLister")
def test_full_listing_params(
    lister, swh_scheduler_celery_app, swh_scheduler_celery_worker
):
    """Check the full listing task forwards the url, instance and releases
    arguments to the lister and runs it once."""
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    kwargs = {
        "url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
        "instance": "archives.fedoraproject.org",
        "releases": ["36"],
    }
    task_result = swh_scheduler_celery_app.send_task(
        "swh.lister.fedora.tasks.FullFedoraRelister",
        kwargs=kwargs,
    )
    assert task_result
    task_result.wait()
    assert task_result.successful()

    lister.from_configfile.assert_called_once_with(**kwargs)
    lister.run.assert_called_once_with()

View file

@ -1,13 +1,13 @@
# Copyright (C) 2022 The Software Heritage developers
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Return the lister class and the task modules of this lister package."""
    # imported here rather than at module level, as in the original code
    from .lister import RPMLister

    # the stale FedoraLister import and the duplicated "lister" key left
    # over from the previous version are dropped: RPMLister is the lister
    return {
        "lister": RPMLister,
        "task_modules": [f"{__name__}.tasks"],
    }

View file

@ -0,0 +1,100 @@
# RPM lister parameters to process CentOS source packages
url: https://www.centos.org
instance: CentOS
rpm_src_data:
- base_url: https://vault.centos.org/
releases:
- "3.7"
- "3.8"
- "3.9"
- "4.0"
- "4.1"
- "4.2"
- "4.3"
- "4.4"
- "4.5"
- "4.6"
- "4.7"
- "4.8"
- "4.9"
- "5.0"
- "5.1"
- "5.2"
- "5.3"
- "5.4"
- "5.5"
- "5.6"
- "5.7"
- "5.8"
- "5.9"
- "5.10"
- "5.11"
- "6.0"
- "6.1"
- "6.2"
- "6.3"
- "6.4"
- "6.5"
- "6.6"
- "6.7"
- "6.8"
- "6.9"
- "6.10"
- "7.0.1406"
- "7.1.1503"
- "7.2.1511"
- "7.3.1611"
- "7.4.1708"
- "7.5.1804"
- "7.6.1810"
- "7.7.1908"
- "7.8.2003"
- "7.9.2009"
- "8-stream"
- "8.0.1905"
- "8.1.1911"
- "8.2.2004"
- "8.3.2011"
- "8.4.2105"
- "8.5.2111"
components:
- AppStream
- BaseOS
- HighAvailability
- PowerTools
- SCL
- addons
- centosplus
- contrib
- cr
- csgfs
- dotnet
- extras
- fasttrack
- opstools
- os
- rt
- testing
- updates
- xen4
index_url_templates:
- $base_url/$release/$component/Source/
- $base_url/$release/$component/SRPMS/
- $base_url/$release/$component/x86_64/
- base_url: https://mirror.stream.centos.org
releases:
- 9-stream
components:
- AppStream
- BaseOS
- CRB
- HighAvailability
- NFV
- RT
- ResilientStorage
index_url_templates:
- $base_url/$release/$component/source/tree/

View file

@ -0,0 +1,77 @@
# RPM lister parameters to process Fedora source packages
url: https://fedoraproject.org
instance: "Fedora"
rpm_src_data:
- base_url: https://archives.fedoraproject.org/pub/archive/fedora/linux/
releases:
- "2"
- "3"
- "4"
- "5"
- "6"
components:
- core
- extras
index_url_templates:
- $base_url/$component/$release/SRPMS
- $base_url/$component/$release/source/SRPMS
- $base_url/$component/$release/x86_64/os/
- base_url: https://archives.fedoraproject.org/pub/archive/fedora/linux/
releases:
- "7"
- "8"
- "9"
- "10"
- "11"
- "12"
- "13"
- "14"
- "15"
- "16"
- "17"
- "18"
- "19"
- "20"
- "21"
- "22"
- "23"
- "24"
- "25"
- "26"
- "27"
- "28"
- "29"
- "30"
- "31"
- "32"
- "33"
- "34"
- "35"
components:
- Everything
- Server
- Workstation
- Modular
- Fedora
index_url_templates:
- $base_url/releases/$release/$component/source/tree/
- $base_url/updates/$release/$component/source/tree/
- $base_url/releases/$release/$component/source/SRPMS/
- $base_url/updates/$release/SRPMS/
- base_url: https://dl.fedoraproject.org/pub/fedora/linux/
releases:
- "36"
- "37"
- "38"
components:
- Everything
- Server
- Workstation
- Modular
- Fedora
index_url_templates:
- $base_url/releases/$release/$component/source/tree/
- $base_url/updates/$release/$component/source/tree/

View file

@ -0,0 +1,26 @@
# RPM lister parameters to process openSUSE source packages
url: http://opensuse.org
instance: openSUSE
rpm_src_data:
- base_url: http://download.opensuse.org/source/
releases:
- tumbleweed
- jump/15.2
- leap/15.0-Current
- leap/15.0
- leap/15.1
- leap/15.2
- leap/15.3
- leap/15.4
- leap/15.5
- leap/42.2
- leap/42.3-Current
- leap/42.3
components:
- oss
- non-oss
index_url_templates:
- $base_url/distribution/$release/repo/$component/
- $base_url/distribution/$release/repo/$component/suse/
- $base_url/$release/repo/$component/

View file

@ -0,0 +1,156 @@
# RPM lister parameters to process Oracle Linux source packages
url: https://www.oracle.com/linux
instance: OracleLinux
rpm_src_data:
- base_url: https://yum.oracle.com/repo/EnterpriseLinux/
releases:
- EL5
components:
- addons
- oracle_addons
- unsupported
- 0/base
- 1/base
- 2/base
- 3/base
- 4/base
- 5/base
index_url_templates:
- $base_url/$release/$component/x86_64
- base_url: https://yum.oracle.com/repo/OracleLinux/
releases:
- OL5
- OL6
- OL7
- OL8
- OL9
components:
- 0/base
- 0/baseos/base
- 1/base
- 1/baseos/base
- 10/base
- 11/base
- 2/base
- 2/baseos/base
- 3/base
- 3/baseos/base
- 4/base
- 4/baseos/base
- 4/security/validation
- 5/base
- 5/baseos/base
- 6/base
- 6/baseos/base
- 7/base
- 7/baseos/base
- 8/base
- 8/baseos/base
- 8/security/validation
- 9/base
- MODRHCK
- MySQL
- MySQL56
- MySQL57_community
- MySQL80/community
- MySQL80/connectors/community
- MySQL80/tools/community
- MySQL80_community
- RDMA
- SoftwareCollections
- UEK/latest
- UEKR3
- UEKR3/latest
- UEKR3_OFED20
- UEKR4
- UEKR4/OFED
- UEKR4/archive
- UEKR5
- UEKR5/RDMA
- UEKR5/archive
- UEKR6
- UEKR6/RDMA
- UEKR7
- UEKR7/RDMA
- addons
- appstream
- appstream/developer
- automation2
- baseos/developer
- baseos/latest
- beta
- ceph
- ceph30
- codeready/builder
- codeready/builder/developer
- developer
- developer/EPEL
- developer/EPEL/modular
- developer/UEKR5
- developer/UEKR6
- developer/UEKR7
- developer/golang117
- developer/golang118
- developer/golang119
- developer/kvm/utils
- developer/nodejs12
- developer/olcne
- developer/php74
- developer_EPEL
- developer_gluster310
- developer_gluster312
- distro/builder
- gluster/appstream
- gluster312
- gluster41
- gluster5
- gluster6
- gluster8
- kvm/appstream
- kvm/utils
- latest
- latest/archive
- leapp
- ofed_UEK
- olcne
- olcne11
- olcne12
- olcne13
- olcne14
- olcne15
- olcne16
- openstack10
- openstack21
- openstack30
- openstack40
- openstack40_extras
- openstack50
- openstack50_extras
- optional
- optional/archive
- optional/beta
- oracle/instantclient
- oracle/instantclient21
- oraclelinuxmanager210/client
- oraclelinuxmanager210/server
- ovirt42
- ovirt42/extras
- ovirt43
- ovirt43/extras
- ovirt44
- ovirt44/extras
- security/validation
- spacewalk210/client
- spacewalk210/server
- spacewalk24/client
- spacewalk24/server
- spacewalk26/client
- spacewalk26/server
- spacewalk27/client
- spacewalk27/server
index_url_templates:
- $base_url/$release/$component/x86_64

View file

@ -0,0 +1,38 @@
# RPM lister parameters to process Rocky Linux source packages
url: https://rockylinux.org
instance: RockyLinux
rpm_src_data:
- base_url: https://download.rockylinux.org/
releases:
- "8.3"
- "8.4"
- "8.4-RC1"
- "8.5"
- "8.6"
- "8.7"
- "8.8"
- "9.0"
- "9.1"
- "9.2"
components:
- AppStream
- BaseOS
- Devel
- HighAvailability
- Minimal
- PowerTools
- ResilientStorage
- CRB
- NFV
- RT
- SAP
- SAPHANA
- devel
- extras
- plus
- nfv
- rockyrpi
index_url_templates:
- $base_url/vault/rocky/$release/$component/source/tree/
- $base_url/pub/rocky/$release/$component/source/tree/

314
swh/lister/rpm/lister.py Normal file
View file

@ -0,0 +1,314 @@
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass, field
from datetime import datetime, timezone
from itertools import product
import logging
from string import Template
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from urllib.parse import urljoin
import repomd
from typing_extensions import TypedDict
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import Lister
logger = logging.getLogger(__name__)
Release = str
Component = str
PkgName = str
PkgVersion = str
RPMOrigin = str
RPMPageType = Optional[Tuple[Release, Component, repomd.Repo]]
"""Each page is a list of packages for a given (release, component) pair
from a Red Hat based distribution."""
class RPMSourceData(TypedDict):
    """Dictionary holding relevant data for listing RPM source packages.

    See content of the lister config directory to get examples of RPM
    source data for famous RedHat based distributions.
    """

    base_url: str
    """Base URL of a RPM repository"""
    releases: List[Release]
    """List of release identifiers for a Red Hat based distribution"""
    components: List[Component]
    """List of components for a Red Hat based distribution"""
    index_url_templates: List[str]
    """List of URL templates to discover source packages metadata, the
    following variables can be substituted in them: ``base_url``, ``release``
    and ``component``, see :class:`string.Template` for more details about the
    format. The generated URLs must target directories containing a sub-directory
    named ``repodata``, which contains packages metadata, in order to be
    successfully processed by the lister."""
def _get_last_modified(pkg: repomd.Package) -> datetime:
    """Get timezone aware last modified time in UTC from RPM package metadata.

    The time is read from the ``build`` attribute of the ``common:time``
    element of the package metadata and converted with ``int`` before being
    turned into a datetime.
    """
    ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build")
    # datetime.utcfromtimestamp is deprecated since Python 3.12: build the
    # aware UTC datetime directly, the resulting value is the same
    return datetime.fromtimestamp(int(ts), tz=timezone.utc)
def _get_checksums(pkg: repomd.Package) -> Dict[str, str]:
    """Return a one-entry dict mapping the checksum type of an rpm archive
    to its checksum value, as found in the package metadata."""
    checksum_elt = pkg._element.find("common:checksum", namespaces=repomd._ns)
    algo = checksum_elt.get("type")
    # the "sha" type name found in metadata is reported as "sha1"
    return {"sha1" if algo == "sha" else algo: checksum_elt.text}
@dataclass
class RPMListerState:
    """Data saved by the RPM lister between two listings"""

    # maps a package name to all the versions found during the last listing
    package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict)
class RPMLister(Lister[RPMListerState, RPMPageType]):
"""
List source packages for a Red Hat based linux distribution.
The lister creates a snapshot for each package from all its available versions.
In incremental mode, only packages with different snapshot since the last listing
operation will be sent to the scheduler that will create loading tasks to archive
newly found source code.
Args:
scheduler: instance of SchedulerInterface
url: Red Hat based distribution info URL
instance: name of Red Hat based distribution
rpm_src_data: list of dictionaries holding data required to list RPM source packages,
see examples in the config directory.
incremental: if :const:`True`, only packages with new versions are sent to the
scheduler when relisting
"""
LISTER_NAME = "rpm"
def __init__(
    self,
    scheduler: SchedulerInterface,
    url: str,
    instance: str,
    rpm_src_data: List[RPMSourceData],
    incremental: bool = False,
    max_origins_per_page: Optional[int] = None,
    max_pages: Optional[int] = None,
    enable_origins: bool = True,
):
    """Initialize the base lister (with empty credentials) then the
    containers filled while listing."""
    super().__init__(
        scheduler=scheduler,
        url=url,
        instance=instance,
        credentials={},
        max_origins_per_page=max_origins_per_page,
        max_pages=max_pages,
        enable_origins=enable_origins,
    )

    # listing parameters
    self.incremental = incremental
    self.rpm_src_data = rpm_src_data

    # package versions found during the listing, by package name
    self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}
    # URLs of the origins yielded once all pages are processed
    self.origins_to_send: Set[RPMOrigin] = set()
    # ListedOrigin objects created during the listing, by origin URL
    self.listed_origins: Dict[RPMOrigin, ListedOrigin] = {}
def state_from_dict(self, d: Dict[str, Any]) -> RPMListerState:
    """Build the lister state from its serialized form: the versions of
    each package name are turned into a set."""
    versions_by_package = {}
    for package_name, versions in d.items():
        versions_by_package[package_name] = set(versions)
    return RPMListerState(package_versions=versions_by_package)
def state_to_dict(self, state: RPMListerState) -> Dict[str, Any]:
    """Serialize the lister state: the set of versions of each package
    name is turned into a list."""
    serialized = {}
    for package_name, versions in state.package_versions.items():
        serialized[package_name] = list(versions)
    return serialized
def repo_request(
    self,
    index_url_template: Template,
    base_url: str,
    release: Release,
    component: Component,
) -> Optional[repomd.Repo]:
    """Return parsed packages for a given distribution release and component.

    Args:
        index_url_template: template of the URL to load repository metadata
            from, the ``base_url``, ``release`` and ``component`` variables
            are substituted in it
        base_url: base URL of the RPM repository, trailing slashes are removed
        release: release identifier
        component: component name

    Returns:
        The repository loaded by ``repomd.load`` or :const:`None` if loading
        it from the generated URL raised an error.
    """
    index_url = index_url_template.substitute(
        base_url=base_url.rstrip("/"), release=release, component=component
    )

    try:
        repo = repomd.load(index_url)  # raises an error if no repomd.xml is found
    except Exception:
        # best effort: URLs are generated from templates so failures are
        # expected; the exception is attached to the debug log so that the
        # reason of the failure is not lost
        logger.debug(
            "Repository metadata not found at URL %s", index_url, exc_info=True
        )
        return None

    logger.debug(
        "Fetched metadata from url: %s, found %d packages", index_url, len(repo)
    )
    return repo
def get_pages(self) -> Iterator[RPMPageType]:
    """Yield a (release, component, repository) page for each package
    repository that could be loaded, then :const:`None` once all RPM source
    data have been processed."""
    for src_data in self.rpm_src_data:
        templates = [Template(tpl) for tpl in src_data["index_url_templates"]]
        releases = src_data["releases"]
        components = src_data["components"]

        # try all possible package repository URLs for each (release, component) pair
        for release in releases:
            for component in components:
                for template in templates:
                    repo = self.repo_request(
                        template,
                        src_data["base_url"],
                        release,
                        component,
                    )
                    if repo is None:
                        continue
                    # valid package repository found, yield page
                    yield (release, component, repo)

    # a None page marks the end of the listing
    yield None
def origin_url_for_package(self, package_name: PkgName) -> RPMOrigin:
    """Build the origin URL of a package from the lister instance name
    and the package name."""
    # TODO: Use a better origin URL before deploying the lister to production
    # https://gitlab.softwareheritage.org/swh/devel/swh-model/-/issues/4632
    instance = self.instance
    return f"rpm://{instance}/packages/{package_name}"
def get_origins_from_page(self, page: RPMPageType) -> Iterator[ListedOrigin]:
    """Convert a page of rpm package sources into an iterator of ListedOrigin.

    When ``page`` is a ``(release, component, repo)`` tuple, nothing is
    yielded: source packages found in ``repo`` are recorded in
    ``self.listed_origins`` and ``self.package_versions`` and the URLs of the
    origins to schedule are accumulated in ``self.origins_to_send``.

    When ``page`` is ``None``, the ``ListedOrigin`` objects referenced by
    ``self.origins_to_send`` are yielded.
    """
    assert self.lister_obj.id is not None

    if page is None:
        # all pages processed, yield listed origins
        for origin_url in self.origins_to_send:
            yield self.listed_origins[origin_url]
        return

    release, component, repo = page

    logger.debug(
        "Listing %s release %s component %s from repository metadata located at %s",
        self.instance,
        release,
        component,
        repo.baseurl,
    )

    origins_to_send = set()
    # URLs of the origins whose ListedOrigin object is created by this page
    new_origins = set()

    # iterate on each package's metadata
    for pkg_metadata in repo:
        if pkg_metadata.arch != "src":
            # not a source package, skip it
            continue

        # extract package metadata
        package_name = pkg_metadata.name
        # we extract the intrinsic version of the package for the rpm loader
        # to avoid creating different releases targeting the same directory
        # 2.12-10.el8 => 2.12-10
        package_version_split = pkg_metadata.vr.rsplit("-", maxsplit=1)
        package_version = "-".join(
            [
                package_version_split[0],
                package_version_split[1].split(".", maxsplit=1)[0],
            ]
        )
        # create package version key as expected by the rpm loader
        package_version_key = f"{release}/{component}/{package_version}"
        package_build_time = _get_last_modified(pkg_metadata)
        package_download_url = urljoin(
            repo.baseurl.rstrip("/") + "/", pkg_metadata.location
        )
        checksums = _get_checksums(pkg_metadata)

        # build origin url
        origin_url = self.origin_url_for_package(package_name)

        # this is the first time a package is listed
        if origin_url not in self.listed_origins:
            # create a ListedOrigin object for it that can be later
            # updated with new package versions info
            self.listed_origins[origin_url] = ListedOrigin(
                lister_id=self.lister_obj.id,
                url=origin_url,
                visit_type="rpm",
                extra_loader_arguments={"packages": {}},
                last_update=package_build_time,
            )
            # init set that will contain all listed package versions
            self.package_versions[package_name] = set()
            new_origins.add(origin_url)

        # origins will be yielded when all pages processed
        origins_to_send.add(origin_url)

        # update package metadata in parameter that will be provided
        # to the rpm loader
        self.listed_origins[origin_url].extra_loader_arguments["packages"][
            package_version_key
        ] = {
            "name": package_name,
            "version": package_version,
            "url": package_download_url,
            "build_time": package_build_time.isoformat(),
            "checksums": checksums,
        }

        last_update = self.listed_origins[origin_url].last_update
        if last_update is not None and package_build_time > last_update:
            self.listed_origins[origin_url].last_update = package_build_time

        # add package version key to the set of found versions
        self.package_versions[package_name].add(package_version_key)

        # package has already been listed during a previous listing process
        if self.incremental and package_name in self.state.package_versions:
            new_versions = (
                self.package_versions[package_name]
                - self.state.package_versions[package_name]
            )
            # no new versions so far, no need to send the origin to the scheduler
            if not new_versions:
                origins_to_send.remove(origin_url)

    # only count new origins still scheduled for sending: in incremental mode
    # some of them may have been removed above, which previously led to a
    # negative number of "packages with new versions" in the log below
    new_origins_count = len(new_origins & origins_to_send)

    logger.debug(
        "Found %s packages to update (%s new ones and %s packages with new versions).",
        len(origins_to_send),
        new_origins_count,
        len(origins_to_send) - new_origins_count,
    )
    logger.debug(
        "Current total number of listed source packages is equal to %s.",
        len(self.listed_origins),
    )

    self.origins_to_send.update(origins_to_send)
def finalize(self):
    """Update the lister state when listing incrementally.

    The mapping between listed package names and versions becomes the lister
    state and the lister is flagged as updated if at least one origin was
    listed.
    """
    if not self.incremental:
        return
    # set mapping between listed package names and versions as lister state
    self.state.package_versions = self.package_versions
    self.updated = bool(self.listed_origins)

28
swh/lister/rpm/tasks.py Normal file
View file

@ -0,0 +1,28 @@
# Copyright (C) 2022-2023 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Dict
from celery import shared_task
from .lister import RPMLister
@shared_task(name=f"{__name__}.FullRPMLister")
def list_rpm_full(**lister_args) -> Dict[str, int]:
    """Full listing of Red Hat based distribution source packages"""
    rpm_lister = RPMLister.from_configfile(**lister_args)
    stats = rpm_lister.run()
    return stats.dict()
@shared_task(name=f"{__name__}.IncrementalRPMLister")
def list_rpm_incremental(**lister_args) -> Dict[str, int]:
    """Incremental listing of Red Hat based distribution source packages"""
    rpm_lister = RPMLister.from_configfile(**lister_args, incremental=True)
    stats = rpm_lister.run()
    return stats.dict()
@shared_task(name=f"{__name__}.ping")
def _ping() -> str:
    """Return a constant string, used to check that the task can be executed."""
    return "OK"

View file

@ -0,0 +1,283 @@
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
from string import Template
from typing import List
import pytest
from urllib3.exceptions import HTTPError
from swh.lister.rpm.lister import Component, Release, RPMLister
from swh.scheduler.interface import SchedulerInterface
# URL given to the lister as its ``url`` parameter
FEDORA_URL = "https://fedoraproject.org/"
# Base URL substituted to the $base_url placeholder of the templates below
FEDORA_ARCHIVE_URL = "https://archives.fedoraproject.org/pub/archive/fedora/linux"
# Candidate package repository locations, expressed as string.Template
# patterns with placeholders such as $base_url, $release and $component;
# the first one is the one used to build the URLs expected by the tests
FEDORA_INDEX_URL_TEMPLATES = [
    "$base_url/releases/$release/$component/source/tree/",
    "$base_url/updates/$release/$component/source/tree/",
    "$base_url/releases/$release/$component/source/SRPMS/",
    "$base_url/updates/$release/SRPMS/",
]
def mock_repomd(mocker, side_effect):
    """Mocks the .xml files fetched by repomd for the next lister run"""
    response = mocker.MagicMock()
    # the mocked response is used as a context manager returning itself
    response.__enter__.return_value = response
    response.read.side_effect = side_effect
    urlopen = mocker.patch("repomd.urllib.request.urlopen")
    urlopen.return_value = response
def mock_fedora_repomd(datadir, mocker, use_altered_fedora36=False):
    """Mock repomd responses with the Fedora 26 and 36 test data files.

    For each release, the content of the repomd and primary files is served
    first, then one HTTP error is raised for each remaining index URL
    template. When ``use_altered_fedora36`` is set, the altered primary file
    of Fedora 36 is served instead of the regular one.
    """
    if use_altered_fedora36:
        primary36 = "primary36-altered.xml.gz"
    else:
        primary36 = "primary36.xml.gz"
    repodata = [
        ("repomd26.xml", "primary26.xml.gz"),
        ("repomd36.xml", primary36),
    ]
    failed_requests_count = len(FEDORA_INDEX_URL_TEMPLATES) - 1

    side_effect = []
    for filenames in repodata:
        for filename in filenames:
            data_file = Path(datadir, "archives.fedoraproject.org", filename)
            side_effect.append(data_file.read_bytes())
        side_effect.extend(HTTPError() for _ in range(failed_requests_count))

    mock_repomd(mocker, side_effect)
def rpm_repodata_url(release, component):
    """Return the first index URL template filled for a release and component."""
    template = Template(FEDORA_INDEX_URL_TEMPLATES[0])
    return template.substitute(
        base_url=FEDORA_ARCHIVE_URL, release=release, component=component
    )
def rpm_src_package_url(release, component, path):
    """Return the expected download URL of a source package.

    ``path`` is appended to the ``Packages/`` location found under the URL
    returned by ``rpm_repodata_url`` for the release and component.
    """
    repodata_url = rpm_repodata_url(release, component)
    return "{}Packages/{}".format(repodata_url, path)
def rpm_package_origin_url(package_name, instance="Fedora"):
    """Return the expected origin URL of a package for a lister instance."""
    return "rpm://{}/packages/{}".format(instance, package_name)
@pytest.fixture
def pkg_versions():
    """Expected packages info by origin URL for Fedora releases 26 and 36."""
    zeroinstall_versions = {
        "26/Everything/2.11-4": {
            "name": "0install",
            "version": "2.11-4",
            "build_time": "2017-02-10T04:59:31+00:00",
            "url": rpm_src_package_url(
                release="26",
                component="Everything",
                path="0/0install-2.11-4.fc26.src.rpm",
            ),
            "checksums": {
                # note: we intentionally altered the original
                # primary26.xml file to test sha1 usage
                "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b",
            },
        }
    }

    oxffff_versions = {
        "26/Everything/0.3.9-15": {
            "name": "0xFFFF",
            "version": "0.3.9-15",
            "build_time": "2017-02-10T05:01:53+00:00",
            "url": rpm_src_package_url(
                release="26",
                component="Everything",
                path="0/0xFFFF-0.3.9-15.fc26.src.rpm",
            ),
            "checksums": {
                "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f"
            },
        },
        "36/Everything/0.9-4": {
            "name": "0xFFFF",
            "version": "0.9-4",
            "build_time": "2022-01-19T19:13:53+00:00",
            "url": rpm_src_package_url(
                release="36",
                component="Everything",
                path="0/0xFFFF-0.9-4.fc36.src.rpm",
            ),
            "checksums": {
                "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd"
            },
        },
    }

    twoping_versions = {
        "36/Everything/4.5.1-2": {
            "name": "2ping",
            "version": "4.5.1-2",
            "build_time": "2022-01-19T19:12:21+00:00",
            "url": rpm_src_package_url(
                release="36",
                component="Everything",
                path="2/2ping-4.5.1-2.fc36.src.rpm",
            ),
            "checksums": {
                "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28"
            },
        }
    }

    return {
        rpm_package_origin_url("0install"): zeroinstall_versions,
        rpm_package_origin_url("0xFFFF"): oxffff_versions,
        rpm_package_origin_url("2ping"): twoping_versions,
    }
def run_lister(
    swh_scheduler: SchedulerInterface,
    releases: List[Release],
    components: List[Component],
    pkg_versions: dict,
    origin_count: int,
    incremental: bool = False,
    updated: bool = True,
):
    """Runs the lister and tests that the listed origins are correct."""
    rpm_src_data = [
        {
            "base_url": FEDORA_ARCHIVE_URL,
            "releases": releases,
            "components": components,
            "index_url_templates": FEDORA_INDEX_URL_TEMPLATES,
        }
    ]
    lister = RPMLister(
        scheduler=swh_scheduler,
        url=FEDORA_URL,
        instance="Fedora",
        rpm_src_data=rpm_src_data,
        incremental=incremental,
    )

    stats = lister.run()
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    lister_state = lister.get_state_from_scheduler()

    # lister state is indexed by package name, the last part of the origin URL,
    # and holds the package version keys
    state_pkg_versions = {}
    for origin_url, versions in pkg_versions.items():
        state_pkg_versions[origin_url.split("/")[-1]] = set(versions)

    # One component from each release plus extra null page to flush origins
    expected_pages = len(releases) + 1 if updated else 1
    assert stats.pages == expected_pages
    assert stats.origins == origin_count

    listed_packages = {}
    for origin in scheduler_origins:
        listed_packages[origin.url] = origin.extra_loader_arguments["packages"]
    assert listed_packages == pkg_versions

    if incremental:
        assert lister_state.package_versions == state_pkg_versions
        assert lister.updated == updated
@pytest.mark.parametrize("status_code", [400, 404, 500])
def test_fedora_lister_http_error(swh_scheduler, mocker, status_code):
    """
    Simulates handling of HTTP Errors while fetching packages for fedora releases.

    Every index URL template gets an HTTP error so nothing should be listed.
    """
    # one error for each package repository URL that will be tried
    errors = [HTTPError() for _template in FEDORA_INDEX_URL_TEMPLATES]
    mock_repomd(mocker, side_effect=errors)

    run_lister(
        swh_scheduler,
        releases=["18"],
        components=["Everything"],
        pkg_versions={},
        origin_count=0,
        updated=False,
    )
def test_full_rpm_lister(
    swh_scheduler,
    mocker,
    datadir,
    pkg_versions,
):
    """
    Simulates a full listing of packages for fedora releases.
    """
    mock_fedora_repomd(datadir, mocker)

    listing_params = {
        "releases": ["26", "36"],
        "components": ["Everything"],
        "pkg_versions": pkg_versions,
        "origin_count": 3,
    }
    run_lister(swh_scheduler, **listing_params)
def test_incremental_rpm_lister(
    swh_scheduler,
    mocker,
    datadir,
    pkg_versions,
):
    """
    Simulates an incremental listing of packages for fedora releases.
    """

    def run_incremental_listing(origin_count):
        # pkg_versions is looked up at call time so that the update done
        # before the third run is taken into account
        run_lister(
            swh_scheduler,
            releases=["26", "36"],
            components=["Everything"],
            pkg_versions=pkg_versions,
            origin_count=origin_count,
            incremental=True,
        )

    # First run
    mock_fedora_repomd(datadir, mocker)
    run_incremental_listing(origin_count=3)

    # Second run (no updates)
    mock_fedora_repomd(datadir, mocker)
    run_incremental_listing(origin_count=0)

    # Use an altered version of primary36.xml in which we updated the version
    # of package 0xFFFF to 0.10:
    mock_fedora_repomd(datadir, mocker, use_altered_fedora36=True)

    # Add new version to the set of expected pkg versions:
    new_version_info = {
        "name": "0xFFFF",
        "version": "0.10-4",
        "build_time": "2022-01-19T19:13:53+00:00",
        "url": rpm_src_package_url(
            release="36",
            component="Everything",
            path="0/0xFFFF-0.10-4.fc36.src.rpm",
        ),
        "checksums": {
            "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd"
        },
    }
    pkg_versions[rpm_package_origin_url("0xFFFF")].update(
        {"36/Everything/0.10-4": new_version_info}
    )

    # Third run (0xFFFF in fedora36 component got updated and it needs to be listed)
    run_incremental_listing(origin_count=1)

View file

@ -0,0 +1,67 @@
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
from .test_lister import FEDORA_ARCHIVE_URL, FEDORA_INDEX_URL_TEMPLATES, FEDORA_URL
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Check the ping task of the rpm lister succeeds and returns "OK"."""
    task_name = "swh.lister.rpm.tasks.ping"
    res = swh_scheduler_celery_app.send_task(task_name)
    assert res

    res.wait()
    assert res.successful()
    assert res.result == "OK"
# keyword arguments sent to the listing tasks by the tests of this module
LISTER_KWARGS = {
    "url": FEDORA_URL,
    "instance": "fedora",
    "rpm_src_data": [
        {
            "base_url": FEDORA_ARCHIVE_URL,
            "releases": ["36"],
            "components": ["Everything"],
            "index_url_templates": FEDORA_INDEX_URL_TEMPLATES,
        }
    ],
}
def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Check the full listing task creates and runs the lister as expected."""
    lister = mocker.patch("swh.lister.rpm.tasks.RPMLister")
    # the patched class is also used as the lister instance
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    task_name = "swh.lister.rpm.tasks.FullRPMLister"
    res = swh_scheduler_celery_app.send_task(task_name, kwargs=LISTER_KWARGS)
    assert res

    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**LISTER_KWARGS)
    lister.run.assert_called_once_with()
def test_incremental_listing(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Check the incremental listing task runs the lister in incremental mode."""
    lister = mocker.patch("swh.lister.rpm.tasks.RPMLister")
    # the patched class is also used as the lister instance
    lister.from_configfile.return_value = lister
    lister.run.return_value = ListerStats(pages=10, origins=500)

    task_name = "swh.lister.rpm.tasks.IncrementalRPMLister"
    res = swh_scheduler_celery_app.send_task(task_name, kwargs=LISTER_KWARGS)
    assert res

    res.wait()
    assert res.successful()

    lister.from_configfile.assert_called_once_with(**LISTER_KWARGS, incremental=True)
    lister.run.assert_called_once_with()

View file

@ -38,9 +38,7 @@ lister_args = {
"url": "https://guix.gnu.org/sources.json",
"origin_upstream": "https://git.savannah.gnu.org/cgit/guix.git/",
},
"fedora": {
"url": "https://archives.fedoraproject.org/pub/archive/fedora/linux/releases/",
},
"rpm": {"url": "http://opensuse.org", "instance": "openSUSE", "rpm_src_data": []},
"pagure": {"instance": "pagure.io"},
"gitweb": {
"url": "https://git.distorted.org.uk/~mdw/",
@ -64,8 +62,7 @@ def test_get_lister_wrong_input():
def test_get_lister(swh_scheduler_config):
"""Instantiating a supported lister should be ok"""
# Drop launchpad lister from the lister to check, its test setup is more involved
# than the other listers and it's not currently done here
for lister_name in SUPPORTED_LISTERS:
lst = get_lister(
lister_name,