rpm: Turn fedora lister into a generic Red Hat based distribution one

As Red Hat based linux distributions share the same type of package repository,
rework the fedora lister into a generic one to list RPM source packages and
their versions from numerous distributions.

For a given distribution, the RPM lister fetches package metadata for a
list of release identifiers and a list of software components. Source packages
are then processed and the relevant info is extracted to be sent to the RPM loader.
Once all releases and components have been processed, the lister has collected all
versions for each package name and sends that info to the scheduler, which will
create RPM loading tasks afterwards.

Nevertheless, as there is no generic way to list all releases and components for
a given distribution, nor to guess the right URL to retrieve package metadata
from, that info needs to be manually provided to the lister as input parameters.
Some examples of those parameters for various distributions can be found in the
config directory of the lister.

Regarding the produced origin URLs, as there is no way to find valid HTTP ones
for all distributions, the same behavior as with the debian lister is used and
they have the following form: rpm://{instance}/packages/{package_name} where
the instance variable corresponds to the name of the listed distribution such
as Fedora, CentOS, or openSUSE.

Related to swh/meta#5011.
This commit is contained in:
Antoine Lambert 2023-08-16 13:25:23 +00:00
parent fcfb7004db
commit 95714f6f37
23 changed files with 1096 additions and 577 deletions

View file

@ -0,0 +1,13 @@
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
def register():
    """Describe the RPM lister plugin.

    Returns:
        A dictionary with the lister class under ``"lister"`` and the list of
        task module names under ``"task_modules"``.
    """
    # imported lazily so that importing this package does not load the lister
    from .lister import RPMLister

    task_modules = [f"{__name__}.tasks"]
    return {"lister": RPMLister, "task_modules": task_modules}

View file

@ -0,0 +1,100 @@
# RPM lister parameters to process CentOS source packages
url: https://www.centos.org
instance: CentOS
rpm_src_data:
- base_url: https://vault.centos.org/
releases:
- "3.7"
- "3.8"
- "3.9"
- "4.0"
- "4.1"
- "4.2"
- "4.3"
- "4.4"
- "4.5"
- "4.6"
- "4.7"
- "4.8"
- "4.9"
- "5.0"
- "5.1"
- "5.2"
- "5.3"
- "5.4"
- "5.5"
- "5.6"
- "5.7"
- "5.8"
- "5.9"
- "5.10"
- "5.11"
- "6.0"
- "6.1"
- "6.2"
- "6.3"
- "6.4"
- "6.5"
- "6.6"
- "6.7"
- "6.8"
- "6.9"
- "6.10"
- "7.0.1406"
- "7.1.1503"
- "7.2.1511"
- "7.3.1611"
- "7.4.1708"
- "7.5.1804"
- "7.6.1810"
- "7.7.1908"
- "7.8.2003"
- "7.9.2009"
- "8-stream"
- "8.0.1905"
- "8.1.1911"
- "8.2.2004"
- "8.3.2011"
- "8.4.2105"
- "8.5.2111"
components:
- AppStream
- BaseOS
- HighAvailability
- PowerTools
- SCL
- addons
- centosplus
- contrib
- cr
- csgfs
- dotnet
- extras
- fasttrack
- opstools
- os
- rt
- testing
- updates
- xen4
index_url_templates:
- $base_url/$release/$component/Source/
- $base_url/$release/$component/SRPMS/
- $base_url/$release/$component/x86_64/
- base_url: https://mirror.stream.centos.org
releases:
- 9-stream
components:
- AppStream
- BaseOS
- CRB
- HighAvailability
- NFV
- RT
- ResilientStorage
index_url_templates:
- $base_url/$release/$component/source/tree/

View file

@ -0,0 +1,77 @@
# RPM lister parameters to process Fedora source packages
url: https://fedoraproject.org
instance: "Fedora"
rpm_src_data:
- base_url: https://archives.fedoraproject.org/pub/archive/fedora/linux/
releases:
- "2"
- "3"
- "4"
- "5"
- "6"
components:
- core
- extras
index_url_templates:
- $base_url/$component/$release/SRPMS
- $base_url/$component/$release/source/SRPMS
- $base_url/$component/$release/x86_64/os/
- base_url: https://archives.fedoraproject.org/pub/archive/fedora/linux/
releases:
- "7"
- "8"
- "9"
- "10"
- "11"
- "12"
- "13"
- "14"
- "15"
- "16"
- "17"
- "18"
- "19"
- "20"
- "21"
- "22"
- "23"
- "24"
- "25"
- "26"
- "27"
- "28"
- "29"
- "30"
- "31"
- "32"
- "33"
- "34"
- "35"
components:
- Everything
- Server
- Workstation
- Modular
- Fedora
index_url_templates:
- $base_url/releases/$release/$component/source/tree/
- $base_url/updates/$release/$component/source/tree/
- $base_url/releases/$release/$component/source/SRPMS/
- $base_url/updates/$release/SRPMS/
- base_url: https://dl.fedoraproject.org/pub/fedora/linux/
releases:
- "36"
- "37"
- "38"
components:
- Everything
- Server
- Workstation
- Modular
- Fedora
index_url_templates:
- $base_url/releases/$release/$component/source/tree/
- $base_url/updates/$release/$component/source/tree/

View file

@ -0,0 +1,26 @@
# RPM lister parameters to process openSUSE source packages
url: http://opensuse.org
instance: openSUSE
rpm_src_data:
- base_url: http://download.opensuse.org/source/
releases:
- tumbleweed
- jump/15.2
- leap/15.0-Current
- leap/15.0
- leap/15.1
- leap/15.2
- leap/15.3
- leap/15.4
- leap/15.5
- leap/42.2
- leap/42.3-Current
- leap/42.3
components:
- oss
- non-oss
index_url_templates:
- $base_url/distribution/$release/repo/$component/
- $base_url/distribution/$release/repo/$component/suse/
- $base_url/$release/repo/$component/

View file

@ -0,0 +1,156 @@
# RPM lister parameters to process Oracle Linux source packages
url: https://www.oracle.com/linux
instance: OracleLinux
rpm_src_data:
- base_url: https://yum.oracle.com/repo/EnterpriseLinux/
releases:
- EL5
components:
- addons
- oracle_addons
- unsupported
- 0/base
- 1/base
- 2/base
- 3/base
- 4/base
- 5/base
index_url_templates:
- $base_url/$release/$component/x86_64
- base_url: https://yum.oracle.com/repo/OracleLinux/
releases:
- OL5
- OL6
- OL7
- OL8
- OL9
components:
- 0/base
- 0/baseos/base
- 1/base
- 1/baseos/base
- 10/base
- 11/base
- 2/base
- 2/baseos/base
- 3/base
- 3/baseos/base
- 4/base
- 4/baseos/base
- 4/security/validation
- 5/base
- 5/baseos/base
- 6/base
- 6/baseos/base
- 7/base
- 7/baseos/base
- 8/base
- 8/baseos/base
- 8/security/validation
- 9/base
- MODRHCK
- MySQL
- MySQL56
- MySQL57_community
- MySQL80/community
- MySQL80/connectors/community
- MySQL80/tools/community
- MySQL80_community
- RDMA
- SoftwareCollections
- UEK/latest
- UEKR3
- UEKR3/latest
- UEKR3_OFED20
- UEKR4
- UEKR4/OFED
- UEKR4/archive
- UEKR5
- UEKR5/RDMA
- UEKR5/archive
- UEKR6
- UEKR6/RDMA
- UEKR7
- UEKR7/RDMA
- addons
- appstream
- appstream/developer
- automation2
- baseos/developer
- baseos/latest
- beta
- ceph
- ceph30
- codeready/builder
- codeready/builder/developer
- developer
- developer/EPEL
- developer/EPEL/modular
- developer/UEKR5
- developer/UEKR6
- developer/UEKR7
- developer/golang117
- developer/golang118
- developer/golang119
- developer/kvm/utils
- developer/nodejs12
- developer/olcne
- developer/php74
- developer_EPEL
- developer_gluster310
- developer_gluster312
- distro/builder
- gluster/appstream
- gluster312
- gluster41
- gluster5
- gluster6
- gluster8
- kvm/appstream
- kvm/utils
- latest
- latest/archive
- leapp
- ofed_UEK
- olcne
- olcne11
- olcne12
- olcne13
- olcne14
- olcne15
- olcne16
- openstack10
- openstack21
- openstack30
- openstack40
- openstack40_extras
- openstack50
- openstack50_extras
- optional
- optional/archive
- optional/beta
- oracle/instantclient
- oracle/instantclient21
- oraclelinuxmanager210/client
- oraclelinuxmanager210/server
- ovirt42
- ovirt42/extras
- ovirt43
- ovirt43/extras
- ovirt44
- ovirt44/extras
- security/validation
- spacewalk210/client
- spacewalk210/server
- spacewalk24/client
- spacewalk24/server
- spacewalk26/client
- spacewalk26/server
- spacewalk27/client
- spacewalk27/server
index_url_templates:
- $base_url/$release/$component/x86_64

View file

@ -0,0 +1,38 @@
# RPM lister parameters to process Rocky Linux source packages
url: https://rockylinux.org
instance: RockyLinux
rpm_src_data:
- base_url: https://download.rockylinux.org/
releases:
- "8.3"
- "8.4"
- "8.4-RC1"
- "8.5"
- "8.6"
- "8.7"
- "8.8"
- "9.0"
- "9.1"
- "9.2"
components:
- AppStream
- BaseOS
- Devel
- HighAvailability
- Minimal
- PowerTools
- ResilientStorage
- CRB
- NFV
- RT
- SAP
- SAPHANA
- devel
- extras
- plus
- nfv
- rockyrpi
index_url_templates:
- $base_url/vault/rocky/$release/$component/source/tree/
- $base_url/pub/rocky/$release/$component/source/tree/

314
swh/lister/rpm/lister.py Normal file
View file

@ -0,0 +1,314 @@
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from dataclasses import dataclass, field
from datetime import datetime, timezone
from itertools import product
import logging
from string import Template
from typing import Any, Dict, Iterator, List, Optional, Set, Tuple
from urllib.parse import urljoin
import repomd
from typing_extensions import TypedDict
from swh.scheduler.interface import SchedulerInterface
from swh.scheduler.model import ListedOrigin
from ..pattern import Lister
logger = logging.getLogger(__name__)
# Identifier of a distribution release (an entry of ``releases`` in the
# RPM source data)
Release = str
# Name of a software component (an entry of ``components`` in the
# RPM source data)
Component = str
# Name of a source package
PkgName = str
# Version identifier of a source package; the lister stores
# ``{release}/{component}/{version}`` keys under this type
PkgVersion = str
# Origin URL generated by the lister for a source package
RPMOrigin = str
# A page is either a (release, component, parsed repository) tuple or ``None``,
# the latter being yielded last by the lister to flush collected origins
RPMPageType = Optional[Tuple[Release, Component, repomd.Repo]]
"""Each page is a list of packages for a given (release, component) pair
from a Red Hat based distribution."""
class RPMSourceData(TypedDict):
    """Dictionary holding relevant data for listing RPM source packages.

    See content of the lister config directory to get examples of RPM
    source data for famous Red Hat based distributions.
    """

    base_url: str
    """Base URL of a RPM repository"""

    releases: List[Release]
    """List of release identifiers for a Red Hat based distribution"""

    components: List[Component]
    """List of components for a Red Hat based distribution"""

    index_url_templates: List[str]
    """List of URL templates to discover source packages metadata, the
    following variables can be substituted in them: ``base_url``, ``release``
    and ``component``, see :class:`string.Template` for more details about the
    format. The generated URLs must target directories containing a sub-directory
    named ``repodata``, which contains packages metadata, in order to be
    successfully processed by the lister."""
def _get_last_modified(pkg: repomd.Package) -> datetime:
    """Get timezone aware last modified time in UTC from RPM package metadata.

    Args:
        pkg: package metadata parsed by ``repomd``

    Returns:
        The value of the ``build`` attribute of the package ``common:time``
        element, interpreted as a POSIX timestamp, as an aware UTC datetime.
    """
    ts = pkg._element.find("common:time", namespaces=repomd._ns).get("build")
    # build an aware datetime directly instead of going through the deprecated
    # naive datetime.utcfromtimestamp() followed by replace(tzinfo=...)
    return datetime.fromtimestamp(int(ts), tz=timezone.utc)
def _get_checksums(pkg: repomd.Package) -> Dict[str, str]:
    """Return the checksum of an rpm archive as a one-entry dictionary.

    Args:
        pkg: package metadata parsed by ``repomd``

    Returns:
        A dictionary mapping the checksum type found in the package
        ``common:checksum`` element to the checksum value.
    """
    checksum = pkg._element.find("common:checksum", namespaces=repomd._ns)
    algo = checksum.get("type")
    # the "sha" type is reported under the "sha1" key
    return {"sha1" if algo == "sha" else algo: checksum.text}
@dataclass
class RPMListerState:
    """Data kept by the RPM lister between two listings."""

    package_versions: Dict[PkgName, Set[PkgVersion]] = field(default_factory=dict)
    """Versions found during the last listing, indexed by package name"""
class RPMLister(Lister[RPMListerState, RPMPageType]):
    """
    List source packages for a Red Hat based linux distribution.

    The lister creates a snapshot for each package from all its available versions.

    In incremental mode, only packages with different snapshot since the last listing
    operation will be sent to the scheduler that will create loading tasks to archive
    newly found source code.

    Args:
        scheduler: instance of SchedulerInterface
        url: Red Hat based distribution info URL
        instance: name of Red Hat based distribution
        rpm_src_data: list of dictionaries holding data required to list RPM source packages,
            see examples in the config directory.
        incremental: if :const:`True`, only packages with new versions are sent to the
            scheduler when relisting
        max_origins_per_page: forwarded unchanged to the base lister constructor
        max_pages: forwarded unchanged to the base lister constructor
        enable_origins: forwarded unchanged to the base lister constructor
    """

    LISTER_NAME = "rpm"

    def __init__(
        self,
        scheduler: SchedulerInterface,
        url: str,
        instance: str,
        rpm_src_data: List[RPMSourceData],
        incremental: bool = False,
        max_origins_per_page: Optional[int] = None,
        max_pages: Optional[int] = None,
        enable_origins: bool = True,
    ):
        super().__init__(
            scheduler=scheduler,
            url=url,
            instance=instance,
            credentials={},
            max_origins_per_page=max_origins_per_page,
            max_pages=max_pages,
            enable_origins=enable_origins,
        )

        self.rpm_src_data = rpm_src_data
        self.incremental = incremental

        # origins built so far during the current listing, indexed by origin URL
        self.listed_origins: Dict[RPMOrigin, ListedOrigin] = {}
        # URLs of the origins to yield once all pages have been processed
        self.origins_to_send: Set[RPMOrigin] = set()
        # version keys found during the current listing, indexed by package name
        self.package_versions: Dict[PkgName, Set[PkgVersion]] = {}

    def state_from_dict(self, d: Dict[str, Any]) -> RPMListerState:
        """Build the lister state from its dictionary form (lists become sets)."""
        return RPMListerState(package_versions={k: set(v) for k, v in d.items()})

    def state_to_dict(self, state: RPMListerState) -> Dict[str, Any]:
        """Convert the lister state to its dictionary form (sets become lists)."""
        return {k: list(v) for k, v in state.package_versions.items()}

    def repo_request(
        self,
        index_url_template: Template,
        base_url: str,
        release: Release,
        component: Component,
    ) -> Optional[repomd.Repo]:
        """Return parsed packages for a given distribution release and component.

        Args:
            index_url_template: template of the URL to load repository metadata from
            base_url: base URL of the RPM repository, trailing slashes are stripped
                before substitution
            release: release identifier substituted in the template
            component: component name substituted in the template

        Returns:
            The parsed repository, or :const:`None` if its metadata could not
            be loaded from the generated URL.
        """
        index_url = index_url_template.substitute(
            base_url=base_url.rstrip("/"), release=release, component=component
        )

        try:
            repo = repomd.load(index_url)  # throws error if repomd.xml is not found
        except Exception:
            # templates are tried blindly for every (release, component) pair,
            # so a failure only means this candidate URL is not a repository
            logger.debug("Repository metadata not found at URL %s", index_url)
            return None
        else:
            logger.debug(
                "Fetched metadata from url: %s, found %d packages", index_url, len(repo)
            )
            return repo

    def get_pages(self) -> Iterator[RPMPageType]:
        """Return an iterator on parsed rpm packages, one page per (release, component) pair.

        A page is yielded for every index URL template from which repository
        metadata could be loaded, then a final :const:`None` page is yielded to
        trigger the sending of the collected origins.
        """
        for rpm_src_data in self.rpm_src_data:
            index_url_templates = [
                Template(index_url_template)
                for index_url_template in rpm_src_data["index_url_templates"]
            ]
            # try all possible package repository URLs for each (release, component) pair
            for release, component, index_url_template in product(
                rpm_src_data["releases"],
                rpm_src_data["components"],
                index_url_templates,
            ):
                repo = self.repo_request(
                    index_url_template,
                    rpm_src_data["base_url"],
                    release,
                    component,
                )
                if repo is not None:
                    # valid package repository found, yield page
                    yield (release, component, repo)

        yield None

    def origin_url_for_package(self, package_name: PkgName) -> RPMOrigin:
        """Return the origin url for the given package."""
        # TODO: Use a better origin URL before deploying the lister to production
        # https://gitlab.softwareheritage.org/swh/devel/swh-model/-/issues/4632
        return f"rpm://{self.instance}/packages/{package_name}"

    def get_origins_from_page(self, page: RPMPageType) -> Iterator[ListedOrigin]:
        """Convert a page of rpm package sources into an iterator of ListedOrigin.

        Origins are only yielded when the final :const:`None` page is received;
        any other page only updates the data collected by the lister.
        """
        assert self.lister_obj.id is not None

        if page is None:
            # all pages processed, yield listed origins
            for origin_url in self.origins_to_send:
                yield self.listed_origins[origin_url]
            return

        release, component, repo = page

        logger.debug(
            "Listing %s release %s component %s from repository metadata located at %s",
            self.instance,
            release,
            component,
            repo.baseurl,
        )

        origins_to_send: Set[RPMOrigin] = set()
        # origins listed for the first time while processing this page
        new_origins: Set[RPMOrigin] = set()

        # iterate on each package's metadata
        for pkg_metadata in repo:
            if pkg_metadata.arch != "src":
                # not a source package, skip it
                continue

            # extract package metadata
            package_name = pkg_metadata.name
            # we extract the intrinsic version of the package for the rpm loader
            # to avoid creating different releases targeting the same directory
            # 2.12-10.el8 => 2.12-10
            package_version_split = pkg_metadata.vr.rsplit("-", maxsplit=1)
            package_version = "-".join(
                [
                    package_version_split[0],
                    package_version_split[1].split(".", maxsplit=1)[0],
                ]
            )
            # create package version key as expected by the rpm loader
            package_version_key = f"{release}/{component}/{package_version}"
            package_build_time = _get_last_modified(pkg_metadata)
            package_download_url = urljoin(
                repo.baseurl.rstrip("/") + "/", pkg_metadata.location
            )
            checksums = _get_checksums(pkg_metadata)

            # build origin url
            origin_url = self.origin_url_for_package(package_name)

            # this is the first time a package is listed
            if origin_url not in self.listed_origins:
                # create a ListedOrigin object for it that can be later
                # updated with new package versions info
                self.listed_origins[origin_url] = ListedOrigin(
                    lister_id=self.lister_obj.id,
                    url=origin_url,
                    visit_type="rpm",
                    extra_loader_arguments={"packages": {}},
                    last_update=package_build_time,
                )

                # init set that will contain all listed package versions
                self.package_versions[package_name] = set()
                new_origins.add(origin_url)

            # origins will be yielded when all pages processed
            origins_to_send.add(origin_url)

            # update package metadata in parameter that will be provided
            # to the rpm loader
            self.listed_origins[origin_url].extra_loader_arguments["packages"][
                package_version_key
            ] = {
                "name": package_name,
                "version": package_version,
                "url": package_download_url,
                "build_time": package_build_time.isoformat(),
                "checksums": checksums,
            }

            last_update = self.listed_origins[origin_url].last_update
            if last_update is not None and package_build_time > last_update:
                self.listed_origins[origin_url].last_update = package_build_time

            # add package version key to the set of found versions
            self.package_versions[package_name].add(package_version_key)

            # package has already been listed during a previous listing process
            if self.incremental and package_name in self.state.package_versions:
                new_versions = (
                    self.package_versions[package_name]
                    - self.state.package_versions[package_name]
                )
                # no new versions so far, no need to send the origin to the scheduler
                if not new_versions:
                    origins_to_send.remove(origin_url)

        # only count new origins that are actually going to be sent, otherwise
        # the number of packages with new versions could be reported as negative
        new_origins_count = len(origins_to_send & new_origins)
        logger.debug(
            "Found %s packages to update (%s new ones and %s packages with new versions).",
            len(origins_to_send),
            new_origins_count,
            len(origins_to_send) - new_origins_count,
        )
        logger.debug(
            "Current total number of listed source packages is equal to %s.",
            len(self.listed_origins),
        )

        self.origins_to_send.update(origins_to_send)

    def finalize(self):
        """Save found package versions as lister state (incremental mode only)
        and flag the lister as updated when at least one origin was listed."""
        if self.incremental:
            # set mapping between listed package names and versions as lister state
            self.state.package_versions = self.package_versions
        self.updated = len(self.listed_origins) > 0

28
swh/lister/rpm/tasks.py Normal file
View file

@ -0,0 +1,28 @@
# Copyright (C) 2022-2023 the Software Heritage developers
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from typing import Dict
from celery import shared_task
from .lister import RPMLister
@shared_task(name=__name__ + ".FullRPMLister")
def list_rpm_full(**lister_args) -> Dict[str, int]:
    """Full listing of Red Hat based distribution source packages"""
    # lister arguments are passed through untouched to the lister factory
    return RPMLister.from_configfile(**lister_args).run().dict()
@shared_task(name=__name__ + ".IncrementalRPMLister")
def list_rpm_incremental(**lister_args) -> Dict[str, int]:
    """Incremental listing of Red Hat based distribution source packages"""
    # same as the full listing task but with incremental mode forced on
    return RPMLister.from_configfile(**lister_args, incremental=True).run().dict()
@shared_task(name=__name__ + ".ping")
def _ping() -> str:
    """Return a constant string, used to check that tasks can be executed."""
    reply = "OK"
    return reply

View file

View file

@ -0,0 +1,55 @@
<?xml version="1.0" encoding="UTF-8"?>
<repomd xmlns="http://linux.duke.edu/metadata/repo" xmlns:rpm="http://linux.duke.edu/metadata/rpm">
<revision>1499286311</revision>
<data type="primary">
<checksum type="sha256">4f677623c24912d86848f86837d398979b5adc2a51d9a2170f11fe42a257f3d3</checksum>
<open-checksum type="sha256">db616ad8e4219e23dfc05cd515e017cdc0d59144689ac606951fa42cbb06ae65</open-checksum>
<location href="repodata/4f677623c24912d86848f86837d398979b5adc2a51d9a2170f11fe42a257f3d3-primary.xml.gz"/>
<timestamp>1499286305</timestamp>
<size>5425131</size>
<open-size>30064034</open-size>
</data>
<data type="filelists">
<checksum type="sha256">17296af99a4b80bc67fccabe71ecefa02b76e8409372d936c054b8c9de312b6c</checksum>
<open-checksum type="sha256">7caabd1205a72d26422756211dcd536336cef643f7f73eb15a470b02ff09a194</open-checksum>
<location href="repodata/17296af99a4b80bc67fccabe71ecefa02b76e8409372d936c054b8c9de312b6c-filelists.xml.gz"/>
<timestamp>1499286305</timestamp>
<size>1650273</size>
<open-size>6419422</open-size>
</data>
<data type="other">
<checksum type="sha256">8f1ed139aeaa57f5bc280ce97b82f690e4008c122b4793791ca18e513268b6eb</checksum>
<open-checksum type="sha256">786b8d4fa759f0ade3eaab1bde390d12c950dfe217eda1773400f3a3d461522b</open-checksum>
<location href="repodata/8f1ed139aeaa57f5bc280ce97b82f690e4008c122b4793791ca18e513268b6eb-other.xml.gz"/>
<timestamp>1499286305</timestamp>
<size>4396102</size>
<open-size>33165783</open-size>
</data>
<data type="primary_db">
<checksum type="sha256">1d2c0be48c35e55669b410cb4dbe767ae4850b4c610e95ca9aee67f7eb31e457</checksum>
<open-checksum type="sha256">dc8dbac072ac1412f0ecface57fa57c5ddcac14acc880fe9b467164be733e963</open-checksum>
<location href="repodata/1d2c0be48c35e55669b410cb4dbe767ae4850b4c610e95ca9aee67f7eb31e457-primary.sqlite.bz2"/>
<timestamp>1499286309</timestamp>
<size>7071217</size>
<open-size>26177536</open-size>
<database_version>10</database_version>
</data>
<data type="filelists_db">
<checksum type="sha256">5e1259759b9bedefc1ff14b81760524841402776e6c1b33014f4f5d6feb40d11</checksum>
<open-checksum type="sha256">b293d51dd4e6eb4128e40b6ce228c62b169b1d47be535e56f69b8ad622c4a6ca</open-checksum>
<location href="repodata/5e1259759b9bedefc1ff14b81760524841402776e6c1b33014f4f5d6feb40d11-filelists.sqlite.bz2"/>
<timestamp>1499286307</timestamp>
<size>2227395</size>
<open-size>5529600</open-size>
<database_version>10</database_version>
</data>
<data type="other_db">
<checksum type="sha256">f6b30bdfe96d2137542704288de1345c01ea14397eb187126d4474648bad5292</checksum>
<open-checksum type="sha256">3f5d4619dcabe945b773c1c98ea40b8ead53340291bd504ab3faabfc7b57bb99</open-checksum>
<location href="repodata/f6b30bdfe96d2137542704288de1345c01ea14397eb187126d4474648bad5292-other.sqlite.bz2"/>
<timestamp>1499286311</timestamp>
<size>5264843</size>
<open-size>27930624</open-size>
<database_version>10</database_version>
</data>
</repomd>

View file

@ -0,0 +1,85 @@
<?xml version="1.0" encoding="UTF-8"?>
<repomd xmlns="http://linux.duke.edu/metadata/repo" xmlns:rpm="http://linux.duke.edu/metadata/rpm">
<revision>1651698851</revision>
<data type="primary">
<checksum type="sha256">42155056c6d7b1f0e5437bb2a92c48e6d21a02ee8f09acc726e705c26e960a3c</checksum>
<open-checksum type="sha256">a5841e7086be579d58e2dbb7628caebba32d9defa85739455d518bfaf90e39b0</open-checksum>
<location href="repodata/42155056c6d7b1f0e5437bb2a92c48e6d21a02ee8f09acc726e705c26e960a3c-primary.xml.gz"/>
<timestamp>1651698827</timestamp>
<size>7144060</size>
<open-size>45898728</open-size>
</data>
<data type="filelists">
<checksum type="sha256">fc915adcdf5710f9f80dfffcec8f03088f09cf80fbc9c801d5a8f45f1f31bb92</checksum>
<open-checksum type="sha256">a96a4739268e250e3c3461da716472503ed5ed8b27161fec9a143d4a8ccf5767</open-checksum>
<location href="repodata/fc915adcdf5710f9f80dfffcec8f03088f09cf80fbc9c801d5a8f45f1f31bb92-filelists.xml.gz"/>
<timestamp>1651698827</timestamp>
<size>1934835</size>
<open-size>7458268</open-size>
</data>
<data type="other">
<checksum type="sha256">461db9fa87e564d75d74c0dfbf006ea5d18ed646d4cb8dee1c69a4d95dd08d09</checksum>
<open-checksum type="sha256">1733c3011a0323fadac711dd25176c9934698176605c3e516b6aabb9b5775e00</open-checksum>
<location href="repodata/461db9fa87e564d75d74c0dfbf006ea5d18ed646d4cb8dee1c69a4d95dd08d09-other.xml.gz"/>
<timestamp>1651698827</timestamp>
<size>3779969</size>
<open-size>33166564</open-size>
</data>
<data type="primary_db">
<checksum type="sha256">ac60dd254bfc7557eb646a116bf8083b49fee8e942e1ef50dff7f74004897e74</checksum>
<open-checksum type="sha256">c752f5132f2cc5f4f137dade787154316f9503ae816212b8fabf5733cc2d344d</open-checksum>
<location href="repodata/ac60dd254bfc7557eb646a116bf8083b49fee8e942e1ef50dff7f74004897e74-primary.sqlite.xz"/>
<timestamp>1651698851</timestamp>
<size>9058624</size>
<open-size>41562112</open-size>
<database_version>10</database_version>
</data>
<data type="filelists_db">
<checksum type="sha256">1a279b88531d9c2e24c0bfc9a0d6b4357d70301c24fa42f649c726ed1af1d6a8</checksum>
<open-checksum type="sha256">e9b5c17e6004a78d20146aa54fa5ac93a01f4f2a95117588d649e92cfc008473</open-checksum>
<location href="repodata/1a279b88531d9c2e24c0bfc9a0d6b4357d70301c24fa42f649c726ed1af1d6a8-filelists.sqlite.xz"/>
<timestamp>1651698834</timestamp>
<size>1809496</size>
<open-size>6471680</open-size>
<database_version>10</database_version>
</data>
<data type="other_db">
<checksum type="sha256">850ad17efdebe5f9ccbef03c8aec4e7589bb6a1ca9a6249578968d60ad094a4f</checksum>
<open-checksum type="sha256">d13c6da8f7ad2c9060fd5b811b86facc9e926ec9273c0e135c4fe1110f784cdc</open-checksum>
<location href="repodata/850ad17efdebe5f9ccbef03c8aec4e7589bb6a1ca9a6249578968d60ad094a4f-other.sqlite.xz"/>
<timestamp>1651698838</timestamp>
<size>4285108</size>
<open-size>27897856</open-size>
<database_version>10</database_version>
</data>
<data type="primary_zck">
<checksum type="sha256">fc4205cf1cca7f0c157d1aa9a1348a1742ca7df671fbf7ccccd79221d473145b</checksum>
<open-checksum type="sha256">a5841e7086be579d58e2dbb7628caebba32d9defa85739455d518bfaf90e39b0</open-checksum>
<header-checksum type="sha256">2074f3da25ad0d45cf2776ad35dd22a6c63fafff319143c2f7dfefa98b99d651</header-checksum>
<location href="repodata/fc4205cf1cca7f0c157d1aa9a1348a1742ca7df671fbf7ccccd79221d473145b-primary.xml.zck"/>
<timestamp>1651698828</timestamp>
<size>6030441</size>
<open-size>45898728</open-size>
<header-size>231</header-size>
</data>
<data type="filelists_zck">
<checksum type="sha256">6c77673bb8823bf04fd4520c421fd0fc84567db9f23b8aa19f600b0688e46dd9</checksum>
<open-checksum type="sha256">a96a4739268e250e3c3461da716472503ed5ed8b27161fec9a143d4a8ccf5767</open-checksum>
<header-checksum type="sha256">55fc5e75acd903f01cf18328fec9c6f995bd8f80c5b085aa3e0fe116bb89e891</header-checksum>
<location href="repodata/6c77673bb8823bf04fd4520c421fd0fc84567db9f23b8aa19f600b0688e46dd9-filelists.xml.zck"/>
<timestamp>1651698829</timestamp>
<size>1735208</size>
<open-size>7458268</open-size>
<header-size>136</header-size>
</data>
<data type="other_zck">
<checksum type="sha256">c87c1b085ef287ba69b1f244d3fff56fc5efc01ffd1d7c10ee22328117651cd5</checksum>
<open-checksum type="sha256">1733c3011a0323fadac711dd25176c9934698176605c3e516b6aabb9b5775e00</open-checksum>
<header-checksum type="sha256">93624d227c24ff4eb2332fcb038e7157e08ed051b654820def75c5511a1ce191</header-checksum>
<location href="repodata/c87c1b085ef287ba69b1f244d3fff56fc5efc01ffd1d7c10ee22328117651cd5-other.xml.zck"/>
<timestamp>1651698829</timestamp>
<size>3019451</size>
<open-size>33166564</open-size>
<header-size>206</header-size>
</data>
</repomd>

View file

@ -0,0 +1,283 @@
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from pathlib import Path
from string import Template
from typing import List
import pytest
from urllib3.exceptions import HTTPError
from swh.lister.rpm.lister import Component, Release, RPMLister
from swh.scheduler.interface import SchedulerInterface
# Distribution info URL given to the lister under test
FEDORA_URL = "https://fedoraproject.org/"
# Base URL substituted in the index URL templates below
FEDORA_ARCHIVE_URL = "https://archives.fedoraproject.org/pub/archive/fedora/linux"
# Index URL templates given to the lister under test; only the first one is
# served successfully by the mocked responses built in this module
FEDORA_INDEX_URL_TEMPLATES = [
"$base_url/releases/$release/$component/source/tree/",
"$base_url/updates/$release/$component/source/tree/",
"$base_url/releases/$release/$component/source/SRPMS/",
"$base_url/updates/$release/SRPMS/",
]
def mock_repomd(mocker, side_effect):
    """Patch ``urlopen`` as used by repomd for the next lister run.

    Each call to ``read`` on the object returned by the patched function
    consumes the next item of ``side_effect``.
    """
    response = mocker.MagicMock()
    # the mock is its own context manager value
    response.__enter__.return_value = response
    response.read.side_effect = side_effect
    urlopen = mocker.patch("repomd.urllib.request.urlopen")
    urlopen.return_value = response
def mock_fedora_repomd(datadir, mocker, use_altered_fedora36=False):
    """Mock repomd responses for Fedora 26 and 36 from files in ``datadir``.

    For each release, the repomd and primary files are served, followed by one
    HTTP error for each of the remaining index URL templates. When
    ``use_altered_fedora36`` is true, the altered primary file is served for
    release 36.
    """
    if use_altered_fedora36:
        primary36 = "primary36-altered.xml.gz"
    else:
        primary36 = "primary36.xml.gz"
    repodata = [
        ("repomd26.xml", "primary26.xml.gz"),
        ("repomd36.xml", primary36),
    ]
    failure_count = len(FEDORA_INDEX_URL_TEMPLATES) - 1

    side_effect = []
    for filenames in repodata:
        side_effect.extend(
            Path(datadir, "archives.fedoraproject.org", filename).read_bytes()
            for filename in filenames
        )
        side_effect.extend(HTTPError() for _ in range(failure_count))

    mock_repomd(mocker, side_effect)
def rpm_repodata_url(release, component):
    """Return the URL generated from the first Fedora index URL template."""
    template = Template(FEDORA_INDEX_URL_TEMPLATES[0])
    variables = {
        "base_url": FEDORA_ARCHIVE_URL,
        "release": release,
        "component": component,
    }
    return template.substitute(**variables)
def rpm_src_package_url(release, component, path):
    """Return the URL of a source package located at ``path`` under the
    ``Packages/`` directory of the repository for (release, component)."""
    repodata_url = rpm_repodata_url(release, component)
    return f"{repodata_url}Packages/{path}"
def rpm_package_origin_url(package_name, instance="Fedora"):
    """Return the expected origin URL of a package for a lister instance."""
    origin_url = f"rpm://{instance}/packages/{package_name}"
    return origin_url
@pytest.fixture
def pkg_versions():
    """Expected packages info, indexed by origin URL then by version key."""

    def versions_entry(name, release, version, build_time, checksums):
        # single-item dict mapping the version key to the expected package info
        component = "Everything"
        return {
            f"{release}/{component}/{version}": {
                "name": name,
                "version": version,
                "build_time": build_time,
                "url": rpm_src_package_url(
                    release=release,
                    component=component,
                    path=f"{name[0]}/{name}-{version}.fc{release}.src.rpm",
                ),
                "checksums": checksums,
            }
        }

    expected = {}

    expected[rpm_package_origin_url("0install")] = versions_entry(
        name="0install",
        release="26",
        version="2.11-4",
        build_time="2017-02-10T04:59:31+00:00",
        checksums={
            # note: we intentionally altered the original
            # primary26.xml file to test sha1 usage
            "sha1": "a6fdef5d1026dea208eeeba148f55ac2f545989b",
        },
    )

    expected[rpm_package_origin_url("0xFFFF")] = {
        **versions_entry(
            name="0xFFFF",
            release="26",
            version="0.3.9-15",
            build_time="2017-02-10T05:01:53+00:00",
            checksums={
                "sha256": "96f9c163c0402d2b30e5343c8397a6d50e146c85a446804396b119ef9698231f"
            },
        ),
        **versions_entry(
            name="0xFFFF",
            release="36",
            version="0.9-4",
            build_time="2022-01-19T19:13:53+00:00",
            checksums={
                "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd"
            },
        ),
    }

    expected[rpm_package_origin_url("2ping")] = versions_entry(
        name="2ping",
        release="36",
        version="4.5.1-2",
        build_time="2022-01-19T19:12:21+00:00",
        checksums={
            "sha256": "2ce028d944ebea1cab8c6203c9fed882792478b42fc34682b886a9db16e9de28"
        },
    )

    return expected
def run_lister(
    swh_scheduler: SchedulerInterface,
    releases: List[Release],
    components: List[Component],
    pkg_versions: dict,
    origin_count: int,
    incremental: bool = False,
    updated: bool = True,
):
    """Run a Fedora RPM lister and check its stats, listed origins and state.

    Args:
        swh_scheduler: scheduler the lister writes to and origins are read from
        releases: release identifiers to list
        components: components to list
        pkg_versions: expected packages info indexed by origin URL
        origin_count: expected number of origins reported by the lister
        incremental: whether to run the lister in incremental mode
        updated: expected value of the lister ``updated`` attribute
    """
    src_data = {
        "base_url": FEDORA_ARCHIVE_URL,
        "releases": releases,
        "components": components,
        "index_url_templates": FEDORA_INDEX_URL_TEMPLATES,
    }
    lister = RPMLister(
        scheduler=swh_scheduler,
        url=FEDORA_URL,
        instance="Fedora",
        rpm_src_data=[src_data],
        incremental=incremental,
    )

    stats = lister.run()
    scheduler_origins = swh_scheduler.get_listed_origins(lister.lister_obj.id).results
    lister_state = lister.get_state_from_scheduler()
    # the lister state is indexed by package name, i.e. the last part of origin URLs
    expected_state = {
        origin_url.split("/")[-1]: set(versions)
        for origin_url, versions in pkg_versions.items()
    }

    # One component from each release plus extra null page to flush origins
    expected_pages = len(releases) + 1 if updated else 1
    assert stats.pages == expected_pages
    assert stats.origins == origin_count

    listed_packages = {
        origin.url: origin.extra_loader_arguments["packages"]
        for origin in scheduler_origins
    }
    assert listed_packages == pkg_versions

    if incremental:
        assert lister_state.package_versions == expected_state
    assert lister.updated == updated
@pytest.mark.parametrize("status_code", [400, 404, 500])
def test_fedora_lister_http_error(swh_scheduler, mocker, status_code):
    """
    Simulates handling of HTTP Errors while fetching packages for fedora releases.

    Every index URL template fails, so no origin is expected. Note that
    ``status_code`` is not used to build the mocked errors.
    """
    failures = [HTTPError() for _ in range(len(FEDORA_INDEX_URL_TEMPLATES))]
    mock_repomd(mocker, side_effect=failures)

    run_lister(
        swh_scheduler,
        releases=["18"],
        components=["Everything"],
        pkg_versions={},
        origin_count=0,
        updated=False,
    )
def test_full_rpm_lister(
    swh_scheduler,
    mocker,
    datadir,
    pkg_versions,
):
    """
    Simulates a full listing of packages for fedora releases.
    """
    mock_fedora_repomd(datadir, mocker)

    listed_releases = ["26", "36"]
    listed_components = ["Everything"]
    run_lister(
        swh_scheduler,
        releases=listed_releases,
        components=listed_components,
        pkg_versions=pkg_versions,
        origin_count=3,
    )
def test_incremental_rpm_lister(
    swh_scheduler,
    mocker,
    datadir,
    pkg_versions,
):
    """
    Simulates an incremental listing of packages for fedora releases.
    """

    def run_incremental(expected_origin_count):
        # same lister parameters for every run, only the origin count differs
        run_lister(
            swh_scheduler,
            releases=["26", "36"],
            components=["Everything"],
            pkg_versions=pkg_versions,
            origin_count=expected_origin_count,
            incremental=True,
        )

    # First run
    mock_fedora_repomd(datadir, mocker)
    run_incremental(3)

    # Second run (no updates)
    mock_fedora_repomd(datadir, mocker)
    run_incremental(0)

    # Use an altered version of primary36.xml in which we updated the version
    # of package 0xFFFF to 0.10:
    mock_fedora_repomd(datadir, mocker, use_altered_fedora36=True)

    # Add new version to the set of expected pkg versions:
    expected_0xffff_versions = pkg_versions[rpm_package_origin_url("0xFFFF")]
    expected_0xffff_versions["36/Everything/0.10-4"] = {
        "name": "0xFFFF",
        "version": "0.10-4",
        "build_time": "2022-01-19T19:13:53+00:00",
        "url": rpm_src_package_url(
            release="36",
            component="Everything",
            path="0/0xFFFF-0.10-4.fc36.src.rpm",
        ),
        "checksums": {
            "sha256": "45eee8d990d502324ae665233c320b8a5469c25d735f1862e094c1878d6ff2cd"
        },
    }

    # Third run (0xFFFF in fedora36 component got updated and it needs to be listed)
    run_incremental(1)

View file

@ -0,0 +1,67 @@
# Copyright (C) 2022-2023 The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information
from swh.lister.pattern import ListerStats
from .test_lister import FEDORA_ARCHIVE_URL, FEDORA_INDEX_URL_TEMPLATES, FEDORA_URL
def test_ping(swh_scheduler_celery_app, swh_scheduler_celery_worker):
    """Check that the ping task of the RPM lister runs and replies "OK"."""
    task = swh_scheduler_celery_app.send_task("swh.lister.rpm.tasks.ping")
    assert task

    # block until the task is done
    task.wait()
    assert task.successful()
    assert task.result == "OK"
# Keyword arguments sent to the listing tasks in the tests below, which then
# check that they are passed to ``RPMLister.from_configfile``
LISTER_KWARGS = dict(
url=FEDORA_URL,
instance="fedora",
rpm_src_data=[
{
"base_url": FEDORA_ARCHIVE_URL,
"releases": ["36"],
"components": ["Everything"],
"index_url_templates": FEDORA_INDEX_URL_TEMPLATES,
}
],
)
def test_full_listing(swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker):
    """Check that the full listing task builds the lister from the task
    arguments and runs it once."""
    lister_mock = mocker.patch("swh.lister.rpm.tasks.RPMLister")
    # the patched class acts as its own instance
    lister_mock.from_configfile.return_value = lister_mock
    lister_mock.run.return_value = ListerStats(pages=10, origins=500)

    task = swh_scheduler_celery_app.send_task(
        "swh.lister.rpm.tasks.FullRPMLister",
        kwargs=LISTER_KWARGS,
    )
    assert task
    task.wait()
    assert task.successful()

    lister_mock.from_configfile.assert_called_once_with(**LISTER_KWARGS)
    lister_mock.run.assert_called_once_with()
def test_incremental_listing(
    swh_scheduler_celery_app, swh_scheduler_celery_worker, mocker
):
    """Check that the incremental listing task builds the lister from the task
    arguments plus ``incremental=True`` and runs it once."""
    lister_mock = mocker.patch("swh.lister.rpm.tasks.RPMLister")
    # the patched class acts as its own instance
    lister_mock.from_configfile.return_value = lister_mock
    lister_mock.run.return_value = ListerStats(pages=10, origins=500)

    task = swh_scheduler_celery_app.send_task(
        "swh.lister.rpm.tasks.IncrementalRPMLister",
        kwargs=LISTER_KWARGS,
    )
    assert task
    task.wait()
    assert task.successful()

    lister_mock.from_configfile.assert_called_once_with(
        **LISTER_KWARGS, incremental=True
    )
    lister_mock.run.assert_called_once_with()